author    Brian Behlendorf <[email protected]>  2018-05-29 14:57:07 -0700
committer Brian Behlendorf <[email protected]>  2018-05-29 14:57:55 -0700
commit    1272941f49321bcb7b26025670720c98785427ee (patch)
tree      068d78ae84d1fdaf25c392f48799789e7f63d1dc
parent    3e5300e0ed3c4b49e3b0dab7daded1e3bfaaded7 (diff)
parent    a91258913fb597db7f409f3534512cf2249bceb6 (diff)
Merge branch 'zfsonlinux/merge-spl'
Merge a minimal version of the zfsonlinux/spl repository into the zfsonlinux/zfs repository. Care was taken to prevent file conflicts when merging and to preserve the spl repository history. The spl kernel module remains under the GPLv2 license as documented by the additional THIRDPARTYLICENSE.gplv2 file.

Signed-off-by: Brian Behlendorf <[email protected]>
-rw-r--r--  config/kernel-ctl-table-name.m4  18
-rw-r--r--  config/kernel-fallocate-pax.m4  19
-rw-r--r--  config/kernel-group-info.m4  21
-rw-r--r--  config/kernel-inode-lock.m4  23
-rw-r--r--  config/kernel-kmem-cache.m4  72
-rw-r--r--  config/kernel-kmem.m4  58
-rw-r--r--  config/kernel-kuidgid.m4  28
-rw-r--r--  config/kernel-pde-data.m4  17
-rw-r--r--  config/kernel-rw.m4  57
-rw-r--r--  config/kernel-rwsem.m4  75
-rw-r--r--  config/kernel-sched.m4  56
-rw-r--r--  config/kernel-set-fs-pwd.m4  39
-rw-r--r--  config/kernel-shrinker.m4  125
-rw-r--r--  config/kernel-spinlock.m4  24
-rw-r--r--  config/kernel-timer.m4  32
-rw-r--r--  config/kernel-trim-unused-symbols.m4  19
-rw-r--r--  config/kernel-urange-sleep.m4  21
-rw-r--r--  config/kernel-vfs-fsync.m4  17
-rw-r--r--  config/kernel-vfs-getattr.m4  62
-rw-r--r--  config/kernel-wait.m4  76
-rw-r--r--  config/kernel-zlib.m4  63
-rw-r--r--  include/spl/rpc/xdr.h  156
-rw-r--r--  include/spl/sys/acl.h  119
-rw-r--r--  include/spl/sys/atomic.h  79
-rw-r--r--  include/spl/sys/byteorder.h  78
-rw-r--r--  include/spl/sys/callb.h  54
-rw-r--r--  include/spl/sys/callo.h  52
-rw-r--r--  include/spl/sys/cmn_err.h  42
-rw-r--r--  include/spl/sys/condvar.h  80
-rw-r--r--  include/spl/sys/console.h  44
-rw-r--r--  include/spl/sys/cred.h  75
-rw-r--r--  include/spl/sys/ctype.h  30
-rw-r--r--  include/spl/sys/debug.h  131
-rw-r--r--  include/spl/sys/disp.h  34
-rw-r--r--  include/spl/sys/dkio.h  40
-rw-r--r--  include/spl/sys/dkioc_free_util.h  58
-rw-r--r--  include/spl/sys/fcntl.h  37
-rw-r--r--  include/spl/sys/file.h  52
-rw-r--r--  include/spl/sys/inttypes.h  28
-rw-r--r--  include/spl/sys/isa_defs.h  229
-rw-r--r--  include/spl/sys/kmem.h  185
-rw-r--r--  include/spl/sys/kmem_cache.h  240
-rw-r--r--  include/spl/sys/kobj.h  42
-rw-r--r--  include/spl/sys/kstat.h  208
-rw-r--r--  include/spl/sys/list.h  208
-rw-r--r--  include/spl/sys/mode.h  32
-rw-r--r--  include/spl/sys/mutex.h  184
-rw-r--r--  include/spl/sys/param.h  36
-rw-r--r--  include/spl/sys/proc.h  35
-rw-r--r--  include/spl/sys/processor.h  32
-rw-r--r--  include/spl/sys/random.h  40
-rw-r--r--  include/spl/sys/rwlock.h  273
-rw-r--r--  include/spl/sys/shrinker.h  209
-rw-r--r--  include/spl/sys/sid.h  61
-rw-r--r--  include/spl/sys/signal.h  55
-rw-r--r--  include/spl/sys/stat.h  30
-rw-r--r--  include/spl/sys/strings.h  31
-rw-r--r--  include/spl/sys/sunddi.h  58
-rw-r--r--  include/spl/sys/sysmacros.h  228
-rw-r--r--  include/spl/sys/systeminfo.h  36
-rw-r--r--  include/spl/sys/taskq.h  163
-rw-r--r--  include/spl/sys/thread.h  69
-rw-r--r--  include/spl/sys/time.h  82
-rw-r--r--  include/spl/sys/timer.h  75
-rw-r--r--  include/spl/sys/tsd.h  46
-rw-r--r--  include/spl/sys/types.h  70
-rw-r--r--  include/spl/sys/types32.h  35
-rw-r--r--  include/spl/sys/uio.h  106
-rw-r--r--  include/spl/sys/user.h  42
-rw-r--r--  include/spl/sys/vfs.h  51
-rw-r--r--  include/spl/sys/vmem.h  109
-rw-r--r--  include/spl/sys/vmsystm.h  84
-rw-r--r--  include/spl/sys/vnode.h  204
-rw-r--r--  include/spl/sys/wait.h  55
-rw-r--r--  include/spl/sys/zmod.h  78
-rw-r--r--  include/spl/sys/zone.h  36
-rw-r--r--  man/man5/spl-module-parameters.5  357
-rw-r--r--  module/spl/THIRDPARTYLICENSE.gplv2  339
-rw-r--r--  module/spl/THIRDPARTYLICENSE.gplv2.descrip  1
-rw-r--r--  module/spl/spl-atomic.c  36
-rw-r--r--  module/spl/spl-condvar.c  410
-rw-r--r--  module/spl/spl-cred.c  200
-rw-r--r--  module/spl/spl-err.c  133
-rw-r--r--  module/spl/spl-generic.c  775
-rw-r--r--  module/spl/spl-kmem-cache.c  1769
-rw-r--r--  module/spl/spl-kmem.c  567
-rw-r--r--  module/spl/spl-kobj.c  86
-rw-r--r--  module/spl/spl-kstat.c  733
-rw-r--r--  module/spl/spl-mutex.c  30
-rw-r--r--  module/spl/spl-proc.c  782
-rw-r--r--  module/spl/spl-rwlock.c  114
-rw-r--r--  module/spl/spl-taskq.c  1305
-rw-r--r--  module/spl/spl-thread.c  160
-rw-r--r--  module/spl/spl-tsd.c  720
-rw-r--r--  module/spl/spl-vmem.c  135
-rw-r--r--  module/spl/spl-vnode.c  779
-rw-r--r--  module/spl/spl-xdr.c  515
-rw-r--r--  module/spl/spl-zlib.c  217
98 files changed, 16031 insertions(+), 0 deletions(-)
diff --git a/config/kernel-ctl-table-name.m4 b/config/kernel-ctl-table-name.m4
new file mode 100644
index 000000000..8dd2e77cb
--- /dev/null
+++ b/config/kernel-ctl-table-name.m4
@@ -0,0 +1,18 @@
+dnl #
+dnl # 2.6.33 API change,
+dnl # Removed .ctl_name from struct ctl_table.
+dnl #
+AC_DEFUN([SPL_AC_CTL_NAME], [
+ AC_MSG_CHECKING([whether struct ctl_table has ctl_name])
+ SPL_LINUX_TRY_COMPILE([
+ #include <linux/sysctl.h>
+ ],[
+ struct ctl_table ctl __attribute__ ((unused));
+ ctl.ctl_name = 0;
+ ],[
+ AC_MSG_RESULT(yes)
+ AC_DEFINE(HAVE_CTL_NAME, 1, [struct ctl_table has ctl_name])
+ ],[
+ AC_MSG_RESULT(no)
+ ])
+])
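
The HAVE_CTL_NAME define produced by this check is consumed by guarding any assignment to the removed field. A minimal sketch of a hypothetical consumer (illustrative only, not part of this patch; CTL_UNNUMBERED only exists on pre-2.6.33 kernels):

    static struct ctl_table spl_example_table[] = {
            {
    #ifdef HAVE_CTL_NAME
                    .ctl_name = CTL_UNNUMBERED,  /* field removed in 2.6.33 */
    #endif
                    .procname = "example",
                    .mode     = 0444,
            },
            {},
    };
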
diff --git a/config/kernel-fallocate-pax.m4 b/config/kernel-fallocate-pax.m4
new file mode 100644
index 000000000..ac75a4c8e
--- /dev/null
+++ b/config/kernel-fallocate-pax.m4
@@ -0,0 +1,19 @@
+dnl #
+dnl # PaX Linux 2.6.38 - 3.x API
+dnl #
+AC_DEFUN([SPL_AC_PAX_KERNEL_FILE_FALLOCATE], [
+ AC_MSG_CHECKING([whether fops->fallocate() exists])
+ SPL_LINUX_TRY_COMPILE([
+ #include <linux/fs.h>
+ ],[
+ long (*fallocate) (struct file *, int, loff_t, loff_t) = NULL;
+ struct file_operations_no_const fops __attribute__ ((unused)) = {
+ .fallocate = fallocate,
+ };
+ ],[
+ AC_MSG_RESULT(yes)
+ AC_DEFINE(HAVE_FILE_FALLOCATE, 1, [fops->fallocate() exists])
+ ],[
+ AC_MSG_RESULT(no)
+ ])
+])
diff --git a/config/kernel-group-info.m4 b/config/kernel-group-info.m4
new file mode 100644
index 000000000..4db2bba5c
--- /dev/null
+++ b/config/kernel-group-info.m4
@@ -0,0 +1,21 @@
+dnl #
+dnl # 4.9 API change
+dnl # group_info changed from 2d array via ->blocks to 1d array via ->gid
+dnl #
+AC_DEFUN([SPL_AC_GROUP_INFO_GID], [
+ AC_MSG_CHECKING([whether group_info->gid exists])
+ tmp_flags="$EXTRA_KCFLAGS"
+ EXTRA_KCFLAGS="-Werror"
+ SPL_LINUX_TRY_COMPILE([
+ #include <linux/cred.h>
+ ],[
+ struct group_info *gi = groups_alloc(1);
+ gi->gid[0] = KGIDT_INIT(0);
+ ],[
+ AC_MSG_RESULT(yes)
+ AC_DEFINE(HAVE_GROUP_INFO_GID, 1, [group_info->gid exists])
+ ],[
+ AC_MSG_RESULT(no)
+ ])
+ EXTRA_KCFLAGS="$tmp_flags"
+])
diff --git a/config/kernel-inode-lock.m4 b/config/kernel-inode-lock.m4
new file mode 100644
index 000000000..2cc06a5ec
--- /dev/null
+++ b/config/kernel-inode-lock.m4
@@ -0,0 +1,23 @@
+dnl #
+dnl # 4.7 API change
+dnl # i_mutex is changed to i_rwsem. Instead of directly using
+dnl # i_mutex/i_rwsem, we should use inode_lock() and inode_lock_shared()
+dnl # We test inode_lock_shared() because inode_lock() was introduced earlier.
+dnl #
+AC_DEFUN([SPL_AC_INODE_LOCK], [
+ AC_MSG_CHECKING([whether inode_lock_shared() exists])
+ tmp_flags="$EXTRA_KCFLAGS"
+ EXTRA_KCFLAGS="-Werror"
+ SPL_LINUX_TRY_COMPILE([
+ #include <linux/fs.h>
+ ],[
+ struct inode *inode = NULL;
+ inode_lock_shared(inode);
+ ],[
+ AC_MSG_RESULT(yes)
+ AC_DEFINE(HAVE_INODE_LOCK_SHARED, 1, [yes])
+ ],[
+ AC_MSG_RESULT(no)
+ ])
+ EXTRA_KCFLAGS="$tmp_flags"
+])
diff --git a/config/kernel-kmem-cache.m4 b/config/kernel-kmem-cache.m4
new file mode 100644
index 000000000..50a7fdb4b
--- /dev/null
+++ b/config/kernel-kmem-cache.m4
@@ -0,0 +1,72 @@
+dnl #
+dnl # 2.6.35 API change,
+dnl # The cachep->gfpflags member was renamed cachep->allocflags. These are
+dnl # private allocation flags which are applied when allocating a new slab
+dnl # in kmem_getpages(). Unfortunately there is no public API for setting
+dnl # non-default flags.
+dnl #
+AC_DEFUN([SPL_AC_KMEM_CACHE_ALLOCFLAGS], [
+ AC_MSG_CHECKING([whether struct kmem_cache has allocflags])
+ SPL_LINUX_TRY_COMPILE([
+ #include <linux/slab.h>
+ ],[
+ struct kmem_cache cachep __attribute__ ((unused));
+ cachep.allocflags = GFP_KERNEL;
+ ],[
+ AC_MSG_RESULT(yes)
+ AC_DEFINE(HAVE_KMEM_CACHE_ALLOCFLAGS, 1,
+ [struct kmem_cache has allocflags])
+ ],[
+ AC_MSG_RESULT(no)
+
+ AC_MSG_CHECKING([whether struct kmem_cache has gfpflags])
+ SPL_LINUX_TRY_COMPILE([
+ #include <linux/slab.h>
+ ],[
+ struct kmem_cache cachep __attribute__ ((unused));
+ cachep.gfpflags = GFP_KERNEL;
+ ],[
+ AC_MSG_RESULT(yes)
+ AC_DEFINE(HAVE_KMEM_CACHE_GFPFLAGS, 1,
+ [struct kmem_cache has gfpflags])
+ ],[
+ AC_MSG_RESULT(no)
+ ])
+ ])
+])
+
+dnl #
+dnl # grsecurity API change,
+dnl # kmem_cache_create() with SLAB_USERCOPY flag replaced by
+dnl # kmem_cache_create_usercopy().
+dnl #
+AC_DEFUN([SPL_AC_KMEM_CACHE_CREATE_USERCOPY], [
+ AC_MSG_CHECKING([whether kmem_cache_create_usercopy() exists])
+ tmp_flags="$EXTRA_KCFLAGS"
+ EXTRA_KCFLAGS="-Werror"
+ SPL_LINUX_TRY_COMPILE([
+ #include <linux/slab.h>
+ static void ctor(void *foo)
+ {
+ // fake ctor
+ }
+ ],[
+ struct kmem_cache *skc_linux_cache;
+ const char *name = "test";
+ size_t size = 4096;
+ size_t align = 8;
+ unsigned long flags = 0;
+ size_t useroffset = 0;
+ size_t usersize = size - useroffset;
+
+ skc_linux_cache = kmem_cache_create_usercopy(
+ name, size, align, flags, useroffset, usersize, ctor);
+ ],[
+ AC_MSG_RESULT(yes)
+ AC_DEFINE(HAVE_KMEM_CACHE_CREATE_USERCOPY, 1,
+ [kmem_cache_create_usercopy() exists])
+ ],[
+ AC_MSG_RESULT(no)
+ ])
+ EXTRA_KCFLAGS="$tmp_flags"
+])
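
When HAVE_KMEM_CACHE_CREATE_USERCOPY is defined, the cache-creation path can call the hardened allocator directly; otherwise it falls back to kmem_cache_create(). A hedged sketch of that branch (hypothetical wrapper name, illustrative only):

    static struct kmem_cache *
    spl_example_cache_create(const char *name, size_t size)
    {
    #ifdef HAVE_KMEM_CACHE_CREATE_USERCOPY
            /* Whitelist the whole object for copies to/from user space. */
            return (kmem_cache_create_usercopy(name, size, 0, 0, 0, size, NULL));
    #else
            return (kmem_cache_create(name, size, 0, 0, NULL));
    #endif
    }
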
diff --git a/config/kernel-kmem.m4 b/config/kernel-kmem.m4
new file mode 100644
index 000000000..cc055e530
--- /dev/null
+++ b/config/kernel-kmem.m4
@@ -0,0 +1,58 @@
+dnl #
+dnl # Enabled by default, it provides a minimal level of memory tracking.
+dnl # A total count of bytes allocated is kept for each alloc and free.
+dnl # Then, at module unload time, a report will be printed to the console
+dnl # if memory was leaked.
+dnl #
+AC_DEFUN([SPL_AC_DEBUG_KMEM], [
+ AC_ARG_ENABLE([debug-kmem],
+ [AS_HELP_STRING([--enable-debug-kmem],
+ [Enable basic kmem accounting @<:@default=no@:>@])],
+ [],
+ [enable_debug_kmem=no])
+
+ AS_IF([test "x$enable_debug_kmem" = xyes],
+ [
+ KERNELCPPFLAGS="${KERNELCPPFLAGS} -DDEBUG_KMEM"
+ DEBUG_KMEM="_with_debug_kmem"
+ AC_DEFINE([DEBUG_KMEM], [1],
+ [Define to 1 to enable basic kmem accounting])
+ ], [
+ DEBUG_KMEM="_without_debug_kmem"
+ ])
+
+ AC_SUBST(DEBUG_KMEM)
+ AC_MSG_CHECKING([whether basic kmem accounting is enabled])
+ AC_MSG_RESULT([$enable_debug_kmem])
+])
+
+dnl #
+dnl # Disabled by default, it provides detailed memory tracking. This
+dnl # feature also requires --enable-debug-kmem to be set. When enabled,
+dnl # not only will total bytes be tracked but also the location of every
+dnl # alloc and free. When the SPL module is unloaded, a list of all leaked
+dnl # addresses and where they were allocated will be dumped to the console.
+dnl # Enabling this feature has a significant impact on performance, but it
+dnl # makes finding memory leaks straightforward.
+dnl #
+AC_DEFUN([SPL_AC_DEBUG_KMEM_TRACKING], [
+ AC_ARG_ENABLE([debug-kmem-tracking],
+ [AS_HELP_STRING([--enable-debug-kmem-tracking],
+ [Enable detailed kmem tracking @<:@default=no@:>@])],
+ [],
+ [enable_debug_kmem_tracking=no])
+
+ AS_IF([test "x$enable_debug_kmem_tracking" = xyes],
+ [
+ KERNELCPPFLAGS="${KERNELCPPFLAGS} -DDEBUG_KMEM_TRACKING"
+ DEBUG_KMEM_TRACKING="_with_debug_kmem_tracking"
+ AC_DEFINE([DEBUG_KMEM_TRACKING], [1],
+ [Define to 1 to enable detailed kmem tracking])
+ ], [
+ DEBUG_KMEM_TRACKING="_without_debug_kmem_tracking"
+ ])
+
+ AC_SUBST(DEBUG_KMEM_TRACKING)
+ AC_MSG_CHECKING([whether detailed kmem tracking is enabled])
+ AC_MSG_RESULT([$enable_debug_kmem_tracking])
+])
diff --git a/config/kernel-kuidgid.m4 b/config/kernel-kuidgid.m4
new file mode 100644
index 000000000..47d193783
--- /dev/null
+++ b/config/kernel-kuidgid.m4
@@ -0,0 +1,28 @@
+dnl #
+dnl # User namespaces: use kuid_t in place of uid_t
+dnl # where available. Not strictly a user-namespaces thing,
+dnl # but it should prevent surprises.
+dnl #
+AC_DEFUN([SPL_AC_KUIDGID_T], [
+ AC_MSG_CHECKING([whether kuid_t/kgid_t is available])
+ SPL_LINUX_TRY_COMPILE([
+ #include <linux/uidgid.h>
+ ], [
+ kuid_t userid = KUIDT_INIT(0);
+ kgid_t groupid = KGIDT_INIT(0);
+ ],[
+ SPL_LINUX_TRY_COMPILE([
+ #include <linux/uidgid.h>
+ ], [
+ kuid_t userid = 0;
+ kgid_t groupid = 0;
+ ],[
+ AC_MSG_RESULT(yes; optional)
+ ],[
+ AC_MSG_RESULT(yes; mandatory)
+ AC_DEFINE(HAVE_KUIDGID_T, 1, [kuid_t/kgid_t in use])
+ ])
+ ],[
+ AC_MSG_RESULT(no)
+ ])
+])
diff --git a/config/kernel-pde-data.m4 b/config/kernel-pde-data.m4
new file mode 100644
index 000000000..6aa5765c3
--- /dev/null
+++ b/config/kernel-pde-data.m4
@@ -0,0 +1,17 @@
+dnl #
+dnl # 3.10 API change,
+dnl # PDE is replaced by PDE_DATA
+dnl #
+AC_DEFUN([SPL_AC_PDE_DATA], [
+ AC_MSG_CHECKING([whether PDE_DATA() is available])
+ SPL_LINUX_TRY_COMPILE_SYMBOL([
+ #include <linux/proc_fs.h>
+ ], [
+ PDE_DATA(NULL);
+ ], [PDE_DATA], [], [
+ AC_MSG_RESULT(yes)
+ AC_DEFINE(HAVE_PDE_DATA, 1, [yes])
+ ],[
+ AC_MSG_RESULT(no)
+ ])
+])
diff --git a/config/kernel-rw.m4 b/config/kernel-rw.m4
new file mode 100644
index 000000000..23c14b70f
--- /dev/null
+++ b/config/kernel-rw.m4
@@ -0,0 +1,57 @@
+dnl #
+dnl # 4.14 API change
+dnl # kernel_write(), which was introduced in 3.9, was updated to take
+dnl # the offset as a pointer, which is needed by vn_rdwr().
+dnl #
+AC_DEFUN([SPL_AC_KERNEL_WRITE], [
+ AC_MSG_CHECKING([whether kernel_write() takes loff_t pointer])
+ tmp_flags="$EXTRA_KCFLAGS"
+ EXTRA_KCFLAGS="-Werror"
+ SPL_LINUX_TRY_COMPILE([
+ #include <linux/fs.h>
+ ],[
+ struct file *file = NULL;
+ const void *buf = NULL;
+ size_t count = 0;
+ loff_t *pos = NULL;
+ ssize_t ret;
+
+ ret = kernel_write(file, buf, count, pos);
+ ],[
+ AC_MSG_RESULT(yes)
+ AC_DEFINE(HAVE_KERNEL_WRITE_PPOS, 1,
+ [kernel_write() takes loff_t pointer])
+ ],[
+ AC_MSG_RESULT(no)
+ ])
+ EXTRA_KCFLAGS="$tmp_flags"
+])
+
+dnl #
+dnl # 4.14 API change
+dnl # kernel_read(), which has existed forever, was updated to take
+dnl # the offset as a pointer, which is needed by vn_rdwr().
+dnl #
+AC_DEFUN([SPL_AC_KERNEL_READ], [
+ AC_MSG_CHECKING([whether kernel_read() takes loff_t pointer])
+ tmp_flags="$EXTRA_KCFLAGS"
+ EXTRA_KCFLAGS="-Werror"
+ SPL_LINUX_TRY_COMPILE([
+ #include <linux/fs.h>
+ ],[
+ struct file *file = NULL;
+ void *buf = NULL;
+ size_t count = 0;
+ loff_t *pos = NULL;
+ ssize_t ret;
+
+ ret = kernel_read(file, buf, count, pos);
+ ],[
+ AC_MSG_RESULT(yes)
+ AC_DEFINE(HAVE_KERNEL_READ_PPOS, 1,
+ [kernel_read() takes loff_t pointer])
+ ],[
+ AC_MSG_RESULT(no)
+ ])
+ EXTRA_KCFLAGS="$tmp_flags"
+])
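
HAVE_KERNEL_WRITE_PPOS and HAVE_KERNEL_READ_PPOS let vn_rdwr() call the 4.14 interfaces directly and emulate the pointer-offset behaviour on older kernels. An illustrative compat wrapper, assuming the older by-value signature (sketch only, not the actual module code):

    static ssize_t
    spl_example_kernel_write(struct file *file, const void *buf, size_t count,
        loff_t *pos)
    {
    #ifdef HAVE_KERNEL_WRITE_PPOS
            return (kernel_write(file, buf, count, pos));
    #else
            ssize_t ret = kernel_write(file, buf, count, *pos);
            if (ret > 0)
                    *pos += ret;
            return (ret);
    #endif
    }
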
diff --git a/config/kernel-rwsem.m4 b/config/kernel-rwsem.m4
new file mode 100644
index 000000000..aee20ae90
--- /dev/null
+++ b/config/kernel-rwsem.m4
@@ -0,0 +1,75 @@
+dnl #
+dnl # 3.1 API Change
+dnl #
+dnl # The rw_semaphore.wait_lock member was changed from spinlock_t to
+dnl # raw_spinlock_t at commit ddb6c9b58a19edcfac93ac670b066c836ff729f1.
+dnl #
+AC_DEFUN([SPL_AC_RWSEM_SPINLOCK_IS_RAW], [
+ AC_MSG_CHECKING([whether struct rw_semaphore member wait_lock is raw])
+ tmp_flags="$EXTRA_KCFLAGS"
+ EXTRA_KCFLAGS="-Werror"
+ SPL_LINUX_TRY_COMPILE([
+ #include <linux/rwsem.h>
+ ],[
+ struct rw_semaphore dummy_semaphore __attribute__ ((unused));
+ raw_spinlock_t dummy_lock __attribute__ ((unused)) =
+ __RAW_SPIN_LOCK_INITIALIZER(dummy_lock);
+ dummy_semaphore.wait_lock = dummy_lock;
+ ],[
+ AC_MSG_RESULT(yes)
+ AC_DEFINE(RWSEM_SPINLOCK_IS_RAW, 1,
+ [struct rw_semaphore member wait_lock is raw_spinlock_t])
+ ],[
+ AC_MSG_RESULT(no)
+ ])
+ EXTRA_KCFLAGS="$tmp_flags"
+])
+
+dnl #
+dnl # 3.16 API Change
+dnl #
+dnl # rwsem-spinlock "->activity" changed to "->count"
+dnl #
+AC_DEFUN([SPL_AC_RWSEM_ACTIVITY], [
+ AC_MSG_CHECKING([whether struct rw_semaphore has member activity])
+ tmp_flags="$EXTRA_KCFLAGS"
+ EXTRA_KCFLAGS="-Werror"
+ SPL_LINUX_TRY_COMPILE([
+ #include <linux/rwsem.h>
+ ],[
+ struct rw_semaphore dummy_semaphore __attribute__ ((unused));
+ dummy_semaphore.activity = 0;
+ ],[
+ AC_MSG_RESULT(yes)
+ AC_DEFINE(HAVE_RWSEM_ACTIVITY, 1,
+ [struct rw_semaphore has member activity])
+ ],[
+ AC_MSG_RESULT(no)
+ ])
+ EXTRA_KCFLAGS="$tmp_flags"
+])
+
+dnl #
+dnl # 4.8 API Change
+dnl #
+dnl # rwsem "->count" changed to atomic_long_t type
+dnl #
+AC_DEFUN([SPL_AC_RWSEM_ATOMIC_LONG_COUNT], [
+ AC_MSG_CHECKING(
+ [whether struct rw_semaphore has atomic_long_t member count])
+ tmp_flags="$EXTRA_KCFLAGS"
+ EXTRA_KCFLAGS="-Werror"
+ SPL_LINUX_TRY_COMPILE([
+ #include <linux/rwsem.h>
+ ],[
+ DECLARE_RWSEM(dummy_semaphore);
+ (void) atomic_long_read(&dummy_semaphore.count);
+ ],[
+ AC_MSG_RESULT(yes)
+ AC_DEFINE(HAVE_RWSEM_ATOMIC_LONG_COUNT, 1,
+ [struct rw_semaphore has atomic_long_t member count])
+ ],[
+ AC_MSG_RESULT(no)
+ ])
+ EXTRA_KCFLAGS="$tmp_flags"
+])
diff --git a/config/kernel-sched.m4 b/config/kernel-sched.m4
new file mode 100644
index 000000000..5ae21676e
--- /dev/null
+++ b/config/kernel-sched.m4
@@ -0,0 +1,56 @@
+dnl #
+dnl # 3.9 API change,
+dnl # Moved things from linux/sched.h to linux/sched/rt.h
+dnl #
+AC_DEFUN([SPL_AC_SCHED_RT_HEADER],
+ [AC_MSG_CHECKING([whether header linux/sched/rt.h exists])
+ SPL_LINUX_TRY_COMPILE([
+ #include <linux/sched.h>
+ #include <linux/sched/rt.h>
+ ],[
+ return 0;
+ ],[
+ AC_DEFINE(HAVE_SCHED_RT_HEADER, 1, [linux/sched/rt.h exists])
+ AC_MSG_RESULT(yes)
+ ],[
+ AC_MSG_RESULT(no)
+ ])
+])
+
+dnl #
+dnl # 4.11 API change,
+dnl # Moved things from linux/sched.h to linux/sched/signal.h
+dnl #
+AC_DEFUN([SPL_AC_SCHED_SIGNAL_HEADER],
+ [AC_MSG_CHECKING([whether header linux/sched/signal.h exists])
+ SPL_LINUX_TRY_COMPILE([
+ #include <linux/sched.h>
+ #include <linux/sched/signal.h>
+ ],[
+ return 0;
+ ],[
+ AC_DEFINE(HAVE_SCHED_SIGNAL_HEADER, 1, [linux/sched/signal.h exists])
+ AC_MSG_RESULT(yes)
+ ],[
+ AC_MSG_RESULT(no)
+ ])
+])
+dnl #
+dnl # 3.19 API change
+dnl # The io_schedule_timeout() function is present in all 2.6.32 kernels
+dnl # but it was not exported until Linux 3.19. The RHEL 7.x kernels which
+dnl # are based on a 3.10 kernel do export this symbol.
+dnl #
+AC_DEFUN([SPL_AC_IO_SCHEDULE_TIMEOUT], [
+ AC_MSG_CHECKING([whether io_schedule_timeout() is available])
+ SPL_LINUX_TRY_COMPILE_SYMBOL([
+ #include <linux/sched.h>
+ ], [
+ (void) io_schedule_timeout(1);
+ ], [io_schedule_timeout], [], [
+ AC_MSG_RESULT(yes)
+ AC_DEFINE(HAVE_IO_SCHEDULE_TIMEOUT, 1, [yes])
+ ],[
+ AC_MSG_RESULT(no)
+ ])
+])
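
The two header checks above translate into conditional includes wherever the scheduler definitions are needed; for example (sketch):

    #include <linux/sched.h>
    #ifdef HAVE_SCHED_SIGNAL_HEADER
    #include <linux/sched/signal.h>    /* signal_pending() and friends since 4.11 */
    #endif
    #ifdef HAVE_SCHED_RT_HEADER
    #include <linux/sched/rt.h>        /* realtime scheduling definitions since 3.9 */
    #endif
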
diff --git a/config/kernel-set-fs-pwd.m4 b/config/kernel-set-fs-pwd.m4
new file mode 100644
index 000000000..849e7e6cb
--- /dev/null
+++ b/config/kernel-set-fs-pwd.m4
@@ -0,0 +1,39 @@
+dnl #
+dnl # 3.9 API change
+dnl # set_fs_pwd takes const struct path *
+dnl #
+AC_DEFUN([SPL_AC_SET_FS_PWD_WITH_CONST],
+ tmp_flags="$EXTRA_KCFLAGS"
+ EXTRA_KCFLAGS="-Werror"
+ [AC_MSG_CHECKING([whether set_fs_pwd() requires const struct path *])
+ SPL_LINUX_TRY_COMPILE([
+ #include <linux/spinlock.h>
+ #include <linux/fs_struct.h>
+ #include <linux/path.h>
+ void (*const set_fs_pwd_func)
+ (struct fs_struct *, const struct path *)
+ = set_fs_pwd;
+ ],[
+ return 0;
+ ],[
+ AC_MSG_RESULT(yes)
+ AC_DEFINE(HAVE_SET_FS_PWD_WITH_CONST, 1,
+ [set_fs_pwd() needs const path *])
+ ],[
+ SPL_LINUX_TRY_COMPILE([
+ #include <linux/spinlock.h>
+ #include <linux/fs_struct.h>
+ #include <linux/path.h>
+ void (*const set_fs_pwd_func)
+ (struct fs_struct *, struct path *)
+ = set_fs_pwd;
+ ],[
+ return 0;
+ ],[
+ AC_MSG_RESULT(no)
+ ],[
+ AC_MSG_ERROR(unknown)
+ ])
+ ])
+ EXTRA_KCFLAGS="$tmp_flags"
+])
diff --git a/config/kernel-shrinker.m4 b/config/kernel-shrinker.m4
new file mode 100644
index 000000000..6fc9b5422
--- /dev/null
+++ b/config/kernel-shrinker.m4
@@ -0,0 +1,125 @@
+AC_DEFUN([SPL_AC_SHRINKER_CALLBACK],[
+ tmp_flags="$EXTRA_KCFLAGS"
+ EXTRA_KCFLAGS="-Werror"
+ dnl #
+ dnl # 2.6.23 to 2.6.34 API change
+ dnl # ->shrink(int nr_to_scan, gfp_t gfp_mask)
+ dnl #
+ AC_MSG_CHECKING([whether old 2-argument shrinker exists])
+ SPL_LINUX_TRY_COMPILE([
+ #include <linux/mm.h>
+
+ int shrinker_cb(int nr_to_scan, gfp_t gfp_mask);
+ ],[
+ struct shrinker cache_shrinker = {
+ .shrink = shrinker_cb,
+ .seeks = DEFAULT_SEEKS,
+ };
+ register_shrinker(&cache_shrinker);
+ ],[
+ AC_MSG_RESULT(yes)
+ AC_DEFINE(HAVE_2ARGS_OLD_SHRINKER_CALLBACK, 1,
+ [old shrinker callback wants 2 args])
+ ],[
+ AC_MSG_RESULT(no)
+ dnl #
+ dnl # 2.6.35 - 2.6.39 API change
+ dnl # ->shrink(struct shrinker *,
+ dnl # int nr_to_scan, gfp_t gfp_mask)
+ dnl #
+ AC_MSG_CHECKING([whether old 3-argument shrinker exists])
+ SPL_LINUX_TRY_COMPILE([
+ #include <linux/mm.h>
+
+ int shrinker_cb(struct shrinker *, int nr_to_scan,
+ gfp_t gfp_mask);
+ ],[
+ struct shrinker cache_shrinker = {
+ .shrink = shrinker_cb,
+ .seeks = DEFAULT_SEEKS,
+ };
+ register_shrinker(&cache_shrinker);
+ ],[
+ AC_MSG_RESULT(yes)
+ AC_DEFINE(HAVE_3ARGS_SHRINKER_CALLBACK, 1,
+ [old shrinker callback wants 3 args])
+ ],[
+ AC_MSG_RESULT(no)
+ dnl #
+ dnl # 3.0 - 3.11 API change
+ dnl # ->shrink(struct shrinker *,
+ dnl # struct shrink_control *sc)
+ dnl #
+ AC_MSG_CHECKING(
+ [whether new 2-argument shrinker exists])
+ SPL_LINUX_TRY_COMPILE([
+ #include <linux/mm.h>
+
+ int shrinker_cb(struct shrinker *,
+ struct shrink_control *sc);
+ ],[
+ struct shrinker cache_shrinker = {
+ .shrink = shrinker_cb,
+ .seeks = DEFAULT_SEEKS,
+ };
+ register_shrinker(&cache_shrinker);
+ ],[
+ AC_MSG_RESULT(yes)
+ AC_DEFINE(HAVE_2ARGS_NEW_SHRINKER_CALLBACK, 1,
+ [new shrinker callback wants 2 args])
+ ],[
+ AC_MSG_RESULT(no)
+ dnl #
+ dnl # 3.12 API change,
+ dnl # ->shrink() is logically split in to
+ dnl # ->count_objects() and ->scan_objects()
+ dnl #
+ AC_MSG_CHECKING(
+ [whether ->count_objects callback exists])
+ SPL_LINUX_TRY_COMPILE([
+ #include <linux/mm.h>
+
+ unsigned long shrinker_cb(
+ struct shrinker *,
+ struct shrink_control *sc);
+ ],[
+ struct shrinker cache_shrinker = {
+ .count_objects = shrinker_cb,
+ .scan_objects = shrinker_cb,
+ .seeks = DEFAULT_SEEKS,
+ };
+ register_shrinker(&cache_shrinker);
+ ],[
+ AC_MSG_RESULT(yes)
+ AC_DEFINE(HAVE_SPLIT_SHRINKER_CALLBACK,
+ 1, [->count_objects exists])
+ ],[
+ AC_MSG_ERROR(error)
+ ])
+ ])
+ ])
+ ])
+ EXTRA_KCFLAGS="$tmp_flags"
+])
+
+dnl #
+dnl # 2.6.39 API change,
+dnl # Shrinker adjust to use common shrink_control structure.
+dnl #
+AC_DEFUN([SPL_AC_SHRINK_CONTROL_STRUCT], [
+ AC_MSG_CHECKING([whether struct shrink_control exists])
+ SPL_LINUX_TRY_COMPILE([
+ #include <linux/mm.h>
+ ],[
+ struct shrink_control sc __attribute__ ((unused));
+
+ sc.nr_to_scan = 0;
+ sc.gfp_mask = GFP_KERNEL;
+ ],[
+ AC_MSG_RESULT(yes)
+ AC_DEFINE(HAVE_SHRINK_CONTROL_STRUCT, 1,
+ [struct shrink_control exists])
+ ],[
+ AC_MSG_RESULT(no)
+ ])
+])
diff --git a/config/kernel-spinlock.m4 b/config/kernel-spinlock.m4
new file mode 100644
index 000000000..136262d0e
--- /dev/null
+++ b/config/kernel-spinlock.m4
@@ -0,0 +1,24 @@
+dnl #
+dnl # 2.6.36 API change,
+dnl # The 'struct fs_struct->lock' was changed from a rwlock_t to
+dnl # a spinlock_t to improve the fastpath performance.
+dnl #
+AC_DEFUN([SPL_AC_FS_STRUCT_SPINLOCK], [
+ AC_MSG_CHECKING([whether struct fs_struct uses spinlock_t])
+ tmp_flags="$EXTRA_KCFLAGS"
+ EXTRA_KCFLAGS="-Werror"
+ SPL_LINUX_TRY_COMPILE([
+ #include <linux/sched.h>
+ #include <linux/fs_struct.h>
+ ],[
+ static struct fs_struct fs;
+ spin_lock_init(&fs.lock);
+ ],[
+ AC_MSG_RESULT(yes)
+ AC_DEFINE(HAVE_FS_STRUCT_SPINLOCK, 1,
+ [struct fs_struct uses spinlock_t])
+ ],[
+ AC_MSG_RESULT(no)
+ ])
+ EXTRA_KCFLAGS="$tmp_flags"
+])
diff --git a/config/kernel-timer.m4 b/config/kernel-timer.m4
new file mode 100644
index 000000000..93b5158b9
--- /dev/null
+++ b/config/kernel-timer.m4
@@ -0,0 +1,32 @@
+dnl #
+dnl # 4.15 API change
+dnl # https://lkml.org/lkml/2017/11/25/90
+dnl # Check if timer_list.func gets passed a timer_list or an unsigned long
+dnl # (older kernels). Also sanity check that the from_timer() and
+dnl # timer_setup() macros are available, since they will be used in the same
+dnl # newer kernels that support the new timer_list.func signature.
+dnl #
+AC_DEFUN([SPL_AC_KERNEL_TIMER_FUNCTION_TIMER_LIST], [
+ AC_MSG_CHECKING([whether timer_list.function gets a timer_list])
+ tmp_flags="$EXTRA_KCFLAGS"
+ EXTRA_KCFLAGS="-Werror"
+ SPL_LINUX_TRY_COMPILE([
+ #include <linux/timer.h>
+ void task_expire(struct timer_list *tl) {}
+ ],[
+ #ifndef from_timer
+ #error "No from_timer() macro"
+ #endif
+
+ struct timer_list timer;
+ timer.function = task_expire;
+ timer_setup(&timer, NULL, 0);
+ ],[
+ AC_MSG_RESULT(yes)
+ AC_DEFINE(HAVE_KERNEL_TIMER_FUNCTION_TIMER_LIST, 1,
+ [timer_list.function gets a timer_list])
+ ],[
+ AC_MSG_RESULT(no)
+ ])
+ EXTRA_KCFLAGS="$tmp_flags"
+])
diff --git a/config/kernel-trim-unused-symbols.m4 b/config/kernel-trim-unused-symbols.m4
new file mode 100644
index 000000000..d1ac2f3c8
--- /dev/null
+++ b/config/kernel-trim-unused-symbols.m4
@@ -0,0 +1,19 @@
+dnl #
+dnl # config trim unused symbols,
+dnl # Verify the kernel has CONFIG_TRIM_UNUSED_KSYMS DISABLED.
+dnl #
+AC_DEFUN([SPL_AC_CONFIG_TRIM_UNUSED_KSYMS], [
+ AC_MSG_CHECKING([whether CONFIG_TRIM_UNUSED_KSYMS is disabled])
+ SPL_LINUX_TRY_COMPILE([
+ #if defined(CONFIG_TRIM_UNUSED_KSYMS)
+ #error CONFIG_TRIM_UNUSED_KSYMS is defined
+ #endif
+ ],[ ],[
+ AC_MSG_RESULT([yes])
+ ],[
+ AC_MSG_RESULT([no])
+ AC_MSG_ERROR([
+ *** This kernel has unused symbols trimming enabled, please disable.
+ *** Rebuild the kernel with CONFIG_TRIM_UNUSED_KSYMS=n set.])
+ ])
+])
diff --git a/config/kernel-urange-sleep.m4 b/config/kernel-urange-sleep.m4
new file mode 100644
index 000000000..85beca6dd
--- /dev/null
+++ b/config/kernel-urange-sleep.m4
@@ -0,0 +1,21 @@
+dnl #
+dnl # 2.6.36 API compatibility.
+dnl # Added usleep_range timer.
+dnl # usleep_range is a finer precision implementation of msleep
+dnl # designed to be a drop-in replacement for udelay where a precise
+dnl # sleep / busy-wait is unnecessary.
+dnl #
+AC_DEFUN([SPL_AC_USLEEP_RANGE], [
+ AC_MSG_CHECKING([whether usleep_range() is available])
+ SPL_LINUX_TRY_COMPILE([
+ #include <linux/delay.h>
+ ],[
+ usleep_range(0, 0);
+ ],[
+ AC_MSG_RESULT(yes)
+ AC_DEFINE(HAVE_USLEEP_RANGE, 1,
+ [usleep_range is available])
+ ],[
+ AC_MSG_RESULT(no)
+ ])
+])
diff --git a/config/kernel-vfs-fsync.m4 b/config/kernel-vfs-fsync.m4
new file mode 100644
index 000000000..3c42bf1a0
--- /dev/null
+++ b/config/kernel-vfs-fsync.m4
@@ -0,0 +1,17 @@
+dnl #
+dnl # 2.6.35 API change,
+dnl # Unused 'struct dentry *' removed from vfs_fsync() prototype.
+dnl #
+AC_DEFUN([SPL_AC_2ARGS_VFS_FSYNC], [
+ AC_MSG_CHECKING([whether vfs_fsync() wants 2 args])
+ SPL_LINUX_TRY_COMPILE([
+ #include <linux/fs.h>
+ ],[
+ vfs_fsync(NULL, 0);
+ ],[
+ AC_MSG_RESULT(yes)
+ AC_DEFINE(HAVE_2ARGS_VFS_FSYNC, 1, [vfs_fsync() wants 2 args])
+ ],[
+ AC_MSG_RESULT(no)
+ ])
+])
diff --git a/config/kernel-vfs-getattr.m4 b/config/kernel-vfs-getattr.m4
new file mode 100644
index 000000000..7772cb514
--- /dev/null
+++ b/config/kernel-vfs-getattr.m4
@@ -0,0 +1,62 @@
+dnl #
+dnl # 4.11 API, a528d35e@torvalds/linux
+dnl # vfs_getattr(const struct path *p, struct kstat *s, u32 m, unsigned int f)
+dnl #
+AC_DEFUN([SPL_AC_4ARGS_VFS_GETATTR], [
+ AC_MSG_CHECKING([whether vfs_getattr() wants 4 args])
+ SPL_LINUX_TRY_COMPILE([
+ #include <linux/fs.h>
+ ],[
+ vfs_getattr((const struct path *)NULL,
+ (struct kstat *)NULL,
+ (u32)0,
+ (unsigned int)0);
+ ],[
+ AC_MSG_RESULT(yes)
+ AC_DEFINE(HAVE_4ARGS_VFS_GETATTR, 1,
+ [vfs_getattr wants 4 args])
+ ],[
+ AC_MSG_RESULT(no)
+ ])
+])
+
+dnl #
+dnl # 3.9 API
+dnl # vfs_getattr(struct path *p, struct kstat *s)
+dnl #
+AC_DEFUN([SPL_AC_2ARGS_VFS_GETATTR], [
+ AC_MSG_CHECKING([whether vfs_getattr() wants 2 args])
+ SPL_LINUX_TRY_COMPILE([
+ #include <linux/fs.h>
+ ],[
+ vfs_getattr((struct path *) NULL,
+ (struct kstat *)NULL);
+ ],[
+ AC_MSG_RESULT(yes)
+ AC_DEFINE(HAVE_2ARGS_VFS_GETATTR, 1,
+ [vfs_getattr wants 2 args])
+ ],[
+ AC_MSG_RESULT(no)
+ ])
+])
+
+dnl #
+dnl # <3.9 API
+dnl # vfs_getattr(struct vfsmount *v, struct dentry *d, struct kstat *k)
+dnl #
+AC_DEFUN([SPL_AC_3ARGS_VFS_GETATTR], [
+ AC_MSG_CHECKING([whether vfs_getattr() wants 3 args])
+ SPL_LINUX_TRY_COMPILE([
+ #include <linux/fs.h>
+ ],[
+ vfs_getattr((struct vfsmount *)NULL,
+ (struct dentry *)NULL,
+ (struct kstat *)NULL);
+ ],[
+ AC_MSG_RESULT(yes)
+ AC_DEFINE(HAVE_3ARGS_VFS_GETATTR, 1,
+ [vfs_getattr wants 3 args])
+ ],[
+ AC_MSG_RESULT(no)
+ ])
+])
diff --git a/config/kernel-wait.m4 b/config/kernel-wait.m4
new file mode 100644
index 000000000..5f718a160
--- /dev/null
+++ b/config/kernel-wait.m4
@@ -0,0 +1,76 @@
+dnl #
+dnl # 3.17 API change,
+dnl # wait_on_bit() no longer requires an action argument. The former
+dnl # "wait_on_bit" interface required an 'action' function to be provided
+dnl # which does the actual waiting. There were over 20 such functions in the
+dnl # kernel, many of them identical, though most cases can be satisfied by one
+dnl # of just two functions: one which uses io_schedule() and one which just
+dnl # uses schedule(). This API change was made to consolidate all of those
+dnl # redundant wait functions.
+dnl #
+AC_DEFUN([SPL_AC_WAIT_ON_BIT], [
+ AC_MSG_CHECKING([whether wait_on_bit() takes an action])
+ SPL_LINUX_TRY_COMPILE([
+ #include <linux/wait.h>
+ ],[
+ int (*action)(void *) = NULL;
+ wait_on_bit(NULL, 0, action, 0);
+ ],[
+ AC_MSG_RESULT(yes)
+ AC_DEFINE(HAVE_WAIT_ON_BIT_ACTION, 1, [yes])
+ ],[
+ AC_MSG_RESULT(no)
+ ])
+])
+dnl #
+dnl # 4.13 API change
+dnl # Renamed struct wait_queue -> struct wait_queue_entry.
+dnl #
+AC_DEFUN([SPL_AC_WAIT_QUEUE_ENTRY_T], [
+ AC_MSG_CHECKING([whether wait_queue_entry_t exists])
+ SPL_LINUX_TRY_COMPILE([
+ #include <linux/wait.h>
+ ],[
+ wait_queue_entry_t *entry __attribute__ ((unused));
+ ],[
+ AC_MSG_RESULT(yes)
+ AC_DEFINE(HAVE_WAIT_QUEUE_ENTRY_T, 1,
+ [wait_queue_entry_t exists])
+ ],[
+ AC_MSG_RESULT(no)
+ ])
+])
+
+dnl #
+dnl # 4.13 API change
+dnl # Renamed wait_queue_head::task_list -> wait_queue_head::head
+dnl # Renamed wait_queue_entry::task_list -> wait_queue_entry::entry
+dnl #
+AC_DEFUN([SPL_AC_WAIT_QUEUE_HEAD_ENTRY], [
+ AC_MSG_CHECKING([whether wq_head->head and wq_entry->entry exist])
+ SPL_LINUX_TRY_COMPILE([
+ #include <linux/wait.h>
+
+ #ifdef HAVE_WAIT_QUEUE_ENTRY_T
+ typedef wait_queue_head_t spl_wait_queue_head_t;
+ typedef wait_queue_entry_t spl_wait_queue_entry_t;
+ #else
+ typedef wait_queue_head_t spl_wait_queue_head_t;
+ typedef wait_queue_t spl_wait_queue_entry_t;
+ #endif
+ ],[
+ spl_wait_queue_head_t wq_head;
+ spl_wait_queue_entry_t wq_entry;
+ struct list_head *head __attribute__ ((unused));
+ struct list_head *entry __attribute__ ((unused));
+
+ head = &wq_head.head;
+ entry = &wq_entry.entry;
+ ],[
+ AC_MSG_RESULT(yes)
+ AC_DEFINE(HAVE_WAIT_QUEUE_HEAD_ENTRY, 1,
+ [wq_head->head and wq_entry->entry exist])
+ ],[
+ AC_MSG_RESULT(no)
+ ])
+])
diff --git a/config/kernel-zlib.m4 b/config/kernel-zlib.m4
new file mode 100644
index 000000000..bb236466a
--- /dev/null
+++ b/config/kernel-zlib.m4
@@ -0,0 +1,63 @@
+dnl #
+dnl # zlib inflate compat,
+dnl # Verify the kernel has CONFIG_ZLIB_INFLATE support enabled.
+dnl #
+AC_DEFUN([SPL_AC_CONFIG_ZLIB_INFLATE], [
+ AC_MSG_CHECKING([whether CONFIG_ZLIB_INFLATE is defined])
+ SPL_LINUX_TRY_COMPILE([
+ #if !defined(CONFIG_ZLIB_INFLATE) && \
+ !defined(CONFIG_ZLIB_INFLATE_MODULE)
+ #error CONFIG_ZLIB_INFLATE not defined
+ #endif
+ ],[ ],[
+ AC_MSG_RESULT([yes])
+ ],[
+ AC_MSG_RESULT([no])
+ AC_MSG_ERROR([
+ *** This kernel does not include the required zlib inflate support.
+ *** Rebuild the kernel with CONFIG_ZLIB_INFLATE=y|m set.])
+ ])
+])
+
+dnl #
+dnl # zlib deflate compat,
+dnl # Verify the kernel has CONFIG_ZLIB_DEFLATE support enabled.
+dnl #
+AC_DEFUN([SPL_AC_CONFIG_ZLIB_DEFLATE], [
+ AC_MSG_CHECKING([whether CONFIG_ZLIB_DEFLATE is defined])
+ SPL_LINUX_TRY_COMPILE([
+ #if !defined(CONFIG_ZLIB_DEFLATE) && \
+ !defined(CONFIG_ZLIB_DEFLATE_MODULE)
+ #error CONFIG_ZLIB_DEFLATE not defined
+ #endif
+ ],[ ],[
+ AC_MSG_RESULT([yes])
+ ],[
+ AC_MSG_RESULT([no])
+ AC_MSG_ERROR([
+ *** This kernel does not include the required zlib deflate support.
+ *** Rebuild the kernel with CONFIG_ZLIB_DEFLATE=y|m set.])
+ ])
+])
+
+dnl #
+dnl # 2.6.39 API compat,
+dnl # The function zlib_deflate_workspacesize() now takes 2 arguments.
+dnl # This was done to avoid always having to allocate the maximum size
+dnl # workspace (268K). The caller can now specify the windowBits and
+dnl # memLevel compression parameters to get a smaller workspace.
+dnl #
+AC_DEFUN([SPL_AC_2ARGS_ZLIB_DEFLATE_WORKSPACESIZE],
+ [AC_MSG_CHECKING([whether zlib_deflate_workspacesize() wants 2 args])
+ SPL_LINUX_TRY_COMPILE([
+ #include <linux/zlib.h>
+ ],[
+ return zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL);
+ ],[
+ AC_MSG_RESULT(yes)
+ AC_DEFINE(HAVE_2ARGS_ZLIB_DEFLATE_WORKSPACESIZE, 1,
+ [zlib_deflate_workspacesize() wants 2 args])
+ ],[
+ AC_MSG_RESULT(no)
+ ])
+])
diff --git a/include/spl/rpc/xdr.h b/include/spl/rpc/xdr.h
new file mode 100644
index 000000000..0b39b46cf
--- /dev/null
+++ b/include/spl/rpc/xdr.h
@@ -0,0 +1,156 @@
+/*
+ * Copyright (c) 2008 Sun Microsystems, Inc.
+ * Written by Ricardo Correia <[email protected]>
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _SPL_RPC_XDR_H
+#define _SPL_RPC_XDR_H
+
+#include <sys/types.h>
+
+typedef int bool_t;
+
+/*
+ * XDR enums and types.
+ */
+enum xdr_op {
+ XDR_ENCODE,
+ XDR_DECODE
+};
+
+struct xdr_ops;
+
+typedef struct {
+ struct xdr_ops *x_ops; /* Let caller know xdrmem_create() succeeds */
+ caddr_t x_addr; /* Current buffer addr */
+ caddr_t x_addr_end; /* End of the buffer */
+ enum xdr_op x_op; /* Stream direction */
+} XDR;
+
+typedef bool_t (*xdrproc_t)(XDR *xdrs, void *ptr);
+
+struct xdr_ops {
+ bool_t (*xdr_control)(XDR *, int, void *);
+
+ bool_t (*xdr_char)(XDR *, char *);
+ bool_t (*xdr_u_short)(XDR *, unsigned short *);
+ bool_t (*xdr_u_int)(XDR *, unsigned *);
+ bool_t (*xdr_u_longlong_t)(XDR *, u_longlong_t *);
+
+ bool_t (*xdr_opaque)(XDR *, caddr_t, const uint_t);
+ bool_t (*xdr_string)(XDR *, char **, const uint_t);
+ bool_t (*xdr_array)(XDR *, caddr_t *, uint_t *, const uint_t,
+ const uint_t, const xdrproc_t);
+};
+
+/*
+ * XDR control operator.
+ */
+#define XDR_GET_BYTES_AVAIL 1
+
+struct xdr_bytesrec {
+ bool_t xc_is_last_record;
+ size_t xc_num_avail;
+};
+
+/*
+ * XDR functions.
+ */
+void xdrmem_create(XDR *xdrs, const caddr_t addr, const uint_t size,
+ const enum xdr_op op);
+
+/* Currently not needed. If needed later, we'll add it to struct xdr_ops */
+#define xdr_destroy(xdrs) ((void) 0)
+
+#define xdr_control(xdrs, req, info) \
+ (xdrs)->x_ops->xdr_control((xdrs), (req), (info))
+
+/*
+ * As a precaution, the following are defined as static inlines instead of
+ * macros to get some amount of type safety.
+ *
+ * Also, macros wouldn't work in the case where typecasting is done, because it
+ * must be possible to reference the functions' addresses by these names.
+ */
+static inline bool_t xdr_char(XDR *xdrs, char *cp)
+{
+ return (xdrs->x_ops->xdr_char(xdrs, cp));
+}
+
+static inline bool_t xdr_u_short(XDR *xdrs, unsigned short *usp)
+{
+ return (xdrs->x_ops->xdr_u_short(xdrs, usp));
+}
+
+static inline bool_t xdr_short(XDR *xdrs, short *sp)
+{
+ BUILD_BUG_ON(sizeof (short) != 2);
+ return (xdrs->x_ops->xdr_u_short(xdrs, (unsigned short *) sp));
+}
+
+static inline bool_t xdr_u_int(XDR *xdrs, unsigned *up)
+{
+ return (xdrs->x_ops->xdr_u_int(xdrs, up));
+}
+
+static inline bool_t xdr_int(XDR *xdrs, int *ip)
+{
+ BUILD_BUG_ON(sizeof (int) != 4);
+ return (xdrs->x_ops->xdr_u_int(xdrs, (unsigned *)ip));
+}
+
+static inline bool_t xdr_u_longlong_t(XDR *xdrs, u_longlong_t *ullp)
+{
+ return (xdrs->x_ops->xdr_u_longlong_t(xdrs, ullp));
+}
+
+static inline bool_t xdr_longlong_t(XDR *xdrs, longlong_t *llp)
+{
+ BUILD_BUG_ON(sizeof (longlong_t) != 8);
+ return (xdrs->x_ops->xdr_u_longlong_t(xdrs, (u_longlong_t *)llp));
+}
+
+/*
+ * Fixed-length opaque data.
+ */
+static inline bool_t xdr_opaque(XDR *xdrs, caddr_t cp, const uint_t cnt)
+{
+ return (xdrs->x_ops->xdr_opaque(xdrs, cp, cnt));
+}
+
+/*
+ * Variable-length string.
+ * The *sp buffer must have (maxsize + 1) bytes.
+ */
+static inline bool_t xdr_string(XDR *xdrs, char **sp, const uint_t maxsize)
+{
+ return (xdrs->x_ops->xdr_string(xdrs, sp, maxsize));
+}
+
+/*
+ * Variable-length arrays.
+ */
+static inline bool_t xdr_array(XDR *xdrs, caddr_t *arrp, uint_t *sizep,
+ const uint_t maxsize, const uint_t elsize, const xdrproc_t elproc)
+{
+ return xdrs->x_ops->xdr_array(xdrs, arrp, sizep, maxsize, elsize,
+ elproc);
+}
+
+#endif /* SPL_RPC_XDR_H */
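
Because every operation dispatches through the x_ops table set up by xdrmem_create(), encode and decode share one call sequence and only the xdr_op argument differs. A hedged usage sketch (hypothetical function and values):

    static int
    example_xdr_roundtrip(void)
    {
            char buf[16];
            XDR xdrs;
            int value = 42, decoded;

            xdrmem_create(&xdrs, buf, sizeof (buf), XDR_ENCODE);
            if (!xdr_int(&xdrs, &value))
                    return (EINVAL);        /* stream error or buffer too small */

            xdrmem_create(&xdrs, buf, sizeof (buf), XDR_DECODE);
            if (!xdr_int(&xdrs, &decoded))
                    return (EINVAL);

            return (decoded == 42 ? 0 : EIO);
    }
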
diff --git a/include/spl/sys/acl.h b/include/spl/sys/acl.h
new file mode 100644
index 000000000..9fc79c025
--- /dev/null
+++ b/include/spl/sys/acl.h
@@ -0,0 +1,119 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <[email protected]>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _SPL_ACL_H
+#define _SPL_ACL_H
+
+#include <sys/types.h>
+
+typedef struct ace {
+ uid_t a_who;
+ uint32_t a_access_mask;
+ uint16_t a_flags;
+ uint16_t a_type;
+} ace_t;
+
+typedef struct ace_object {
+ uid_t a_who; /* uid or gid */
+ uint32_t a_access_mask; /* read,write,... */
+ uint16_t a_flags; /* see below */
+ uint16_t a_type; /* allow or deny */
+ uint8_t a_obj_type[16]; /* obj type */
+ uint8_t a_inherit_obj_type[16]; /* inherit obj */
+} ace_object_t;
+
+#define MAX_ACL_ENTRIES 1024
+
+#define ACE_READ_DATA 0x00000001
+#define ACE_LIST_DIRECTORY 0x00000001
+#define ACE_WRITE_DATA 0x00000002
+#define ACE_ADD_FILE 0x00000002
+#define ACE_APPEND_DATA 0x00000004
+#define ACE_ADD_SUBDIRECTORY 0x00000004
+#define ACE_READ_NAMED_ATTRS 0x00000008
+#define ACE_WRITE_NAMED_ATTRS 0x00000010
+#define ACE_EXECUTE 0x00000020
+#define ACE_DELETE_CHILD 0x00000040
+#define ACE_READ_ATTRIBUTES 0x00000080
+#define ACE_WRITE_ATTRIBUTES 0x00000100
+#define ACE_DELETE 0x00010000
+#define ACE_READ_ACL 0x00020000
+#define ACE_WRITE_ACL 0x00040000
+#define ACE_WRITE_OWNER 0x00080000
+#define ACE_SYNCHRONIZE 0x00100000
+
+#define ACE_FILE_INHERIT_ACE 0x0001
+#define ACE_DIRECTORY_INHERIT_ACE 0x0002
+#define ACE_NO_PROPAGATE_INHERIT_ACE 0x0004
+#define ACE_INHERIT_ONLY_ACE 0x0008
+#define ACE_SUCCESSFUL_ACCESS_ACE_FLAG 0x0010
+#define ACE_FAILED_ACCESS_ACE_FLAG 0x0020
+#define ACE_IDENTIFIER_GROUP 0x0040
+#define ACE_INHERITED_ACE 0x0080
+#define ACE_OWNER 0x1000
+#define ACE_GROUP 0x2000
+#define ACE_EVERYONE 0x4000
+
+#define ACE_ACCESS_ALLOWED_ACE_TYPE 0x0000
+#define ACE_ACCESS_DENIED_ACE_TYPE 0x0001
+#define ACE_SYSTEM_AUDIT_ACE_TYPE 0x0002
+#define ACE_SYSTEM_ALARM_ACE_TYPE 0x0003
+
+#define ACL_AUTO_INHERIT 0x0001
+#define ACL_PROTECTED 0x0002
+#define ACL_DEFAULTED 0x0004
+#define ACL_FLAGS_ALL (ACL_AUTO_INHERIT|ACL_PROTECTED|ACL_DEFAULTED)
+
+#define ACE_ACCESS_ALLOWED_COMPOUND_ACE_TYPE 0x04
+#define ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE 0x05
+#define ACE_ACCESS_DENIED_OBJECT_ACE_TYPE 0x06
+#define ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE 0x07
+#define ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE 0x08
+#define ACE_ACCESS_ALLOWED_CALLBACK_ACE_TYPE 0x09
+#define ACE_ACCESS_DENIED_CALLBACK_ACE_TYPE 0x0A
+#define ACE_ACCESS_ALLOWED_CALLBACK_OBJECT_ACE_TYPE 0x0B
+#define ACE_ACCESS_DENIED_CALLBACK_OBJECT_ACE_TYPE 0x0C
+#define ACE_SYSTEM_AUDIT_CALLBACK_ACE_TYPE 0x0D
+#define ACE_SYSTEM_ALARM_CALLBACK_ACE_TYPE 0x0E
+#define ACE_SYSTEM_AUDIT_CALLBACK_OBJECT_ACE_TYPE 0x0F
+#define ACE_SYSTEM_ALARM_CALLBACK_OBJECT_ACE_TYPE 0x10
+
+#define ACE_ALL_TYPES 0x001F
+
+#define ACE_TYPE_FLAGS (ACE_OWNER|ACE_GROUP|ACE_EVERYONE|ACE_IDENTIFIER_GROUP)
+
+/* BEGIN CSTYLED */
+#define ACE_ALL_PERMS (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \
+ ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_READ_NAMED_ATTRS| \
+ ACE_WRITE_NAMED_ATTRS|ACE_EXECUTE|ACE_DELETE_CHILD|ACE_READ_ATTRIBUTES| \
+ ACE_WRITE_ATTRIBUTES|ACE_DELETE|ACE_READ_ACL|ACE_WRITE_ACL| \
+ ACE_WRITE_OWNER|ACE_SYNCHRONIZE)
+/* END CSTYLED */
+
+#define VSA_ACE 0x0010
+#define VSA_ACECNT 0x0020
+#define VSA_ACE_ALLTYPES 0x0040
+#define VSA_ACE_ACLFLAGS 0x0080
+
+#endif /* _SPL_ACL_H */
diff --git a/include/spl/sys/atomic.h b/include/spl/sys/atomic.h
new file mode 100644
index 000000000..51b547923
--- /dev/null
+++ b/include/spl/sys/atomic.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <[email protected]>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _SPL_ATOMIC_H
+#define _SPL_ATOMIC_H
+
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <sys/types.h>
+
+/*
+ * Map the atomic_* functions to the Linux counterparts. This relies on the
+ * fact that the atomic types are internally really a uint32 or uint64. If
+ * this were to change an alternate approach would be needed.
+ *
+ * N.B. Due to the limitations of the original API atomicity is not strictly
+ * preserved when using the 64-bit functions on a 32-bit system. In order
+ * to support this all consumers would need to be updated to use the Linux
+ * provided atomic_t and atomic64_t types.
+ */
+#define atomic_inc_32(v) atomic_inc((atomic_t *)(v))
+#define atomic_dec_32(v) atomic_dec((atomic_t *)(v))
+#define atomic_add_32(v, i) atomic_add((i), (atomic_t *)(v))
+#define atomic_sub_32(v, i) atomic_sub((i), (atomic_t *)(v))
+#define atomic_inc_32_nv(v) atomic_inc_return((atomic_t *)(v))
+#define atomic_dec_32_nv(v) atomic_dec_return((atomic_t *)(v))
+#define atomic_add_32_nv(v, i) atomic_add_return((i), (atomic_t *)(v))
+#define atomic_sub_32_nv(v, i) atomic_sub_return((i), (atomic_t *)(v))
+#define atomic_cas_32(v, x, y) atomic_cmpxchg((atomic_t *)(v), x, y)
+#define atomic_swap_32(v, x) atomic_xchg((atomic_t *)(v), x)
+#define atomic_inc_64(v) atomic64_inc((atomic64_t *)(v))
+#define atomic_dec_64(v) atomic64_dec((atomic64_t *)(v))
+#define atomic_add_64(v, i) atomic64_add((i), (atomic64_t *)(v))
+#define atomic_sub_64(v, i) atomic64_sub((i), (atomic64_t *)(v))
+#define atomic_inc_64_nv(v) atomic64_inc_return((atomic64_t *)(v))
+#define atomic_dec_64_nv(v) atomic64_dec_return((atomic64_t *)(v))
+#define atomic_add_64_nv(v, i) atomic64_add_return((i), (atomic64_t *)(v))
+#define atomic_sub_64_nv(v, i) atomic64_sub_return((i), (atomic64_t *)(v))
+#define atomic_cas_64(v, x, y) atomic64_cmpxchg((atomic64_t *)(v), x, y)
+#define atomic_swap_64(v, x) atomic64_xchg((atomic64_t *)(v), x)
+
+#ifdef _LP64
+static __inline__ void *
+atomic_cas_ptr(volatile void *target, void *cmp, void *newval)
+{
+ return ((void *)atomic_cas_64((volatile uint64_t *)target,
+ (uint64_t)cmp, (uint64_t)newval));
+}
+#else /* _LP64 */
+static __inline__ void *
+atomic_cas_ptr(volatile void *target, void *cmp, void *newval)
+{
+ return ((void *)atomic_cas_32((volatile uint32_t *)target,
+ (uint32_t)cmp, (uint32_t)newval));
+}
+#endif /* _LP64 */
+
+#endif /* _SPL_ATOMIC_H */
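
Since the wrappers cast plain uint32_t/uint64_t storage to the Linux atomic types, callers keep the Solaris-style calling convention; for example (illustrative):

    static void
    example_atomics(void)
    {
            uint64_t refcount = 0;
            uint64_t nv;

            atomic_inc_64(&refcount);               /* refcount is now 1 */
            nv = atomic_add_64_nv(&refcount, 2);    /* nv == 3 */

            if (atomic_cas_64(&refcount, nv, 0) == nv) {
                    /* compare-and-swap succeeded, refcount is back to 0 */
            }
    }
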
diff --git a/include/spl/sys/byteorder.h b/include/spl/sys/byteorder.h
new file mode 100644
index 000000000..477707996
--- /dev/null
+++ b/include/spl/sys/byteorder.h
@@ -0,0 +1,78 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <[email protected]>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _SPL_BYTEORDER_H
+#define _SPL_BYTEORDER_H
+
+#include <asm/byteorder.h>
+#include <sys/isa_defs.h>
+
+#define BSWAP_8(x) ((x) & 0xff)
+#define BSWAP_16(x) ((BSWAP_8(x) << 8) | BSWAP_8((x) >> 8))
+#define BSWAP_32(x) ((BSWAP_16(x) << 16) | BSWAP_16((x) >> 16))
+#define BSWAP_64(x) ((BSWAP_32(x) << 32) | BSWAP_32((x) >> 32))
+
+#define LE_16(x) cpu_to_le16(x)
+#define LE_32(x) cpu_to_le32(x)
+#define LE_64(x) cpu_to_le64(x)
+#define BE_16(x) cpu_to_be16(x)
+#define BE_32(x) cpu_to_be32(x)
+#define BE_64(x) cpu_to_be64(x)
+
+#define BE_IN8(xa) \
+ *((uint8_t *)(xa))
+
+#define BE_IN16(xa) \
+ (((uint16_t)BE_IN8(xa) << 8) | BE_IN8((uint8_t *)(xa)+1))
+
+#define BE_IN32(xa) \
+ (((uint32_t)BE_IN16(xa) << 16) | BE_IN16((uint8_t *)(xa)+2))
+
+#ifdef _BIG_ENDIAN
+static __inline__ uint64_t
+htonll(uint64_t n)
+{
+ return (n);
+}
+
+static __inline__ uint64_t
+ntohll(uint64_t n)
+{
+ return (n);
+}
+#else
+static __inline__ uint64_t
+htonll(uint64_t n)
+{
+ return ((((uint64_t)htonl(n)) << 32) + htonl(n >> 32));
+}
+
+static __inline__ uint64_t
+ntohll(uint64_t n)
+{
+ return ((((uint64_t)ntohl(n)) << 32) + ntohl(n >> 32));
+}
+#endif
+
+#endif /* SPL_BYTEORDER_H */
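
The BE_IN* macros read big-endian values a byte at a time, so unaligned buffers are safe, while the LE_*/BE_* macros simply wrap the kernel's cpu_to_*() helpers. A short sketch (hypothetical function name):

    static uint32_t
    example_decode_be32(uint8_t *wire)
    {
            /* e.g. wire = { 0x12, 0x34, 0x56, 0x78 } decodes to 0x12345678 */
            uint32_t host = BE_IN32(wire);

            return (BE_32(host));   /* re-encoded in big-endian byte order */
    }
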
diff --git a/include/spl/sys/callb.h b/include/spl/sys/callb.h
new file mode 100644
index 000000000..f1826bfd3
--- /dev/null
+++ b/include/spl/sys/callb.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <[email protected]>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _SPL_CALLB_H
+#define _SPL_CALLB_H
+
+#include <linux/module.h>
+#include <sys/mutex.h>
+
+#define CALLB_CPR_ASSERT(cp) ASSERT(MUTEX_HELD((cp)->cc_lockp));
+
+typedef struct callb_cpr {
+ kmutex_t *cc_lockp;
+} callb_cpr_t;
+
+#define CALLB_CPR_INIT(cp, lockp, func, name) { \
+ (cp)->cc_lockp = lockp; \
+}
+
+#define CALLB_CPR_SAFE_BEGIN(cp) { \
+ CALLB_CPR_ASSERT(cp); \
+}
+
+#define CALLB_CPR_SAFE_END(cp, lockp) { \
+ CALLB_CPR_ASSERT(cp); \
+}
+
+#define CALLB_CPR_EXIT(cp) { \
+ ASSERT(MUTEX_HELD((cp)->cc_lockp)); \
+ mutex_exit((cp)->cc_lockp); \
+}
+
+#endif /* _SPL_CALLB_H */
diff --git a/include/spl/sys/callo.h b/include/spl/sys/callo.h
new file mode 100644
index 000000000..c43ac92e7
--- /dev/null
+++ b/include/spl/sys/callo.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright (C) 2007-2013 Lawrence Livermore National Security, LLC.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <[email protected]>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _SPL_CALLO_H
+#define _SPL_CALLO_H
+
+/*
+ * Callout flags:
+ *
+ * CALLOUT_FLAG_ROUNDUP
+ * Roundup the expiration time to the next resolution boundary.
+ * If this flag is not specified, the expiration time is rounded down.
+ * CALLOUT_FLAG_ABSOLUTE
+ * Normally, the expiration passed to the timeout API functions is an
+ * expiration interval. If this flag is specified, then it is
+ * interpreted as the expiration time itself.
+ * CALLOUT_FLAG_HRESTIME
+ * Normally, callouts are not affected by changes to system time
+ * (hrestime). This flag is used to create a callout that is affected
+ * by system time. If system time changes, these timers must be
+ * handled in a special way (see callout.c). These are used by condition
+ * variables and LWP timers that need this behavior.
+ * CALLOUT_FLAG_32BIT
+ * Legacy interfaces timeout() and realtime_timeout() pass this flag
+ * to timeout_generic() to indicate that a 32-bit ID should be allocated.
+ */
+#define CALLOUT_FLAG_ROUNDUP 0x1
+#define CALLOUT_FLAG_ABSOLUTE 0x2
+#define CALLOUT_FLAG_HRESTIME 0x4
+#define CALLOUT_FLAG_32BIT 0x8
+
+#endif /* _SPL_CALLO_H */
diff --git a/include/spl/sys/cmn_err.h b/include/spl/sys/cmn_err.h
new file mode 100644
index 000000000..be57358b0
--- /dev/null
+++ b/include/spl/sys/cmn_err.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <[email protected]>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _SPL_CMN_ERR_H
+#define _SPL_CMN_ERR_H
+
+#include <stdarg.h>
+
+#define CE_CONT 0 /* continuation */
+#define CE_NOTE 1 /* notice */
+#define CE_WARN 2 /* warning */
+#define CE_PANIC 3 /* panic */
+#define CE_IGNORE 4 /* print nothing */
+
+extern void cmn_err(int, const char *, ...);
+extern void vcmn_err(int, const char *, va_list);
+extern void vpanic(const char *, va_list);
+
+#define fm_panic panic
+
+#endif /* SPL_CMN_ERR_H */
diff --git a/include/spl/sys/condvar.h b/include/spl/sys/condvar.h
new file mode 100644
index 000000000..1d47cdd96
--- /dev/null
+++ b/include/spl/sys/condvar.h
@@ -0,0 +1,80 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <[email protected]>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _SPL_CONDVAR_H
+#define _SPL_CONDVAR_H
+
+#include <linux/module.h>
+#include <sys/kmem.h>
+#include <sys/mutex.h>
+#include <sys/callo.h>
+#include <sys/wait.h>
+
+/*
+ * The kcondvar_t struct is protected by mutex taken externally before
+ * calling any of the wait/signal funs, and passed into the wait funs.
+ */
+#define CV_MAGIC 0x346545f4
+#define CV_DESTROY 0x346545f5
+
+typedef struct {
+ int cv_magic;
+ spl_wait_queue_head_t cv_event;
+ spl_wait_queue_head_t cv_destroy;
+ atomic_t cv_refs;
+ atomic_t cv_waiters;
+ kmutex_t *cv_mutex;
+} kcondvar_t;
+
+typedef enum { CV_DEFAULT = 0, CV_DRIVER } kcv_type_t;
+
+extern void __cv_init(kcondvar_t *, char *, kcv_type_t, void *);
+extern void __cv_destroy(kcondvar_t *);
+extern void __cv_wait(kcondvar_t *, kmutex_t *);
+extern void __cv_wait_io(kcondvar_t *, kmutex_t *);
+extern void __cv_wait_sig(kcondvar_t *, kmutex_t *);
+extern clock_t __cv_timedwait(kcondvar_t *, kmutex_t *, clock_t);
+extern clock_t __cv_timedwait_io(kcondvar_t *, kmutex_t *, clock_t);
+extern clock_t __cv_timedwait_sig(kcondvar_t *, kmutex_t *, clock_t);
+extern clock_t cv_timedwait_hires(kcondvar_t *, kmutex_t *, hrtime_t,
+ hrtime_t res, int flag);
+extern clock_t cv_timedwait_sig_hires(kcondvar_t *, kmutex_t *, hrtime_t,
+ hrtime_t res, int flag);
+extern void __cv_signal(kcondvar_t *);
+extern void __cv_broadcast(kcondvar_t *c);
+
+#define cv_init(cvp, name, type, arg) __cv_init(cvp, name, type, arg)
+#define cv_destroy(cvp) __cv_destroy(cvp)
+#define cv_wait(cvp, mp) __cv_wait(cvp, mp)
+#define cv_wait_io(cvp, mp) __cv_wait_io(cvp, mp)
+#define cv_wait_sig(cvp, mp) __cv_wait_sig(cvp, mp)
+#define cv_wait_interruptible(cvp, mp) cv_wait_sig(cvp, mp)
+#define cv_timedwait(cvp, mp, t) __cv_timedwait(cvp, mp, t)
+#define cv_timedwait_io(cvp, mp, t) __cv_timedwait_io(cvp, mp, t)
+#define cv_timedwait_sig(cvp, mp, t) __cv_timedwait_sig(cvp, mp, t)
+#define cv_timedwait_interruptible(cvp, mp, t) cv_timedwait_sig(cvp, mp, t)
+#define cv_signal(cvp) __cv_signal(cvp)
+#define cv_broadcast(cvp) __cv_broadcast(cvp)
+
+#endif /* _SPL_CONDVAR_H */
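
A minimal usage sketch, not part of this patch, showing the Solaris-style wait/signal pattern these wrappers provide. The work_queue_t type and its fields are hypothetical, and the mutex_init()/mutex_enter()/mutex_exit() wrappers are assumed from sys/mutex.h:

#include <sys/mutex.h>
#include <sys/condvar.h>

/* Hypothetical example state, shown only to illustrate the API. */
typedef struct work_queue {
	kmutex_t wq_lock;	/* protects wq_pending and wq_cv */
	kcondvar_t wq_cv;	/* signaled when work arrives */
	int wq_pending;		/* number of queued items */
} work_queue_t;

static void
work_queue_init(work_queue_t *wq)
{
	mutex_init(&wq->wq_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&wq->wq_cv, NULL, CV_DEFAULT, NULL);
	wq->wq_pending = 0;
}

static void
work_queue_put(work_queue_t *wq)
{
	mutex_enter(&wq->wq_lock);
	wq->wq_pending++;
	cv_signal(&wq->wq_cv);		/* wake one waiter */
	mutex_exit(&wq->wq_lock);
}

static void
work_queue_take(work_queue_t *wq)
{
	mutex_enter(&wq->wq_lock);
	while (wq->wq_pending == 0)
		cv_wait(&wq->wq_cv, &wq->wq_lock); /* drops and retakes wq_lock */
	wq->wq_pending--;
	mutex_exit(&wq->wq_lock);
}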
diff --git a/include/spl/sys/console.h b/include/spl/sys/console.h
new file mode 100644
index 000000000..3469cb762
--- /dev/null
+++ b/include/spl/sys/console.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <[email protected]>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _SPL_CONSOLE_H
+#define _SPL_CONSOLE_H
+
+static inline void
+console_vprintf(const char *fmt, va_list args)
+{
+ vprintk(fmt, args);
+}
+
+static inline void
+console_printf(const char *fmt, ...)
+{
+ va_list args;
+
+ va_start(args, fmt);
+ console_vprintf(fmt, args);
+ va_end(args);
+}
+
+#endif /* _SPL_CONSOLE_H */
diff --git a/include/spl/sys/cred.h b/include/spl/sys/cred.h
new file mode 100644
index 000000000..fd063399b
--- /dev/null
+++ b/include/spl/sys/cred.h
@@ -0,0 +1,75 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <[email protected]>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _SPL_CRED_H
+#define _SPL_CRED_H
+
+#include <linux/module.h>
+#include <linux/cred.h>
+#include <sys/types.h>
+#include <sys/vfs.h>
+
+typedef struct cred cred_t;
+
+#define kcred ((cred_t *)(init_task.cred))
+#define CRED() ((cred_t *)current_cred())
+
+/* Linux 4.9 API change, GROUP_AT was removed */
+#ifndef GROUP_AT
+#define GROUP_AT(gi, i) ((gi)->gid[i])
+#endif
+
+#ifdef HAVE_KUIDGID_T
+
+#define KUID_TO_SUID(x) (__kuid_val(x))
+#define KGID_TO_SGID(x) (__kgid_val(x))
+#define SUID_TO_KUID(x) (KUIDT_INIT(x))
+#define SGID_TO_KGID(x) (KGIDT_INIT(x))
+#define KGIDP_TO_SGIDP(x) (&(x)->val)
+
+#else /* HAVE_KUIDGID_T */
+
+#define KUID_TO_SUID(x) (x)
+#define KGID_TO_SGID(x) (x)
+#define SUID_TO_KUID(x) (x)
+#define SGID_TO_KGID(x) (x)
+#define KGIDP_TO_SGIDP(x) (x)
+
+#endif /* HAVE_KUIDGID_T */
+
+extern void crhold(cred_t *cr);
+extern void crfree(cred_t *cr);
+extern uid_t crgetuid(const cred_t *cr);
+extern uid_t crgetruid(const cred_t *cr);
+extern uid_t crgetsuid(const cred_t *cr);
+extern uid_t crgetfsuid(const cred_t *cr);
+extern gid_t crgetgid(const cred_t *cr);
+extern gid_t crgetrgid(const cred_t *cr);
+extern gid_t crgetsgid(const cred_t *cr);
+extern gid_t crgetfsgid(const cred_t *cr);
+extern int crgetngroups(const cred_t *cr);
+extern gid_t *crgetgroups(const cred_t *cr);
+extern int groupmember(gid_t gid, const cred_t *cr);
+
+#endif /* _SPL_CRED_H */
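
As an illustration only, not part of this patch, a hypothetical permission check built on the credential accessors declared above; the group id and the policy are invented for the example:

#include <sys/cred.h>

/* Hypothetical check: allow root, or members of an assumed admin group. */
static int
may_do_admin_op(void)
{
	cred_t *cr = CRED();		/* credentials of the current task */

	if (crgetuid(cr) == 0)		/* effective uid */
		return (1);

	return (groupmember(4, cr));	/* gid 4 is an arbitrary example value */
}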
diff --git a/include/spl/sys/ctype.h b/include/spl/sys/ctype.h
new file mode 100644
index 000000000..18beb1daa
--- /dev/null
+++ b/include/spl/sys/ctype.h
@@ -0,0 +1,30 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <[email protected]>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _SPL_CTYPE_H
+#define _SPL_CTYPE_H
+
+#include <linux/ctype.h>
+
+#endif /* _SPL_CTYPE_H */
diff --git a/include/spl/sys/debug.h b/include/spl/sys/debug.h
new file mode 100644
index 000000000..a4a458066
--- /dev/null
+++ b/include/spl/sys/debug.h
@@ -0,0 +1,131 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <[email protected]>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/*
+ * Available Solaris debug functions. All of the ASSERT() macros will be
+ * compiled out when NDEBUG is defined, which is the default behavior for
+ * the SPL. To enable assertions, use the --enable-debug configure option.
+ * The VERIFY() functions are never compiled out and cannot be disabled.
+ *
+ * PANIC() - Panic the node and print message.
+ * ASSERT() - Assert X is true, if not panic.
+ * ASSERTV() - Wraps a variable declaration which is only used by ASSERT().
+ * ASSERT3B() - Assert boolean X OP Y is true, if not panic.
+ * ASSERT3S() - Assert signed X OP Y is true, if not panic.
+ * ASSERT3U() - Assert unsigned X OP Y is true, if not panic.
+ * ASSERT3P() - Assert pointer X OP Y is true, if not panic.
+ * ASSERT0() - Assert value is zero, if not panic.
+ * VERIFY() - Verify X is true, if not panic.
+ * VERIFY3B() - Verify boolean X OP Y is true, if not panic.
+ * VERIFY3S() - Verify signed X OP Y is true, if not panic.
+ * VERIFY3U() - Verify unsigned X OP Y is true, if not panic.
+ * VERIFY3P() - Verify pointer X OP Y is true, if not panic.
+ * VERIFY0() - Verify value is zero, if not panic.
+ */
+
+#ifndef _SPL_DEBUG_H
+#define _SPL_DEBUG_H
+
+/*
+ * Common DEBUG functionality.
+ */
+int spl_panic(const char *file, const char *func, int line,
+ const char *fmt, ...);
+void spl_dumpstack(void);
+
+/* BEGIN CSTYLED */
+#define PANIC(fmt, a...) \
+ spl_panic(__FILE__, __FUNCTION__, __LINE__, fmt, ## a)
+
+#define VERIFY(cond) \
+ (void) (unlikely(!(cond)) && \
+ spl_panic(__FILE__, __FUNCTION__, __LINE__, \
+ "%s", "VERIFY(" #cond ") failed\n"))
+
+#define VERIFY3_IMPL(LEFT, OP, RIGHT, TYPE, FMT, CAST) do { \
+ TYPE _verify3_left = (TYPE)(LEFT); \
+ TYPE _verify3_right = (TYPE)(RIGHT); \
+ if (!(_verify3_left OP _verify3_right)) \
+ spl_panic(__FILE__, __FUNCTION__, __LINE__, \
+ "VERIFY3(" #LEFT " " #OP " " #RIGHT ") " \
+ "failed (" FMT " " #OP " " FMT ")\n", \
+ CAST (_verify3_left), CAST (_verify3_right)); \
+ } while (0)
+
+#define VERIFY3B(x,y,z) VERIFY3_IMPL(x, y, z, boolean_t, "%d", (boolean_t))
+#define VERIFY3S(x,y,z) VERIFY3_IMPL(x, y, z, int64_t, "%lld", (long long))
+#define VERIFY3U(x,y,z) VERIFY3_IMPL(x, y, z, uint64_t, "%llu", \
+ (unsigned long long))
+#define VERIFY3P(x,y,z) VERIFY3_IMPL(x, y, z, uintptr_t, "%p", (void *))
+#define VERIFY0(x) VERIFY3_IMPL(0, ==, x, int64_t, "%lld", (long long))
+
+#define CTASSERT_GLOBAL(x) _CTASSERT(x, __LINE__)
+#define CTASSERT(x) { _CTASSERT(x, __LINE__); }
+#define _CTASSERT(x, y) __CTASSERT(x, y)
+#define __CTASSERT(x, y) \
+ typedef char __attribute__ ((unused)) \
+ __compile_time_assertion__ ## y[(x) ? 1 : -1]
+
+/*
+ * Debugging disabled (--disable-debug)
+ */
+#ifdef NDEBUG
+
+#define SPL_DEBUG_STR ""
+#define ASSERT(x) ((void)0)
+#define ASSERTV(x)
+#define ASSERT3B(x,y,z) ((void)0)
+#define ASSERT3S(x,y,z) ((void)0)
+#define ASSERT3U(x,y,z) ((void)0)
+#define ASSERT3P(x,y,z) ((void)0)
+#define ASSERT0(x) ((void)0)
+#define IMPLY(A, B) ((void)0)
+#define EQUIV(A, B) ((void)0)
+
+/*
+ * Debugging enabled (--enable-debug)
+ */
+#else
+
+#define SPL_DEBUG_STR " (DEBUG mode)"
+#define ASSERT(cond) VERIFY(cond)
+#define ASSERTV(x) x
+#define ASSERT3B(x,y,z) VERIFY3B(x, y, z)
+#define ASSERT3S(x,y,z) VERIFY3S(x, y, z)
+#define ASSERT3U(x,y,z) VERIFY3U(x, y, z)
+#define ASSERT3P(x,y,z) VERIFY3P(x, y, z)
+#define ASSERT0(x) VERIFY0(x)
+#define IMPLY(A, B) \
+ ((void)(((!(A)) || (B)) || \
+ spl_panic(__FILE__, __FUNCTION__, __LINE__, \
+ "(" #A ") implies (" #B ")")))
+#define EQUIV(A, B) \
+ ((void)((!!(A) == !!(B)) || \
+ spl_panic(__FILE__, __FUNCTION__, __LINE__, \
+ "(" #A ") is equivalent to (" #B ")")))
+/* END CSTYLED */
+
+#endif /* NDEBUG */
+
+#endif /* _SPL_DEBUG_H */
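
A short sketch, not part of this patch, of how the two macro families above differ in practice: ASSERT*() disappears in non-debug builds while VERIFY*() and CTASSERT*() are always enforced. The copy_record() helper is hypothetical:

#include <sys/types.h>
#include <sys/debug.h>
#include <linux/string.h>

/* The build fails if this is ever false, regardless of debug settings. */
CTASSERT_GLOBAL(sizeof (uint64_t) == 8);

/* Hypothetical helper, shown only to demonstrate the macros. */
static void
copy_record(char *dst, size_t dst_size, const char *src, size_t src_len)
{
	ASSERT3P(src, !=, NULL);		/* compiled out when NDEBUG is defined */
	VERIFY3U(src_len, <=, dst_size);	/* always checked, panics on failure */
	memcpy(dst, src, src_len);
}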
diff --git a/include/spl/sys/disp.h b/include/spl/sys/disp.h
new file mode 100644
index 000000000..413b623c8
--- /dev/null
+++ b/include/spl/sys/disp.h
@@ -0,0 +1,34 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <[email protected]>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _SPL_DISP_H
+#define _SPL_DISP_H
+
+#include <linux/preempt.h>
+
+#define kpreempt(unused) schedule()
+#define kpreempt_disable() preempt_disable()
+#define kpreempt_enable() preempt_enable()
+
+#endif /* _SPL_DISP_H */
diff --git a/include/spl/sys/dkio.h b/include/spl/sys/dkio.h
new file mode 100644
index 000000000..49f166a9c
--- /dev/null
+++ b/include/spl/sys/dkio.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <[email protected]>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _SPL_DKIO_H
+#define _SPL_DKIO_H
+
+#define DFL_SZ(num_exts) \
+ (sizeof (dkioc_free_list_t) + (num_exts - 1) * 16)
+
+#define DKIOC (0x04 << 8)
+#define DKIOCFLUSHWRITECACHE (DKIOC|34) /* flush cache to phys medium */
+
+/*
+ * ioctl to free space (e.g. SCSI UNMAP) off a disk.
+ * Pass a dkioc_free_list_t containing a list of extents to be freed.
+ */
+#define DKIOCFREE (DKIOC|50)
+
+#endif /* _SPL_DKIO_H */
diff --git a/include/spl/sys/dkioc_free_util.h b/include/spl/sys/dkioc_free_util.h
new file mode 100644
index 000000000..d519b2f8e
--- /dev/null
+++ b/include/spl/sys/dkioc_free_util.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <[email protected]>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _SPL_DKIOC_UTIL_H
+#define _SPL_DKIOC_UTIL_H
+
+#include <sys/dkio.h>
+
+typedef struct dkioc_free_list_ext_s {
+ uint64_t dfle_start;
+ uint64_t dfle_length;
+} dkioc_free_list_ext_t;
+
+typedef struct dkioc_free_list_s {
+ uint64_t dfl_flags;
+ uint64_t dfl_num_exts;
+ int64_t dfl_offset;
+
+ /*
+ * N.B. this is only an internal debugging API! This is only called
+ * from debug builds of sd for pre-release checking. Remove before GA!
+ */
+ void (*dfl_ck_func)(uint64_t, uint64_t, void *);
+ void *dfl_ck_arg;
+
+ dkioc_free_list_ext_t dfl_exts[1];
+} dkioc_free_list_t;
+
+static inline void dfl_free(dkioc_free_list_t *dfl) {
+ vmem_free(dfl, DFL_SZ(dfl->dfl_num_exts));
+}
+
+static inline dkioc_free_list_t *dfl_alloc(uint64_t dfl_num_exts, int flags) {
+ return (vmem_zalloc(DFL_SZ(dfl_num_exts), flags));
+}
+
+#endif /* _SPL_DKIOC_UTIL_H */
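
For reference, a hypothetical sketch (not part of this patch) of building a free list with dfl_alloc(); DFL_SZ() sizes the allocation for the trailing dfl_exts[] array, and KM_SLEEP/vmem support are assumed from sys/kmem.h and sys/vmem.h:

#include <sys/kmem.h>
#include <sys/vmem.h>
#include <sys/dkioc_free_util.h>

/* Hypothetical helper that describes two 1 MiB extents to be freed. */
static dkioc_free_list_t *
build_discard_list(void)
{
	dkioc_free_list_t *dfl;

	dfl = dfl_alloc(2, KM_SLEEP);	/* zeroed, sized by DFL_SZ(2) */

	dfl->dfl_num_exts = 2;
	dfl->dfl_exts[0].dfle_start = 0;
	dfl->dfl_exts[0].dfle_length = 1024 * 1024;
	dfl->dfl_exts[1].dfle_start = 8 * 1024 * 1024;
	dfl->dfl_exts[1].dfle_length = 1024 * 1024;

	return (dfl);	/* the caller releases it with dfl_free(dfl) */
}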
diff --git a/include/spl/sys/fcntl.h b/include/spl/sys/fcntl.h
new file mode 100644
index 000000000..3faa5dad7
--- /dev/null
+++ b/include/spl/sys/fcntl.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (C) 2010 Lawrence Livermore National Security, LLC.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <[email protected]>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _SPL_FCNTL_H
+#define _SPL_FCNTL_H
+
+#include <asm/fcntl.h>
+
+#define F_FREESP 11
+
+#ifdef CONFIG_64BIT
+typedef struct flock flock64_t;
+#else
+typedef struct flock64 flock64_t;
+#endif /* CONFIG_64BIT */
+
+#endif /* _SPL_FCNTL_H */
diff --git a/include/spl/sys/file.h b/include/spl/sys/file.h
new file mode 100644
index 000000000..05dbc0814
--- /dev/null
+++ b/include/spl/sys/file.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <[email protected]>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _SPL_FILE_H
+#define _SPL_FILE_H
+
+#define FIGNORECASE 0x00080000
+#define FKIOCTL 0x80000000
+#define ED_CASE_CONFLICT 0x10
+
+#ifdef HAVE_INODE_LOCK_SHARED
+#define spl_inode_lock(ip) inode_lock(ip)
+#define spl_inode_unlock(ip) inode_unlock(ip)
+#define spl_inode_lock_shared(ip) inode_lock_shared(ip)
+#define spl_inode_unlock_shared(ip) inode_unlock_shared(ip)
+#define spl_inode_trylock(ip) inode_trylock(ip)
+#define spl_inode_trylock_shared(ip) inode_trylock_shared(ip)
+#define spl_inode_is_locked(ip) inode_is_locked(ip)
+#define spl_inode_lock_nested(ip, s) inode_lock_nested(ip, s)
+#else
+#define spl_inode_lock(ip) mutex_lock(&(ip)->i_mutex)
+#define spl_inode_unlock(ip) mutex_unlock(&(ip)->i_mutex)
+#define spl_inode_lock_shared(ip) mutex_lock(&(ip)->i_mutex)
+#define spl_inode_unlock_shared(ip) mutex_unlock(&(ip)->i_mutex)
+#define spl_inode_trylock(ip) mutex_trylock(&(ip)->i_mutex)
+#define spl_inode_trylock_shared(ip) mutex_trylock(&(ip)->i_mutex)
+#define spl_inode_is_locked(ip) mutex_is_locked(&(ip)->i_mutex)
+#define spl_inode_lock_nested(ip, s) mutex_lock_nested(&(ip)->i_mutex, s)
+#endif
+
+#endif /* _SPL_FILE_H */
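
A minimal sketch, not part of this patch, of the spl_inode_lock() wrappers above; set_inode_size() is a hypothetical caller and i_size_write() comes from linux/fs.h:

#include <linux/fs.h>
#include <sys/file.h>

/* Hypothetical update that must hold the inode lock exclusively. */
static void
set_inode_size(struct inode *ip, loff_t size)
{
	spl_inode_lock(ip);	/* inode_lock() or i_mutex, depending on kernel */
	i_size_write(ip, size);
	spl_inode_unlock(ip);
}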
diff --git a/include/spl/sys/inttypes.h b/include/spl/sys/inttypes.h
new file mode 100644
index 000000000..92e76206b
--- /dev/null
+++ b/include/spl/sys/inttypes.h
@@ -0,0 +1,28 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <[email protected]>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _SPL_INTTYPES_H
+#define _SPL_INTTYPES_H
+
+#endif /* _SPL_INTTYPES_H */
diff --git a/include/spl/sys/isa_defs.h b/include/spl/sys/isa_defs.h
new file mode 100644
index 000000000..089ae0f85
--- /dev/null
+++ b/include/spl/sys/isa_defs.h
@@ -0,0 +1,229 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <[email protected]>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _SPL_ISA_DEFS_H
+#define _SPL_ISA_DEFS_H
+
+/* x86_64 arch specific defines */
+#if defined(__x86_64) || defined(__x86_64__)
+
+#if !defined(__x86_64)
+#define __x86_64
+#endif
+
+#if !defined(__amd64)
+#define __amd64
+#endif
+
+#if !defined(__x86)
+#define __x86
+#endif
+
+#if !defined(_LP64)
+#define _LP64
+#endif
+
+#define _ALIGNMENT_REQUIRED 1
+
+
+/* i386 arch specific defines */
+#elif defined(__i386) || defined(__i386__)
+
+#if !defined(__i386)
+#define __i386
+#endif
+
+#if !defined(__x86)
+#define __x86
+#endif
+
+#if !defined(_ILP32)
+#define _ILP32
+#endif
+
+#define _ALIGNMENT_REQUIRED 0
+
+/* powerpc (ppc64) arch specific defines */
+#elif defined(__powerpc) || defined(__powerpc__) || defined(__powerpc64__)
+
+#if !defined(__powerpc)
+#define __powerpc
+#endif
+
+#if !defined(__powerpc__)
+#define __powerpc__
+#endif
+
+#if defined(__powerpc64__)
+#if !defined(_LP64)
+#define _LP64
+#endif
+#else
+#if !defined(_ILP32)
+#define _ILP32
+#endif
+#endif
+
+/*
+ * Illumos doesn't define _ALIGNMENT_REQUIRED for PPC, so default to 1
+ * out of paranoia.
+ */
+#define _ALIGNMENT_REQUIRED 1
+
+/* arm arch specific defines */
+#elif defined(__arm) || defined(__arm__) || defined(__aarch64__)
+
+#if !defined(__arm)
+#define __arm
+#endif
+
+#if !defined(__arm__)
+#define __arm__
+#endif
+
+#if defined(__aarch64__)
+#if !defined(_LP64)
+#define _LP64
+#endif
+#else
+#if !defined(_ILP32)
+#define _ILP32
+#endif
+#endif
+
+#if defined(__ARMEL__) || defined(__AARCH64EL__)
+#define _LITTLE_ENDIAN
+#else
+#define _BIG_ENDIAN
+#endif
+
+/*
+ * Illumos doesn't define _ALIGNMENT_REQUIRED for ARM, so default to 1
+ * out of paranoia.
+ */
+#define _ALIGNMENT_REQUIRED 1
+
+/* sparc arch specific defines */
+#elif defined(__sparc) || defined(__sparc__)
+
+#if !defined(__sparc)
+#define __sparc
+#endif
+
+#if !defined(__sparc__)
+#define __sparc__
+#endif
+
+#if defined(__arch64__)
+#if !defined(_LP64)
+#define _LP64
+#endif
+#else
+#if !defined(_ILP32)
+#define _ILP32
+#endif
+#endif
+
+#define _BIG_ENDIAN
+#define _SUNOS_VTOC_16
+#define _ALIGNMENT_REQUIRED 1
+
+/* s390 arch specific defines */
+#elif defined(__s390__)
+#if defined(__s390x__)
+#if !defined(_LP64)
+#define _LP64
+#endif
+#else
+#if !defined(_ILP32)
+#define _ILP32
+#endif
+#endif
+
+#define _BIG_ENDIAN
+
+/*
+ * Illumos doesn't define _ALIGNMENT_REQUIRED for s390, so default to 1
+ * out of paranoia.
+ */
+#define _ALIGNMENT_REQUIRED 1
+
+/* MIPS arch specific defines */
+#elif defined(__mips__)
+
+#if defined(__MIPSEB__)
+#define _BIG_ENDIAN
+#elif defined(__MIPSEL__)
+#define _LITTLE_ENDIAN
+#else
+#error MIPS no endian specified
+#endif
+
+#ifndef _LP64
+#define _ILP32
+#endif
+
+#define _SUNOS_VTOC_16
+
+/*
+ * Illumos doesn't define _ALIGNMENT_REQUIRED for MIPS, so default to 1
+ * out of paranoia.
+ */
+#define _ALIGNMENT_REQUIRED 1
+
+#else
+/*
+ * Currently supported:
+ * x86_64, i386, arm, powerpc, s390, sparc, and mips
+ */
+#error "Unsupported ISA type"
+#endif
+
+#if defined(_ILP32) && defined(_LP64)
+#error "Both _ILP32 and _LP64 are defined"
+#endif
+
+#if !defined(_ILP32) && !defined(_LP64)
+#error "Neither _ILP32 or _LP64 are defined"
+#endif
+
+#include <sys/byteorder.h>
+
+#if defined(__LITTLE_ENDIAN) && !defined(_LITTLE_ENDIAN)
+#define _LITTLE_ENDIAN __LITTLE_ENDIAN
+#endif
+
+#if defined(__BIG_ENDIAN) && !defined(_BIG_ENDIAN)
+#define _BIG_ENDIAN __BIG_ENDIAN
+#endif
+
+#if defined(_LITTLE_ENDIAN) && defined(_BIG_ENDIAN)
+#error "Both _LITTLE_ENDIAN and _BIG_ENDIAN are defined"
+#endif
+
+#if !defined(_LITTLE_ENDIAN) && !defined(_BIG_ENDIAN)
+#error "Neither _LITTLE_ENDIAN or _BIG_ENDIAN are defined"
+#endif
+
+#endif /* _SPL_ISA_DEFS_H */
diff --git a/include/spl/sys/kmem.h b/include/spl/sys/kmem.h
new file mode 100644
index 000000000..d6b428551
--- /dev/null
+++ b/include/spl/sys/kmem.h
@@ -0,0 +1,185 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <[email protected]>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _SPL_KMEM_H
+#define _SPL_KMEM_H
+
+#include <sys/debug.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+
+extern int kmem_debugging(void);
+extern char *kmem_vasprintf(const char *fmt, va_list ap);
+extern char *kmem_asprintf(const char *fmt, ...);
+extern char *strdup(const char *str);
+extern void strfree(char *str);
+
+/*
+ * Memory allocation interfaces
+ */
+#define KM_SLEEP 0x0000 /* can block for memory; success guaranteed */
+#define KM_NOSLEEP 0x0001 /* cannot block for memory; may fail */
+#define KM_PUSHPAGE 0x0004 /* can block for memory; may use reserve */
+#define KM_ZERO 0x1000 /* zero the allocation */
+#define KM_VMEM 0x2000 /* caller is vmem_* wrapper */
+
+#define KM_PUBLIC_MASK (KM_SLEEP | KM_NOSLEEP | KM_PUSHPAGE)
+
+static inline int spl_fstrans_check(void);
+
+/*
+ * Convert a KM_* flags mask to its Linux GFP_* counterpart. The conversion
+ * function is context aware, which means that KM_SLEEP allocations can be
+ * safely used in syncing contexts that have set PF_FSTRANS.
+ */
+static inline gfp_t
+kmem_flags_convert(int flags)
+{
+ gfp_t lflags = __GFP_NOWARN | __GFP_COMP;
+
+ if (flags & KM_NOSLEEP) {
+ lflags |= GFP_ATOMIC | __GFP_NORETRY;
+ } else {
+ lflags |= GFP_KERNEL;
+ if (spl_fstrans_check())
+ lflags &= ~(__GFP_IO|__GFP_FS);
+ }
+
+ if (flags & KM_PUSHPAGE)
+ lflags |= __GFP_HIGH;
+
+ if (flags & KM_ZERO)
+ lflags |= __GFP_ZERO;
+
+ return (lflags);
+}
+
+typedef struct {
+ struct task_struct *fstrans_thread;
+ unsigned int saved_flags;
+} fstrans_cookie_t;
+
+/*
+ * PF_MEMALLOC_NOIO was introduced in Linux 3.9; however, it cannot be relied
+ * on by itself before Linux 3.18 because it does not clear __GFP_FS as it
+ * should.
+ */
+#ifdef PF_MEMALLOC_NOIO
+#define __SPL_PF_MEMALLOC_NOIO (PF_MEMALLOC_NOIO)
+#else
+#define __SPL_PF_MEMALLOC_NOIO (0)
+#endif
+
+/*
+ * PF_FSTRANS is removed from Linux 4.12
+ */
+#ifdef PF_FSTRANS
+#define __SPL_PF_FSTRANS (PF_FSTRANS)
+#else
+#define __SPL_PF_FSTRANS (0)
+#endif
+
+#define SPL_FSTRANS (__SPL_PF_FSTRANS|__SPL_PF_MEMALLOC_NOIO)
+
+static inline fstrans_cookie_t
+spl_fstrans_mark(void)
+{
+ fstrans_cookie_t cookie;
+
+ BUILD_BUG_ON(SPL_FSTRANS == 0);
+
+ cookie.fstrans_thread = current;
+ cookie.saved_flags = current->flags & SPL_FSTRANS;
+ current->flags |= SPL_FSTRANS;
+
+ return (cookie);
+}
+
+static inline void
+spl_fstrans_unmark(fstrans_cookie_t cookie)
+{
+ ASSERT3P(cookie.fstrans_thread, ==, current);
+ ASSERT((current->flags & SPL_FSTRANS) == SPL_FSTRANS);
+
+ current->flags &= ~SPL_FSTRANS;
+ current->flags |= cookie.saved_flags;
+}
+
+static inline int
+spl_fstrans_check(void)
+{
+ return (current->flags & SPL_FSTRANS);
+}
+
+/*
+ * Used specifically to check the PF_FSTRANS flag; it cannot be relied on
+ * to detect whether spl_fstrans_mark() has been called.
+ */
+static inline int
+__spl_pf_fstrans_check(void)
+{
+ return (current->flags & __SPL_PF_FSTRANS);
+}
+
+#ifdef HAVE_ATOMIC64_T
+#define kmem_alloc_used_add(size) atomic64_add(size, &kmem_alloc_used)
+#define kmem_alloc_used_sub(size) atomic64_sub(size, &kmem_alloc_used)
+#define kmem_alloc_used_read() atomic64_read(&kmem_alloc_used)
+#define kmem_alloc_used_set(size) atomic64_set(&kmem_alloc_used, size)
+extern atomic64_t kmem_alloc_used;
+extern unsigned long long kmem_alloc_max;
+#else /* HAVE_ATOMIC64_T */
+#define kmem_alloc_used_add(size) atomic_add(size, &kmem_alloc_used)
+#define kmem_alloc_used_sub(size) atomic_sub(size, &kmem_alloc_used)
+#define kmem_alloc_used_read() atomic_read(&kmem_alloc_used)
+#define kmem_alloc_used_set(size) atomic_set(&kmem_alloc_used, size)
+extern atomic_t kmem_alloc_used;
+extern unsigned long long kmem_alloc_max;
+#endif /* HAVE_ATOMIC64_T */
+
+extern unsigned int spl_kmem_alloc_warn;
+extern unsigned int spl_kmem_alloc_max;
+
+#define kmem_alloc(sz, fl) spl_kmem_alloc((sz), (fl), __func__, __LINE__)
+#define kmem_zalloc(sz, fl) spl_kmem_zalloc((sz), (fl), __func__, __LINE__)
+#define kmem_free(ptr, sz) spl_kmem_free((ptr), (sz))
+
+extern void *spl_kmem_alloc(size_t sz, int fl, const char *func, int line);
+extern void *spl_kmem_zalloc(size_t sz, int fl, const char *func, int line);
+extern void spl_kmem_free(const void *ptr, size_t sz);
+
+/*
+ * The following functions are only available for internal use.
+ */
+extern void *spl_kmem_alloc_impl(size_t size, int flags, int node);
+extern void *spl_kmem_alloc_debug(size_t size, int flags, int node);
+extern void *spl_kmem_alloc_track(size_t size, int flags,
+ const char *func, int line, int node);
+extern void spl_kmem_free_impl(const void *buf, size_t size);
+extern void spl_kmem_free_debug(const void *buf, size_t size);
+extern void spl_kmem_free_track(const void *buf, size_t size);
+
+extern int spl_kmem_init(void);
+extern void spl_kmem_fini(void);
+
+#endif /* _SPL_KMEM_H */
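
An illustrative sketch, not part of this patch, combining the KM_* allocation interface with the fstrans marking helpers above; stash_copy() is a hypothetical caller running in a syncing (writeback) context:

#include <sys/kmem.h>
#include <linux/string.h>

/* Hypothetical helper allocating from a syncing context. */
static char *
stash_copy(const char *src, size_t len)
{
	fstrans_cookie_t cookie;
	char *buf;

	/*
	 * Mark the task so the KM_SLEEP allocation below drops
	 * __GFP_IO/__GFP_FS and cannot recurse into the filesystem.
	 */
	cookie = spl_fstrans_mark();
	buf = kmem_zalloc(len, KM_SLEEP);	/* zeroed; may block, success guaranteed */
	memcpy(buf, src, len);
	spl_fstrans_unmark(cookie);

	return (buf);		/* released later with kmem_free(buf, len) */
}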
diff --git a/include/spl/sys/kmem_cache.h b/include/spl/sys/kmem_cache.h
new file mode 100644
index 000000000..8fa14f67e
--- /dev/null
+++ b/include/spl/sys/kmem_cache.h
@@ -0,0 +1,240 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <[email protected]>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _SPL_KMEM_CACHE_H
+#define _SPL_KMEM_CACHE_H
+
+#include <sys/taskq.h>
+
+/*
+ * Slab allocation interfaces. The SPL slab differs from the standard
+ * Linux SLAB or SLUB primarily in that each cache may be backed by slabs
+ * allocated from the physical or virtual memory address space. The virtual
+ * slabs allow for good behavior when allocating large objects of identical
+ * size. This slab implementation also supports both constructors and
+ * destructors, which the Linux slab does not.
+ */
+enum {
+ KMC_BIT_NOTOUCH = 0, /* Don't update ages */
+ KMC_BIT_NODEBUG = 1, /* Default behavior */
+ KMC_BIT_NOMAGAZINE = 2, /* XXX: Unsupported */
+ KMC_BIT_NOHASH = 3, /* XXX: Unsupported */
+ KMC_BIT_QCACHE = 4, /* XXX: Unsupported */
+ KMC_BIT_KMEM = 5, /* Use kmem cache */
+ KMC_BIT_VMEM = 6, /* Use vmem cache */
+ KMC_BIT_SLAB = 7, /* Use Linux slab cache */
+ KMC_BIT_OFFSLAB = 8, /* Objects not on slab */
+ KMC_BIT_NOEMERGENCY = 9, /* Disable emergency objects */
+ KMC_BIT_DEADLOCKED = 14, /* Deadlock detected */
+ KMC_BIT_GROWING = 15, /* Growing in progress */
+ KMC_BIT_REAPING = 16, /* Reaping in progress */
+ KMC_BIT_DESTROY = 17, /* Destroy in progress */
+ KMC_BIT_TOTAL = 18, /* Proc handler helper bit */
+ KMC_BIT_ALLOC = 19, /* Proc handler helper bit */
+ KMC_BIT_MAX = 20, /* Proc handler helper bit */
+};
+
+/* kmem move callback return values */
+typedef enum kmem_cbrc {
+ KMEM_CBRC_YES = 0, /* Object moved */
+ KMEM_CBRC_NO = 1, /* Object not moved */
+ KMEM_CBRC_LATER = 2, /* Object not moved, try again later */
+ KMEM_CBRC_DONT_NEED = 3, /* Neither object is needed */
+ KMEM_CBRC_DONT_KNOW = 4, /* Object unknown */
+} kmem_cbrc_t;
+
+#define KMC_NOTOUCH (1 << KMC_BIT_NOTOUCH)
+#define KMC_NODEBUG (1 << KMC_BIT_NODEBUG)
+#define KMC_NOMAGAZINE (1 << KMC_BIT_NOMAGAZINE)
+#define KMC_NOHASH (1 << KMC_BIT_NOHASH)
+#define KMC_QCACHE (1 << KMC_BIT_QCACHE)
+#define KMC_KMEM (1 << KMC_BIT_KMEM)
+#define KMC_VMEM (1 << KMC_BIT_VMEM)
+#define KMC_SLAB (1 << KMC_BIT_SLAB)
+#define KMC_OFFSLAB (1 << KMC_BIT_OFFSLAB)
+#define KMC_NOEMERGENCY (1 << KMC_BIT_NOEMERGENCY)
+#define KMC_DEADLOCKED (1 << KMC_BIT_DEADLOCKED)
+#define KMC_GROWING (1 << KMC_BIT_GROWING)
+#define KMC_REAPING (1 << KMC_BIT_REAPING)
+#define KMC_DESTROY (1 << KMC_BIT_DESTROY)
+#define KMC_TOTAL (1 << KMC_BIT_TOTAL)
+#define KMC_ALLOC (1 << KMC_BIT_ALLOC)
+#define KMC_MAX (1 << KMC_BIT_MAX)
+
+#define KMC_REAP_CHUNK INT_MAX
+#define KMC_DEFAULT_SEEKS 1
+
+#define KMC_EXPIRE_AGE 0x1 /* Due to age */
+#define KMC_EXPIRE_MEM 0x2 /* Due to low memory */
+
+#define KMC_RECLAIM_ONCE 0x1 /* Force a single shrinker pass */
+
+extern unsigned int spl_kmem_cache_expire;
+extern struct list_head spl_kmem_cache_list;
+extern struct rw_semaphore spl_kmem_cache_sem;
+
+#define SKM_MAGIC 0x2e2e2e2e
+#define SKO_MAGIC 0x20202020
+#define SKS_MAGIC 0x22222222
+#define SKC_MAGIC 0x2c2c2c2c
+
+#define SPL_KMEM_CACHE_DELAY 15 /* Minimum slab release age */
+#define SPL_KMEM_CACHE_REAP 0 /* Default reap everything */
+#define SPL_KMEM_CACHE_OBJ_PER_SLAB 8 /* Target objects per slab */
+#define SPL_KMEM_CACHE_OBJ_PER_SLAB_MIN 1 /* Minimum objects per slab */
+#define SPL_KMEM_CACHE_ALIGN 8 /* Default object alignment */
+#ifdef _LP64
+#define SPL_KMEM_CACHE_MAX_SIZE 32 /* Max slab size in MB */
+#else
+#define SPL_KMEM_CACHE_MAX_SIZE 4 /* Max slab size in MB */
+#endif
+
+#define SPL_MAX_ORDER (MAX_ORDER - 3)
+#define SPL_MAX_ORDER_NR_PAGES (1 << (SPL_MAX_ORDER - 1))
+
+#ifdef CONFIG_SLUB
+#define SPL_MAX_KMEM_CACHE_ORDER PAGE_ALLOC_COSTLY_ORDER
+#define SPL_MAX_KMEM_ORDER_NR_PAGES (1 << (SPL_MAX_KMEM_CACHE_ORDER - 1))
+#else
+#define SPL_MAX_KMEM_ORDER_NR_PAGES (KMALLOC_MAX_SIZE >> PAGE_SHIFT)
+#endif
+
+#define POINTER_IS_VALID(p) 0 /* Unimplemented */
+#define POINTER_INVALIDATE(pp) /* Unimplemented */
+
+typedef int (*spl_kmem_ctor_t)(void *, void *, int);
+typedef void (*spl_kmem_dtor_t)(void *, void *);
+typedef void (*spl_kmem_reclaim_t)(void *);
+
+typedef struct spl_kmem_magazine {
+ uint32_t skm_magic; /* Sanity magic */
+ uint32_t skm_avail; /* Available objects */
+ uint32_t skm_size; /* Magazine size */
+ uint32_t skm_refill; /* Batch refill size */
+ struct spl_kmem_cache *skm_cache; /* Owned by cache */
+ unsigned long skm_age; /* Last cache access */
+ unsigned int skm_cpu; /* Owned by cpu */
+ void *skm_objs[0]; /* Object pointers */
+} spl_kmem_magazine_t;
+
+typedef struct spl_kmem_obj {
+ uint32_t sko_magic; /* Sanity magic */
+ void *sko_addr; /* Buffer address */
+ struct spl_kmem_slab *sko_slab; /* Owned by slab */
+ struct list_head sko_list; /* Free object list linkage */
+} spl_kmem_obj_t;
+
+typedef struct spl_kmem_slab {
+ uint32_t sks_magic; /* Sanity magic */
+ uint32_t sks_objs; /* Objects per slab */
+ struct spl_kmem_cache *sks_cache; /* Owned by cache */
+ struct list_head sks_list; /* Slab list linkage */
+ struct list_head sks_free_list; /* Free object list */
+ unsigned long sks_age; /* Last modify jiffie */
+ uint32_t sks_ref; /* Ref count used objects */
+} spl_kmem_slab_t;
+
+typedef struct spl_kmem_alloc {
+ struct spl_kmem_cache *ska_cache; /* Owned by cache */
+ int ska_flags; /* Allocation flags */
+ taskq_ent_t ska_tqe; /* Task queue entry */
+} spl_kmem_alloc_t;
+
+typedef struct spl_kmem_emergency {
+ struct rb_node ske_node; /* Emergency tree linkage */
+ unsigned long ske_obj; /* Buffer address */
+} spl_kmem_emergency_t;
+
+typedef struct spl_kmem_cache {
+ uint32_t skc_magic; /* Sanity magic */
+ uint32_t skc_name_size; /* Name length */
+ char *skc_name; /* Name string */
+ spl_kmem_magazine_t **skc_mag; /* Per-CPU warm cache */
+ uint32_t skc_mag_size; /* Magazine size */
+ uint32_t skc_mag_refill; /* Magazine refill count */
+ spl_kmem_ctor_t skc_ctor; /* Constructor */
+ spl_kmem_dtor_t skc_dtor; /* Destructor */
+ spl_kmem_reclaim_t skc_reclaim; /* Reclaimator */
+ void *skc_private; /* Private data */
+ void *skc_vmp; /* Unused */
+ struct kmem_cache *skc_linux_cache; /* Linux slab cache if used */
+ unsigned long skc_flags; /* Flags */
+ uint32_t skc_obj_size; /* Object size */
+ uint32_t skc_obj_align; /* Object alignment */
+ uint32_t skc_slab_objs; /* Objects per slab */
+ uint32_t skc_slab_size; /* Slab size */
+ uint32_t skc_delay; /* Slab reclaim interval */
+ uint32_t skc_reap; /* Slab reclaim count */
+ atomic_t skc_ref; /* Ref count callers */
+ taskqid_t skc_taskqid; /* Slab reclaim task */
+ struct list_head skc_list; /* List of caches linkage */
+ struct list_head skc_complete_list; /* Completely alloc'ed */
+ struct list_head skc_partial_list; /* Partially alloc'ed */
+ struct rb_root skc_emergency_tree; /* Min sized objects */
+ spinlock_t skc_lock; /* Cache lock */
+ spl_wait_queue_head_t skc_waitq; /* Allocation waiters */
+ uint64_t skc_slab_fail; /* Slab alloc failures */
+ uint64_t skc_slab_create; /* Slab creates */
+ uint64_t skc_slab_destroy; /* Slab destroys */
+ uint64_t skc_slab_total; /* Slab total current */
+ uint64_t skc_slab_alloc; /* Slab alloc current */
+ uint64_t skc_slab_max; /* Slab max historic */
+ uint64_t skc_obj_total; /* Obj total current */
+ uint64_t skc_obj_alloc; /* Obj alloc current */
+ uint64_t skc_obj_max; /* Obj max historic */
+ uint64_t skc_obj_deadlock; /* Obj emergency deadlocks */
+ uint64_t skc_obj_emergency; /* Obj emergency current */
+ uint64_t skc_obj_emergency_max; /* Obj emergency max */
+} spl_kmem_cache_t;
+#define kmem_cache_t spl_kmem_cache_t
+
+extern spl_kmem_cache_t *spl_kmem_cache_create(char *name, size_t size,
+ size_t align, spl_kmem_ctor_t ctor, spl_kmem_dtor_t dtor,
+ spl_kmem_reclaim_t reclaim, void *priv, void *vmp, int flags);
+extern void spl_kmem_cache_set_move(spl_kmem_cache_t *,
+ kmem_cbrc_t (*)(void *, void *, size_t, void *));
+extern void spl_kmem_cache_destroy(spl_kmem_cache_t *skc);
+extern void *spl_kmem_cache_alloc(spl_kmem_cache_t *skc, int flags);
+extern void spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj);
+extern void spl_kmem_cache_set_allocflags(spl_kmem_cache_t *skc, gfp_t flags);
+extern void spl_kmem_cache_reap_now(spl_kmem_cache_t *skc, int count);
+extern void spl_kmem_reap(void);
+
+#define kmem_cache_create(name, size, align, ctor, dtor, rclm, priv, vmp, fl) \
+ spl_kmem_cache_create(name, size, align, ctor, dtor, rclm, priv, vmp, fl)
+#define kmem_cache_set_move(skc, move) spl_kmem_cache_set_move(skc, move)
+#define kmem_cache_destroy(skc) spl_kmem_cache_destroy(skc)
+#define kmem_cache_alloc(skc, flags) spl_kmem_cache_alloc(skc, flags)
+#define kmem_cache_free(skc, obj) spl_kmem_cache_free(skc, obj)
+#define kmem_cache_reap_now(skc) \
+ spl_kmem_cache_reap_now(skc, skc->skc_reap)
+#define kmem_reap() spl_kmem_reap()
+
+/*
+ * The following functions are only available for internal use.
+ */
+extern int spl_kmem_cache_init(void);
+extern void spl_kmem_cache_fini(void);
+
+#endif /* _SPL_KMEM_CACHE_H */
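
A minimal sketch, not part of this patch, of a cache with a constructor and destructor; foo_t, foo_ctor() and foo_dtor() are hypothetical and exist only to show the call sequence:

#include <sys/kmem.h>
#include <sys/kmem_cache.h>

/* Hypothetical cached object type. */
typedef struct foo {
	int foo_state;
} foo_t;

static int
foo_ctor(void *buf, void *priv, int kmflags)
{
	foo_t *fp = buf;

	fp->foo_state = 0;	/* runs once when a backing object is created */
	return (0);
}

static void
foo_dtor(void *buf, void *priv)
{
	/* Runs once when the backing object is released. */
}

static void
foo_cache_example(void)
{
	spl_kmem_cache_t *cache;
	foo_t *fp;

	cache = kmem_cache_create((char *)"foo_cache", sizeof (foo_t), 0,
	    foo_ctor, foo_dtor, NULL, NULL, NULL, 0);

	fp = kmem_cache_alloc(cache, KM_SLEEP);
	kmem_cache_free(cache, fp);
	kmem_cache_destroy(cache);
}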
diff --git a/include/spl/sys/kobj.h b/include/spl/sys/kobj.h
new file mode 100644
index 000000000..558ec39a8
--- /dev/null
+++ b/include/spl/sys/kobj.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <[email protected]>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _SPL_KOBJ_H
+#define _SPL_KOBJ_H
+
+#include <sys/vnode.h>
+
+typedef struct _buf {
+ vnode_t *vp;
+} _buf_t;
+
+typedef struct _buf buf_t;
+
+extern struct _buf *kobj_open_file(const char *name);
+extern void kobj_close_file(struct _buf *file);
+extern int kobj_read_file(struct _buf *file, char *buf, unsigned size,
+ unsigned off);
+extern int kobj_get_filesize(struct _buf *file, uint64_t *size);
+
+#endif /* _SPL_KOBJ_H */
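
For illustration only, not part of this patch, a hypothetical loader built on the kobj interface above; the (struct _buf *)-1 failure value follows the usual Solaris convention and is an assumption here:

#include <sys/kobj.h>
#include <sys/kmem.h>

/* Hypothetical helper that reads a whole (small) file into memory. */
static char *
read_small_file(const char *name, uint64_t *sizep)
{
	struct _buf *file;
	char *buf = NULL;

	file = kobj_open_file(name);
	if (file == (struct _buf *)-1)	/* assumed Solaris-style failure value */
		return (NULL);

	if (kobj_get_filesize(file, sizep) == 0) {
		buf = kmem_alloc(*sizep, KM_SLEEP);
		if (kobj_read_file(file, buf, *sizep, 0) < 0) {
			kmem_free(buf, *sizep);
			buf = NULL;
		}
	}

	kobj_close_file(file);
	return (buf);		/* caller frees with kmem_free(buf, *sizep) */
}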
diff --git a/include/spl/sys/kstat.h b/include/spl/sys/kstat.h
new file mode 100644
index 000000000..9170fe24e
--- /dev/null
+++ b/include/spl/sys/kstat.h
@@ -0,0 +1,208 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <[email protected]>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _SPL_KSTAT_H
+#define _SPL_KSTAT_H
+
+#include <linux/module.h>
+#include <sys/types.h>
+#include <sys/time.h>
+#include <sys/kmem.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+
+#define KSTAT_STRLEN 255
+#define KSTAT_RAW_MAX (128*1024)
+
+/*
+ * For reference valid classes are:
+ * disk, tape, net, controller, vm, kvm, hat, streams, kstat, misc
+ */
+
+#define KSTAT_TYPE_RAW 0 /* can be anything; ks_ndata >= 1 */
+#define KSTAT_TYPE_NAMED 1 /* name/value pair; ks_ndata >= 1 */
+#define KSTAT_TYPE_INTR 2 /* interrupt stats; ks_ndata == 1 */
+#define KSTAT_TYPE_IO 3 /* I/O stats; ks_ndata == 1 */
+#define KSTAT_TYPE_TIMER 4 /* event timer; ks_ndata >= 1 */
+#define KSTAT_NUM_TYPES 5
+
+#define KSTAT_DATA_CHAR 0
+#define KSTAT_DATA_INT32 1
+#define KSTAT_DATA_UINT32 2
+#define KSTAT_DATA_INT64 3
+#define KSTAT_DATA_UINT64 4
+#define KSTAT_DATA_LONG 5
+#define KSTAT_DATA_ULONG 6
+#define KSTAT_DATA_STRING 7
+#define KSTAT_NUM_DATAS 8
+
+#define KSTAT_INTR_HARD 0
+#define KSTAT_INTR_SOFT 1
+#define KSTAT_INTR_WATCHDOG 2
+#define KSTAT_INTR_SPURIOUS 3
+#define KSTAT_INTR_MULTSVC 4
+#define KSTAT_NUM_INTRS 5
+
+#define KSTAT_FLAG_VIRTUAL 0x01
+#define KSTAT_FLAG_VAR_SIZE 0x02
+#define KSTAT_FLAG_WRITABLE 0x04
+#define KSTAT_FLAG_PERSISTENT 0x08
+#define KSTAT_FLAG_DORMANT 0x10
+#define KSTAT_FLAG_UNSUPPORTED \
+ (KSTAT_FLAG_VAR_SIZE | KSTAT_FLAG_WRITABLE | \
+ KSTAT_FLAG_PERSISTENT | KSTAT_FLAG_DORMANT)
+
+
+#define KS_MAGIC 0x9d9d9d9d
+
+/* Dynamic updates */
+#define KSTAT_READ 0
+#define KSTAT_WRITE 1
+
+struct kstat_s;
+typedef struct kstat_s kstat_t;
+
+typedef int kid_t; /* unique kstat id */
+typedef int kstat_update_t(struct kstat_s *, int); /* dynamic update cb */
+
+typedef struct kstat_module {
+ char ksm_name[KSTAT_STRLEN+1]; /* module name */
+ struct list_head ksm_module_list; /* module linkage */
+ struct list_head ksm_kstat_list; /* list of kstat entries */
+ struct proc_dir_entry *ksm_proc; /* proc entry */
+} kstat_module_t;
+
+typedef struct kstat_raw_ops {
+ int (*headers)(char *buf, size_t size);
+ int (*data)(char *buf, size_t size, void *data);
+ void *(*addr)(kstat_t *ksp, loff_t index);
+} kstat_raw_ops_t;
+
+struct kstat_s {
+ int ks_magic; /* magic value */
+ kid_t ks_kid; /* unique kstat ID */
+ hrtime_t ks_crtime; /* creation time */
+ hrtime_t ks_snaptime; /* last access time */
+ char ks_module[KSTAT_STRLEN+1]; /* provider module name */
+ int ks_instance; /* provider module instance */
+ char ks_name[KSTAT_STRLEN+1]; /* kstat name */
+ char ks_class[KSTAT_STRLEN+1]; /* kstat class */
+ uchar_t ks_type; /* kstat data type */
+ uchar_t ks_flags; /* kstat flags */
+ void *ks_data; /* kstat type-specific data */
+ uint_t ks_ndata; /* # of data records */
+ size_t ks_data_size; /* size of kstat data section */
+ struct proc_dir_entry *ks_proc; /* proc linkage */
+ kstat_update_t *ks_update; /* dynamic updates */
+ void *ks_private; /* private data */
+ kmutex_t ks_private_lock; /* kstat private data lock */
+ kmutex_t *ks_lock; /* kstat data lock */
+ struct list_head ks_list; /* kstat linkage */
+ kstat_module_t *ks_owner; /* kstat module linkage */
+ kstat_raw_ops_t ks_raw_ops; /* ops table for raw type */
+ char *ks_raw_buf; /* buf used for raw ops */
+ size_t ks_raw_bufsize; /* size of raw ops buffer */
+};
+
+typedef struct kstat_named_s {
+ char name[KSTAT_STRLEN]; /* name of counter */
+ uchar_t data_type; /* data type */
+ union {
+ char c[16]; /* 128-bit int */
+ int32_t i32; /* 32-bit signed int */
+ uint32_t ui32; /* 32-bit unsigned int */
+ int64_t i64; /* 64-bit signed int */
+ uint64_t ui64; /* 64-bit unsigned int */
+ long l; /* native signed long */
+ ulong_t ul; /* native unsigned long */
+ struct {
+ union {
+ char *ptr; /* NULL-term string */
+ char __pad[8]; /* 64-bit padding */
+ } addr;
+ uint32_t len; /* # bytes for strlen + '\0' */
+ } string;
+ } value;
+} kstat_named_t;
+
+#define KSTAT_NAMED_STR_PTR(knptr) ((knptr)->value.string.addr.ptr)
+#define KSTAT_NAMED_STR_BUFLEN(knptr) ((knptr)->value.string.len)
+
+typedef struct kstat_intr {
+ uint_t intrs[KSTAT_NUM_INTRS];
+} kstat_intr_t;
+
+typedef struct kstat_io {
+ u_longlong_t nread; /* number of bytes read */
+ u_longlong_t nwritten; /* number of bytes written */
+ uint_t reads; /* number of read operations */
+ uint_t writes; /* number of write operations */
+ hrtime_t wtime; /* cumulative wait (pre-service) time */
+ hrtime_t wlentime; /* cumulative wait len*time product */
+ hrtime_t wlastupdate; /* last time wait queue changed */
+ hrtime_t rtime; /* cumulative run (service) time */
+ hrtime_t rlentime; /* cumulative run length*time product */
+ hrtime_t rlastupdate; /* last time run queue changed */
+ uint_t wcnt; /* count of elements in wait state */
+ uint_t rcnt; /* count of elements in run state */
+} kstat_io_t;
+
+typedef struct kstat_timer {
+ char name[KSTAT_STRLEN+1]; /* event name */
+ u_longlong_t num_events; /* number of events */
+ hrtime_t elapsed_time; /* cumulative elapsed time */
+ hrtime_t min_time; /* shortest event duration */
+ hrtime_t max_time; /* longest event duration */
+ hrtime_t start_time; /* previous event start time */
+ hrtime_t stop_time; /* previous event stop time */
+} kstat_timer_t;
+
+int spl_kstat_init(void);
+void spl_kstat_fini(void);
+
+extern void __kstat_set_raw_ops(kstat_t *ksp,
+ int (*headers)(char *buf, size_t size),
+ int (*data)(char *buf, size_t size, void *data),
+ void* (*addr)(kstat_t *ksp, loff_t index));
+
+extern kstat_t *__kstat_create(const char *ks_module, int ks_instance,
+ const char *ks_name, const char *ks_class, uchar_t ks_type,
+ uint_t ks_ndata, uchar_t ks_flags);
+
+extern void __kstat_install(kstat_t *ksp);
+extern void __kstat_delete(kstat_t *ksp);
+extern void kstat_waitq_enter(kstat_io_t *);
+extern void kstat_waitq_exit(kstat_io_t *);
+extern void kstat_runq_enter(kstat_io_t *);
+extern void kstat_runq_exit(kstat_io_t *);
+
+#define kstat_set_raw_ops(k, h, d, a) \
+ __kstat_set_raw_ops(k, h, d, a)
+#define kstat_create(m, i, n, c, t, s, f) \
+ __kstat_create(m, i, n, c, t, s, f)
+
+#define kstat_install(k) __kstat_install(k)
+#define kstat_delete(k) __kstat_delete(k)
+
+#endif /* _SPL_KSTAT_H */
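
A sketch, not part of this patch, of exporting named statistics with the kstat interface above; the "foo" module, its counters, and the init/fini helpers are all hypothetical:

#include <sys/kstat.h>

/* Hypothetical named statistics exported by a module called "foo". */
typedef struct foo_stats {
	kstat_named_t foo_hits;
	kstat_named_t foo_misses;
} foo_stats_t;

static foo_stats_t foo_stats = {
	{ "hits", KSTAT_DATA_UINT64 },
	{ "misses", KSTAT_DATA_UINT64 },
};

static kstat_t *foo_ksp;

static void
foo_kstat_init(void)
{
	foo_ksp = kstat_create("foo", 0, "stats", "misc", KSTAT_TYPE_NAMED,
	    sizeof (foo_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
	if (foo_ksp != NULL) {
		foo_ksp->ks_data = &foo_stats;	/* KSTAT_FLAG_VIRTUAL: caller owns the data */
		kstat_install(foo_ksp);
	}
}

static void
foo_kstat_fini(void)
{
	if (foo_ksp != NULL)
		kstat_delete(foo_ksp);
}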
diff --git a/include/spl/sys/list.h b/include/spl/sys/list.h
new file mode 100644
index 000000000..74b784e93
--- /dev/null
+++ b/include/spl/sys/list.h
@@ -0,0 +1,208 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <[email protected]>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _SPL_LIST_H
+#define _SPL_LIST_H
+
+#include <sys/types.h>
+#include <linux/list.h>
+
+/*
+ * NOTE: I have implemented the Solaris list API in terms of the native
+ * linux API. This has certain advantages in terms of leveraging the linux
+ * list debugging infrastructure, but it also means that the internals of a
+ * list differ slightly than on Solaris. This is not a problem as long as
+ * all callers stick to the published API. The two major differences are:
+ *
+ * 1) A list_node_t is mapped to a linux list_head struct which changes
+ * the name of the list_next/list_prev pointers to next/prev respectively.
+ *
+ * 2) A list_node_t which is not attached to a list on Solaris is denoted
+ * by having its list_next/list_prev pointers set to NULL. Under linux
+ * the next/prev pointers are set to LIST_POISON1 and LIST_POISON2
+ * respectively. At this moment this only impacts the implementation
+ * of the list_link_init() and list_link_active() functions.
+ */
+
+typedef struct list_head list_node_t;
+
+typedef struct list {
+ size_t list_size;
+ size_t list_offset;
+ list_node_t list_head;
+} list_t;
+
+#define list_d2l(a, obj) ((list_node_t *)(((char *)obj) + (a)->list_offset))
+#define list_object(a, node) ((void *)(((char *)node) - (a)->list_offset))
+
+static inline int
+list_is_empty(list_t *list)
+{
+ return (list_empty(&list->list_head));
+}
+
+static inline void
+list_link_init(list_node_t *node)
+{
+ node->next = LIST_POISON1;
+ node->prev = LIST_POISON2;
+}
+
+static inline void
+list_create(list_t *list, size_t size, size_t offset)
+{
+ list->list_size = size;
+ list->list_offset = offset;
+ INIT_LIST_HEAD(&list->list_head);
+}
+
+static inline void
+list_destroy(list_t *list)
+{
+ list_del(&list->list_head);
+}
+
+static inline void
+list_insert_head(list_t *list, void *object)
+{
+ list_add(list_d2l(list, object), &list->list_head);
+}
+
+static inline void
+list_insert_tail(list_t *list, void *object)
+{
+ list_add_tail(list_d2l(list, object), &list->list_head);
+}
+
+static inline void
+list_insert_after(list_t *list, void *object, void *nobject)
+{
+ if (object == NULL)
+ list_insert_head(list, nobject);
+ else
+ list_add(list_d2l(list, nobject), list_d2l(list, object));
+}
+
+static inline void
+list_insert_before(list_t *list, void *object, void *nobject)
+{
+ if (object == NULL)
+ list_insert_tail(list, nobject);
+ else
+ list_add_tail(list_d2l(list, nobject), list_d2l(list, object));
+}
+
+static inline void
+list_remove(list_t *list, void *object)
+{
+ list_del(list_d2l(list, object));
+}
+
+static inline void *
+list_remove_head(list_t *list)
+{
+ list_node_t *head = list->list_head.next;
+ if (head == &list->list_head)
+ return (NULL);
+
+ list_del(head);
+ return (list_object(list, head));
+}
+
+static inline void *
+list_remove_tail(list_t *list)
+{
+ list_node_t *tail = list->list_head.prev;
+ if (tail == &list->list_head)
+ return (NULL);
+
+ list_del(tail);
+ return (list_object(list, tail));
+}
+
+static inline void *
+list_head(list_t *list)
+{
+ if (list_is_empty(list))
+ return (NULL);
+
+ return (list_object(list, list->list_head.next));
+}
+
+static inline void *
+list_tail(list_t *list)
+{
+ if (list_is_empty(list))
+ return (NULL);
+
+ return (list_object(list, list->list_head.prev));
+}
+
+static inline void *
+list_next(list_t *list, void *object)
+{
+ list_node_t *node = list_d2l(list, object);
+
+ if (node->next != &list->list_head)
+ return (list_object(list, node->next));
+
+ return (NULL);
+}
+
+static inline void *
+list_prev(list_t *list, void *object)
+{
+ list_node_t *node = list_d2l(list, object);
+
+ if (node->prev != &list->list_head)
+ return (list_object(list, node->prev));
+
+ return (NULL);
+}
+
+static inline int
+list_link_active(list_node_t *node)
+{
+ return (node->next != LIST_POISON1) && (node->prev != LIST_POISON2);
+}
+
+static inline void
+spl_list_move_tail(list_t *dst, list_t *src)
+{
+ list_splice_init(&src->list_head, dst->list_head.prev);
+}
+
+#define list_move_tail(dst, src) spl_list_move_tail(dst, src)
+
+static inline void
+list_link_replace(list_node_t *old_node, list_node_t *new_node)
+{
+ new_node->next = old_node->next;
+ new_node->prev = old_node->prev;
+ old_node->prev->next = new_node;
+ old_node->next->prev = new_node;
+ list_link_init(old_node);
+}
+
+#endif /* _SPL_LIST_H */
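
A short sketch, not part of this patch, of the Solaris list API implemented above; item_t and list_example() are hypothetical and only show how the embedded list_node_t and the offset passed to list_create() fit together:

#include <linux/stddef.h>
#include <sys/list.h>

/* Hypothetical element type; the embedded list_node_t provides the linkage. */
typedef struct item {
	int i_value;
	list_node_t i_node;
} item_t;

static void
list_example(item_t *a, item_t *b)
{
	list_t q;
	item_t *it;

	/* The offset tells the list where the link lives inside each object. */
	list_create(&q, sizeof (item_t), offsetof(item_t, i_node));

	list_insert_tail(&q, a);
	list_insert_tail(&q, b);

	/* Walk from head to tail. */
	for (it = list_head(&q); it != NULL; it = list_next(&q, it))
		it->i_value++;

	/* Drain the list before destroying it. */
	while (list_remove_head(&q) != NULL)
		continue;

	list_destroy(&q);
}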
diff --git a/include/spl/sys/mode.h b/include/spl/sys/mode.h
new file mode 100644
index 000000000..02802d0d4
--- /dev/null
+++ b/include/spl/sys/mode.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <[email protected]>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _SPL_MODE_H
+#define _SPL_MODE_H
+
+#define IFTOVT(mode) vn_mode_to_vtype(mode)
+#define VTTOIF(vtype) vn_vtype_to_mode(vtype)
+#define MAKEIMODE(T, M) (VTTOIF(T) | ((M) & ~S_IFMT))
+
+#endif /* SPL_MODE_H */
diff --git a/include/spl/sys/mutex.h b/include/spl/sys/mutex.h
new file mode 100644
index 000000000..f906d49d4
--- /dev/null
+++ b/include/spl/sys/mutex.h
@@ -0,0 +1,184 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <[email protected]>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _SPL_MUTEX_H
+#define _SPL_MUTEX_H
+
+#include <sys/types.h>
+#include <linux/mutex.h>
+#include <linux/lockdep.h>
+
+typedef enum {
+ MUTEX_DEFAULT = 0,
+ MUTEX_SPIN = 1,
+ MUTEX_ADAPTIVE = 2,
+ MUTEX_NOLOCKDEP = 3
+} kmutex_type_t;
+
+typedef struct {
+ struct mutex m_mutex;
+ spinlock_t m_lock; /* used for serializing mutex_exit */
+ kthread_t *m_owner;
+#ifdef CONFIG_LOCKDEP
+ kmutex_type_t m_type;
+#endif /* CONFIG_LOCKDEP */
+} kmutex_t;
+
+#define MUTEX(mp) (&((mp)->m_mutex))
+
+static inline void
+spl_mutex_set_owner(kmutex_t *mp)
+{
+ mp->m_owner = current;
+}
+
+static inline void
+spl_mutex_clear_owner(kmutex_t *mp)
+{
+ mp->m_owner = NULL;
+}
+
+#define mutex_owner(mp) (ACCESS_ONCE((mp)->m_owner))
+#define mutex_owned(mp) (mutex_owner(mp) == current)
+#define MUTEX_HELD(mp) mutex_owned(mp)
+#define MUTEX_NOT_HELD(mp) (!MUTEX_HELD(mp))
+
+#ifdef CONFIG_LOCKDEP
+static inline void
+spl_mutex_set_type(kmutex_t *mp, kmutex_type_t type)
+{
+ mp->m_type = type;
+}
+static inline void
+spl_mutex_lockdep_off_maybe(kmutex_t *mp)
+{
+ if (mp && mp->m_type == MUTEX_NOLOCKDEP)
+ lockdep_off();
+}
+static inline void
+spl_mutex_lockdep_on_maybe(kmutex_t *mp)
+{
+ if (mp && mp->m_type == MUTEX_NOLOCKDEP)
+ lockdep_on();
+}
+#else /* CONFIG_LOCKDEP */
+#define spl_mutex_set_type(mp, type)
+#define spl_mutex_lockdep_off_maybe(mp)
+#define spl_mutex_lockdep_on_maybe(mp)
+#endif /* CONFIG_LOCKDEP */
+
+/*
+ * The following functions must be #defines and not static inlines.
+ * This ensures that the native Linux mutex functions (lock/unlock)
+ * will be correctly located in the user's code, which is important
+ * for the built-in kernel lock analysis tools.
+ */
+#undef mutex_init
+#define mutex_init(mp, name, type, ibc) \
+{ \
+ static struct lock_class_key __key; \
+ ASSERT(type == MUTEX_DEFAULT || type == MUTEX_NOLOCKDEP); \
+ \
+ __mutex_init(MUTEX(mp), (name) ? (#name) : (#mp), &__key); \
+ spin_lock_init(&(mp)->m_lock); \
+ spl_mutex_clear_owner(mp); \
+ spl_mutex_set_type(mp, type); \
+}
+
+#undef mutex_destroy
+#define mutex_destroy(mp) \
+{ \
+ VERIFY3P(mutex_owner(mp), ==, NULL); \
+}
+
+/* BEGIN CSTYLED */
+#define mutex_tryenter(mp) \
+({ \
+ int _rc_; \
+ \
+ spl_mutex_lockdep_off_maybe(mp); \
+ if ((_rc_ = mutex_trylock(MUTEX(mp))) == 1) \
+ spl_mutex_set_owner(mp); \
+ spl_mutex_lockdep_on_maybe(mp); \
+ \
+ _rc_; \
+})
+/* END CSTYLED */
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+#define mutex_enter_nested(mp, subclass) \
+{ \
+ ASSERT3P(mutex_owner(mp), !=, current); \
+ spl_mutex_lockdep_off_maybe(mp); \
+ mutex_lock_nested(MUTEX(mp), (subclass)); \
+ spl_mutex_lockdep_on_maybe(mp); \
+ spl_mutex_set_owner(mp); \
+}
+#else /* CONFIG_DEBUG_LOCK_ALLOC */
+#define mutex_enter_nested(mp, subclass) \
+{ \
+ ASSERT3P(mutex_owner(mp), !=, current); \
+ spl_mutex_lockdep_off_maybe(mp); \
+ mutex_lock(MUTEX(mp)); \
+ spl_mutex_lockdep_on_maybe(mp); \
+ spl_mutex_set_owner(mp); \
+}
+#endif /* CONFIG_DEBUG_LOCK_ALLOC */
+
+#define mutex_enter(mp) mutex_enter_nested((mp), 0)
+
+/*
+ * The reason for the spinlock:
+ *
+ * The Linux mutex uses a fast-path/slow-path design such that it does not
+ * guarantee serialization upon itself, allowing a race where a later
+ * acquirer can finish mutex_unlock before an earlier one has.
+ *
+ * The race renders it unsafe to be used for serializing the freeing of an
+ * object in which the mutex is embedded, where the later acquirer could go
+ * on to free the object while the earlier one is still inside mutex_unlock,
+ * causing memory corruption.
+ *
+ * However, there are many places in ZFS where the mutex is used for
+ * serializing object freeing, and the code is shared among other OSes without
+ * this issue. Thus, we need the spinlock to force the serialization on
+ * mutex_exit().
+ *
+ * See http://lwn.net/Articles/575477/ for the information about the race.
+ */
+#define mutex_exit(mp) \
+{ \
+ spl_mutex_clear_owner(mp); \
+ spin_lock(&(mp)->m_lock); \
+ spl_mutex_lockdep_off_maybe(mp); \
+ mutex_unlock(MUTEX(mp)); \
+ spl_mutex_lockdep_on_maybe(mp); \
+ spin_unlock(&(mp)->m_lock); \
+ /* NOTE: do not dereference mp after this point */ \
+}
+
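+/*
+ * Illustrative usage sketch (not part of this interface); the kmutex_t
+ * named foo_lock is hypothetical.
+ *
+ * kmutex_t foo_lock;
+ *
+ * mutex_init(&foo_lock, NULL, MUTEX_DEFAULT, NULL);
+ * mutex_enter(&foo_lock);
+ * ASSERT(MUTEX_HELD(&foo_lock));
+ * ...critical section...
+ * mutex_exit(&foo_lock);
+ * mutex_destroy(&foo_lock);
+ */
+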
+int spl_mutex_init(void);
+void spl_mutex_fini(void);
+
+#endif /* _SPL_MUTEX_H */
diff --git a/include/spl/sys/param.h b/include/spl/sys/param.h
new file mode 100644
index 000000000..4ef929151
--- /dev/null
+++ b/include/spl/sys/param.h
@@ -0,0 +1,36 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <[email protected]>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _SPL_PARAM_H
+#define _SPL_PARAM_H
+
+#include <asm/page.h>
+
+/* Pages to bytes and back */
+#define ptob(pages) ((pages) << PAGE_SHIFT)
+#define btop(bytes) ((bytes) >> PAGE_SHIFT)
+
+#define MAXUID UINT32_MAX
+
+#endif /* SPL_PARAM_H */
diff --git a/include/spl/sys/proc.h b/include/spl/sys/proc.h
new file mode 100644
index 000000000..287683920
--- /dev/null
+++ b/include/spl/sys/proc.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <[email protected]>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _SPL_PROC_H
+#define _SPL_PROC_H
+
+#include <linux/proc_fs.h>
+
+extern struct proc_dir_entry *proc_spl_kstat;
+
+int spl_proc_init(void);
+void spl_proc_fini(void);
+
+#endif /* SPL_PROC_H */
diff --git a/include/spl/sys/processor.h b/include/spl/sys/processor.h
new file mode 100644
index 000000000..a70101fa2
--- /dev/null
+++ b/include/spl/sys/processor.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <[email protected]>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _SPL_PROCESSOR_H
+#define _SPL_PROCESSOR_H
+
+#define getcpuid() smp_processor_id()
+
+typedef int processorid_t;
+
+#endif /* _SPL_PROCESSOR_H */
diff --git a/include/spl/sys/random.h b/include/spl/sys/random.h
new file mode 100644
index 000000000..93e244f56
--- /dev/null
+++ b/include/spl/sys/random.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <[email protected]>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _SPL_RANDOM_H
+#define _SPL_RANDOM_H
+
+#include <linux/module.h>
+#include <linux/random.h>
+
+static __inline__ int
+random_get_bytes(uint8_t *ptr, size_t len)
+{
+ get_random_bytes((void *)ptr, (int)len);
+ return (0);
+}
+
+extern int random_get_pseudo_bytes(uint8_t *ptr, size_t len);
+
+#endif /* _SPL_RANDOM_H */
diff --git a/include/spl/sys/rwlock.h b/include/spl/sys/rwlock.h
new file mode 100644
index 000000000..b44ceab66
--- /dev/null
+++ b/include/spl/sys/rwlock.h
@@ -0,0 +1,273 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <[email protected]>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _SPL_RWLOCK_H
+#define _SPL_RWLOCK_H
+
+#include <sys/types.h>
+#include <linux/rwsem.h>
+
+/* Linux kernel compatibility */
+#if defined(CONFIG_PREEMPT_RT_FULL)
+#define SPL_RWSEM_SINGLE_READER_VALUE (1)
+#define SPL_RWSEM_SINGLE_WRITER_VALUE (0)
+#elif defined(CONFIG_RWSEM_GENERIC_SPINLOCK)
+#define SPL_RWSEM_SINGLE_READER_VALUE (1)
+#define SPL_RWSEM_SINGLE_WRITER_VALUE (-1)
+#else
+#define SPL_RWSEM_SINGLE_READER_VALUE (RWSEM_ACTIVE_READ_BIAS)
+#define SPL_RWSEM_SINGLE_WRITER_VALUE (RWSEM_ACTIVE_WRITE_BIAS)
+#endif
+
+/* Linux 3.16 changed activity to count for rwsem-spinlock */
+#if defined(CONFIG_PREEMPT_RT_FULL)
+#define RWSEM_COUNT(sem) sem->read_depth
+#elif defined(HAVE_RWSEM_ACTIVITY)
+#define RWSEM_COUNT(sem) sem->activity
+/* Linux 4.8 changed count to an atomic_long_t for !rwsem-spinlock */
+#elif defined(HAVE_RWSEM_ATOMIC_LONG_COUNT)
+#define RWSEM_COUNT(sem) atomic_long_read(&(sem)->count)
+#else
+#define RWSEM_COUNT(sem) sem->count
+#endif
+
+#if defined(RWSEM_SPINLOCK_IS_RAW)
+#define spl_rwsem_lock_irqsave(lk, fl) raw_spin_lock_irqsave(lk, fl)
+#define spl_rwsem_unlock_irqrestore(lk, fl) \
+ raw_spin_unlock_irqrestore(lk, fl)
+#define spl_rwsem_trylock_irqsave(lk, fl) raw_spin_trylock_irqsave(lk, fl)
+#else
+#define spl_rwsem_lock_irqsave(lk, fl) spin_lock_irqsave(lk, fl)
+#define spl_rwsem_unlock_irqrestore(lk, fl) spin_unlock_irqrestore(lk, fl)
+#define spl_rwsem_trylock_irqsave(lk, fl) spin_trylock_irqsave(lk, fl)
+#endif /* RWSEM_SPINLOCK_IS_RAW */
+
+#define spl_rwsem_is_locked(rwsem) rwsem_is_locked(rwsem)
+
+typedef enum {
+ RW_DRIVER = 2,
+ RW_DEFAULT = 4,
+ RW_NOLOCKDEP = 5
+} krw_type_t;
+
+typedef enum {
+ RW_NONE = 0,
+ RW_WRITER = 1,
+ RW_READER = 2
+} krw_t;
+
+/*
+ * If CONFIG_RWSEM_SPIN_ON_OWNER is defined, rw_semaphore will have an owner
+ * field, so we don't need our own.
+ */
+typedef struct {
+ struct rw_semaphore rw_rwlock;
+#ifndef CONFIG_RWSEM_SPIN_ON_OWNER
+ kthread_t *rw_owner;
+#endif
+#ifdef CONFIG_LOCKDEP
+ krw_type_t rw_type;
+#endif /* CONFIG_LOCKDEP */
+} krwlock_t;
+
+#define SEM(rwp) (&(rwp)->rw_rwlock)
+
+static inline void
+spl_rw_set_owner(krwlock_t *rwp)
+{
+/*
+ * If CONFIG_RWSEM_SPIN_ON_OWNER is defined, down_write, up_write,
+ * downgrade_write and __init_rwsem will set/clear owner for us.
+ */
+#ifndef CONFIG_RWSEM_SPIN_ON_OWNER
+ rwp->rw_owner = current;
+#endif
+}
+
+static inline void
+spl_rw_clear_owner(krwlock_t *rwp)
+{
+#ifndef CONFIG_RWSEM_SPIN_ON_OWNER
+ rwp->rw_owner = NULL;
+#endif
+}
+
+static inline kthread_t *
+rw_owner(krwlock_t *rwp)
+{
+#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
+ return (SEM(rwp)->owner);
+#else
+ return (rwp->rw_owner);
+#endif
+}
+
+#ifdef CONFIG_LOCKDEP
+static inline void
+spl_rw_set_type(krwlock_t *rwp, krw_type_t type)
+{
+ rwp->rw_type = type;
+}
+static inline void
+spl_rw_lockdep_off_maybe(krwlock_t *rwp)
+{
+ if (rwp && rwp->rw_type == RW_NOLOCKDEP)
+ lockdep_off();
+}
+static inline void
+spl_rw_lockdep_on_maybe(krwlock_t *rwp)
+{
+ if (rwp && rwp->rw_type == RW_NOLOCKDEP)
+ lockdep_on();
+}
+#else /* CONFIG_LOCKDEP */
+#define spl_rw_set_type(rwp, type)
+#define spl_rw_lockdep_off_maybe(rwp)
+#define spl_rw_lockdep_on_maybe(rwp)
+#endif /* CONFIG_LOCKDEP */
+
+static inline int
+RW_READ_HELD(krwlock_t *rwp)
+{
+ /*
+ * Linux 4.8 will set owner to 1 when read held instead of leaving it
+ * NULL. So we check whether owner <= 1.
+ */
+ return (spl_rwsem_is_locked(SEM(rwp)) &&
+ (unsigned long)rw_owner(rwp) <= 1);
+}
+
+static inline int
+RW_WRITE_HELD(krwlock_t *rwp)
+{
+ return (rw_owner(rwp) == current);
+}
+
+static inline int
+RW_LOCK_HELD(krwlock_t *rwp)
+{
+ return (spl_rwsem_is_locked(SEM(rwp)));
+}
+
+/*
+ * The following functions must be #defines and not static inlines.
+ * This ensures that the native Linux semaphore functions (down/up)
+ * will be correctly located in the user's code, which is important
+ * for the built-in kernel lock analysis tools.
+ */
+/* BEGIN CSTYLED */
+#define rw_init(rwp, name, type, arg) \
+({ \
+ static struct lock_class_key __key; \
+ ASSERT(type == RW_DEFAULT || type == RW_NOLOCKDEP); \
+ \
+ __init_rwsem(SEM(rwp), #rwp, &__key); \
+ spl_rw_clear_owner(rwp); \
+ spl_rw_set_type(rwp, type); \
+})
+
+#define rw_destroy(rwp) \
+({ \
+ VERIFY(!RW_LOCK_HELD(rwp)); \
+})
+
+#define rw_tryenter(rwp, rw) \
+({ \
+ int _rc_ = 0; \
+ \
+ spl_rw_lockdep_off_maybe(rwp); \
+ switch (rw) { \
+ case RW_READER: \
+ _rc_ = down_read_trylock(SEM(rwp)); \
+ break; \
+ case RW_WRITER: \
+ if ((_rc_ = down_write_trylock(SEM(rwp)))) \
+ spl_rw_set_owner(rwp); \
+ break; \
+ default: \
+ VERIFY(0); \
+ } \
+ spl_rw_lockdep_on_maybe(rwp); \
+ _rc_; \
+})
+
+#define rw_enter(rwp, rw) \
+({ \
+ spl_rw_lockdep_off_maybe(rwp); \
+ switch (rw) { \
+ case RW_READER: \
+ down_read(SEM(rwp)); \
+ break; \
+ case RW_WRITER: \
+ down_write(SEM(rwp)); \
+ spl_rw_set_owner(rwp); \
+ break; \
+ default: \
+ VERIFY(0); \
+ } \
+ spl_rw_lockdep_on_maybe(rwp); \
+})
+
+#define rw_exit(rwp) \
+({ \
+ spl_rw_lockdep_off_maybe(rwp); \
+ if (RW_WRITE_HELD(rwp)) { \
+ spl_rw_clear_owner(rwp); \
+ up_write(SEM(rwp)); \
+ } else { \
+ ASSERT(RW_READ_HELD(rwp)); \
+ up_read(SEM(rwp)); \
+ } \
+ spl_rw_lockdep_on_maybe(rwp); \
+})
+
+#define rw_downgrade(rwp) \
+({ \
+ spl_rw_lockdep_off_maybe(rwp); \
+ spl_rw_clear_owner(rwp); \
+ downgrade_write(SEM(rwp)); \
+ spl_rw_lockdep_on_maybe(rwp); \
+})
+
+#define rw_tryupgrade(rwp) \
+({ \
+ int _rc_ = 0; \
+ \
+ if (RW_WRITE_HELD(rwp)) { \
+ _rc_ = 1; \
+ } else { \
+ spl_rw_lockdep_off_maybe(rwp); \
+ if ((_rc_ = rwsem_tryupgrade(SEM(rwp)))) \
+ spl_rw_set_owner(rwp); \
+ spl_rw_lockdep_on_maybe(rwp); \
+ } \
+ _rc_; \
+})
+/* END CSTYLED */
+
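+/*
+ * Illustrative usage sketch (not part of this interface); the krwlock_t
+ * named foo_rwlock is hypothetical.
+ *
+ * krwlock_t foo_rwlock;
+ *
+ * rw_init(&foo_rwlock, NULL, RW_DEFAULT, NULL);
+ * rw_enter(&foo_rwlock, RW_READER);
+ * ...read-only access...
+ * if (!rw_tryupgrade(&foo_rwlock)) {
+ *     rw_exit(&foo_rwlock);
+ *     rw_enter(&foo_rwlock, RW_WRITER);
+ * }
+ * ...modify protected state...
+ * rw_exit(&foo_rwlock);
+ * rw_destroy(&foo_rwlock);
+ */
+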
+int spl_rw_init(void);
+void spl_rw_fini(void);
+int rwsem_tryupgrade(struct rw_semaphore *rwsem);
+
+#endif /* _SPL_RWLOCK_H */
diff --git a/include/spl/sys/shrinker.h b/include/spl/sys/shrinker.h
new file mode 100644
index 000000000..28c1fa78c
--- /dev/null
+++ b/include/spl/sys/shrinker.h
@@ -0,0 +1,209 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <[email protected]>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _SPL_SHRINKER_H
+#define _SPL_SHRINKER_H
+
+#include <linux/mm.h>
+#include <linux/fs.h>
+
+#if !defined(HAVE_SHRINK_CONTROL_STRUCT)
+struct shrink_control {
+ gfp_t gfp_mask;
+ unsigned long nr_to_scan;
+};
+#endif /* HAVE_SHRINK_CONTROL_STRUCT */
+
+/*
+ * Due to frequent changes in the shrinker API the following
+ * compatibility wrappers should be used. They are as follows:
+ *
+ * SPL_SHRINKER_DECLARE is used to declare the shrinker which is
+ * passed to spl_register_shrinker()/spl_unregister_shrinker(). Use
+ * shrinker_name to set the shrinker variable name, shrinker_callback
+ * to set the callback function, and seek_cost to define the cost of
+ * reclaiming an object.
+ *
+ * SPL_SHRINKER_DECLARE(shrinker_name, shrinker_callback, seek_cost);
+ *
+ * SPL_SHRINKER_CALLBACK_FWD_DECLARE is used when a forward declaration
+ * of the shrinker callback function is required. Only the callback
+ * function needs to be passed.
+ *
+ * SPL_SHRINKER_CALLBACK_FWD_DECLARE(shrinker_callback);
+ *
+ * SPL_SHRINKER_CALLBACK_WRAPPER is used to declare the callback function
+ * which is registered with the shrinker. This function will call your
+ * custom shrinker, which must use the following prototype. Note the
+ * leading __'s; they must be prepended to the callback function name.
+ *
+ * int __shrinker_callback(struct shrinker *, struct shrink_control *)
+ * SPL_SHRINKER_CALLBACK_WRAPPER(shrinker_callback);
+ *
+ *
+ * Example:
+ *
+ * SPL_SHRINKER_CALLBACK_FWD_DECLARE(my_shrinker_fn);
+ * SPL_SHRINKER_DECLARE(my_shrinker, my_shrinker_fn, 1);
+ *
+ * static int
+ * __my_shrinker_fn(struct shrinker *shrink, struct shrink_control *sc)
+ * {
+ * if (sc->nr_to_scan) {
+ * ...scan objects in the cache and reclaim them...
+ * }
+ *
+ * ...calculate number of objects in the cache...
+ *
+ * return (number of objects in the cache);
+ * }
+ * SPL_SHRINKER_CALLBACK_WRAPPER(my_shrinker_fn);
+ */
+
+#define spl_register_shrinker(x) register_shrinker(x)
+#define spl_unregister_shrinker(x) unregister_shrinker(x)
+
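+/*
+ * Illustrative registration sketch (my_shrinker and my_shrinker_fn are
+ * hypothetical); a shrinker declared with SPL_SHRINKER_DECLARE() as
+ * described above is typically registered from the consumer's init path
+ * and removed on exit:
+ *
+ * SPL_SHRINKER_CALLBACK_FWD_DECLARE(my_shrinker_fn);
+ * SPL_SHRINKER_DECLARE(my_shrinker, my_shrinker_fn, 1);
+ *
+ * spl_register_shrinker(&my_shrinker);
+ * ...
+ * spl_unregister_shrinker(&my_shrinker);
+ */
+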
+/*
+ * Linux 2.6.23 - 2.6.34 Shrinker API Compatibility.
+ */
+#if defined(HAVE_2ARGS_OLD_SHRINKER_CALLBACK)
+#define SPL_SHRINKER_DECLARE(s, x, y) \
+static struct shrinker s = { \
+ .shrink = x, \
+ .seeks = y \
+}
+
+#define SPL_SHRINKER_CALLBACK_FWD_DECLARE(fn) \
+static int fn(int nr_to_scan, unsigned int gfp_mask)
+
+#define SPL_SHRINKER_CALLBACK_WRAPPER(fn) \
+static int \
+fn(int nr_to_scan, unsigned int gfp_mask) \
+{ \
+ struct shrink_control sc; \
+ \
+ sc.nr_to_scan = nr_to_scan; \
+ sc.gfp_mask = gfp_mask; \
+ \
+ return (__ ## fn(NULL, &sc)); \
+}
+
+/*
+ * Linux 2.6.35 to 2.6.39 Shrinker API Compatibility.
+ */
+#elif defined(HAVE_3ARGS_SHRINKER_CALLBACK)
+#define SPL_SHRINKER_DECLARE(s, x, y) \
+static struct shrinker s = { \
+ .shrink = x, \
+ .seeks = y \
+}
+
+#define SPL_SHRINKER_CALLBACK_FWD_DECLARE(fn) \
+static int fn(struct shrinker *, int, unsigned int)
+
+#define SPL_SHRINKER_CALLBACK_WRAPPER(fn) \
+static int \
+fn(struct shrinker *shrink, int nr_to_scan, unsigned int gfp_mask) \
+{ \
+ struct shrink_control sc; \
+ \
+ sc.nr_to_scan = nr_to_scan; \
+ sc.gfp_mask = gfp_mask; \
+ \
+ return (__ ## fn(shrink, &sc)); \
+}
+
+/*
+ * Linux 3.0 to 3.11 Shrinker API Compatibility.
+ */
+#elif defined(HAVE_2ARGS_NEW_SHRINKER_CALLBACK)
+#define SPL_SHRINKER_DECLARE(s, x, y) \
+static struct shrinker s = { \
+ .shrink = x, \
+ .seeks = y \
+}
+
+#define SPL_SHRINKER_CALLBACK_FWD_DECLARE(fn) \
+static int fn(struct shrinker *, struct shrink_control *)
+
+#define SPL_SHRINKER_CALLBACK_WRAPPER(fn) \
+static int \
+fn(struct shrinker *shrink, struct shrink_control *sc) \
+{ \
+ return (__ ## fn(shrink, sc)); \
+}
+
+/*
+ * Linux 3.12 and later Shrinker API Compatibility.
+ */
+#elif defined(HAVE_SPLIT_SHRINKER_CALLBACK)
+#define SPL_SHRINKER_DECLARE(s, x, y) \
+static struct shrinker s = { \
+ .count_objects = x ## _count_objects, \
+ .scan_objects = x ## _scan_objects, \
+ .seeks = y \
+}
+
+#define SPL_SHRINKER_CALLBACK_FWD_DECLARE(fn) \
+static unsigned long fn ## _count_objects(struct shrinker *, \
+ struct shrink_control *); \
+static unsigned long fn ## _scan_objects(struct shrinker *, \
+ struct shrink_control *)
+
+#define SPL_SHRINKER_CALLBACK_WRAPPER(fn) \
+static unsigned long \
+fn ## _count_objects(struct shrinker *shrink, struct shrink_control *sc)\
+{ \
+ int __ret__; \
+ \
+ sc->nr_to_scan = 0; \
+ __ret__ = __ ## fn(NULL, sc); \
+ \
+ /* Errors may not be returned and must be converted to zeros */ \
+ return ((__ret__ < 0) ? 0 : __ret__); \
+} \
+ \
+static unsigned long \
+fn ## _scan_objects(struct shrinker *shrink, struct shrink_control *sc) \
+{ \
+ int __ret__; \
+ \
+ __ret__ = __ ## fn(NULL, sc); \
+ return ((__ret__ < 0) ? SHRINK_STOP : __ret__); \
+}
+#else
+/*
+ * Either Linux 2.x to 2.6.22, or a newer kernel whose shrinker API is not yet handled.
+ */
+#error "Unknown shrinker callback"
+#endif
+
+#if defined(HAVE_SPLIT_SHRINKER_CALLBACK)
+typedef unsigned long spl_shrinker_t;
+#else
+typedef int spl_shrinker_t;
+#define SHRINK_STOP (-1)
+#endif
+
+#endif /* SPL_SHRINKER_H */
diff --git a/include/spl/sys/sid.h b/include/spl/sys/sid.h
new file mode 100644
index 000000000..731b62c47
--- /dev/null
+++ b/include/spl/sys/sid.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <[email protected]>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _SPL_SID_H
+#define _SPL_SID_H
+
+typedef struct ksiddomain {
+ char *kd_name;
+} ksiddomain_t;
+
+typedef enum ksid_index {
+ KSID_USER,
+ KSID_GROUP,
+ KSID_OWNER,
+ KSID_COUNT
+} ksid_index_t;
+
+typedef int ksid_t;
+
+static inline ksiddomain_t *
+ksid_lookupdomain(const char *dom)
+{
+ ksiddomain_t *kd;
+ int len = strlen(dom);
+
+ kd = kmem_zalloc(sizeof (ksiddomain_t), KM_SLEEP);
+ kd->kd_name = kmem_zalloc(len + 1, KM_SLEEP);
+ memcpy(kd->kd_name, dom, len);
+
+ return (kd);
+}
+
+static inline void
+ksiddomain_rele(ksiddomain_t *ksid)
+{
+ kmem_free(ksid->kd_name, strlen(ksid->kd_name) + 1);
+ kmem_free(ksid, sizeof (ksiddomain_t));
+}
+
+#endif /* _SPL_SID_H */
diff --git a/include/spl/sys/signal.h b/include/spl/sys/signal.h
new file mode 100644
index 000000000..36b8b5d98
--- /dev/null
+++ b/include/spl/sys/signal.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <[email protected]>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _SPL_SIGNAL_H
+#define _SPL_SIGNAL_H
+
+#include <linux/sched.h>
+
+#ifdef HAVE_SCHED_SIGNAL_HEADER
+#include <linux/sched/signal.h>
+#endif
+
+#define FORREAL 0 /* Usual side-effects */
+#define JUSTLOOKING 1 /* Don't stop the process */
+
+/*
+ * The "why" argument indicates the allowable side-effects of the call:
+ *
+ * FORREAL: Extract the next pending signal from p_sig into p_cursig;
+ * stop the process if a stop has been requested or if a traced signal
+ * is pending.
+ *
+ * JUSTLOOKING: Don't stop the process, just indicate whether or not
+ * a signal might be pending (FORREAL is needed to tell for sure).
+ */
+static __inline__ int
+issig(int why)
+{
+ ASSERT(why == FORREAL || why == JUSTLOOKING);
+
+ return (signal_pending(current));
+}
+
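+/*
+ * Illustrative usage sketch (more_work_to_do() is hypothetical): abort a
+ * long-running loop once the calling thread has a signal pending.
+ *
+ * while (more_work_to_do()) {
+ *     if (issig(JUSTLOOKING) && issig(FORREAL))
+ *         return (EINTR);
+ *     ...do the next unit of work...
+ * }
+ */
+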
+#endif /* SPL_SIGNAL_H */
diff --git a/include/spl/sys/stat.h b/include/spl/sys/stat.h
new file mode 100644
index 000000000..83018e894
--- /dev/null
+++ b/include/spl/sys/stat.h
@@ -0,0 +1,30 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <[email protected]>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _SPL_STAT_H
+#define _SPL_STAT_H
+
+#include <linux/stat.h>
+
+#endif /* SPL_STAT_H */
diff --git a/include/spl/sys/strings.h b/include/spl/sys/strings.h
new file mode 100644
index 000000000..4fb803206
--- /dev/null
+++ b/include/spl/sys/strings.h
@@ -0,0 +1,31 @@
+/*
+ * Copyright (C) 2018 Lawrence Livermore National Security, LLC.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+#ifndef _SPL_SYS_STRINGS_H
+#define _SPL_SYS_STRINGS_H
+
+#include <linux/string.h>
+
+#define bzero(ptr, size) memset(ptr, 0, size)
+#define bcopy(src, dest, size) memmove(dest, src, size)
+#define bcmp(src, dest, size) memcmp((src), (dest), (size_t)(size))
+
+#endif /* _SPL_SYS_STRINGS_H */
diff --git a/include/spl/sys/sunddi.h b/include/spl/sys/sunddi.h
new file mode 100644
index 000000000..29a6fe00d
--- /dev/null
+++ b/include/spl/sys/sunddi.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <[email protected]>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _SPL_SUNDDI_H
+#define _SPL_SUNDDI_H
+
+#include <sys/cred.h>
+#include <sys/uio.h>
+#include <sys/mutex.h>
+#include <sys/u8_textprep.h>
+#include <sys/vnode.h>
+
+typedef int ddi_devid_t;
+
+#define DDI_DEV_T_NONE ((dev_t)-1)
+#define DDI_DEV_T_ANY ((dev_t)-2)
+#define DI_MAJOR_T_UNKNOWN ((major_t)0)
+
+#define DDI_PROP_DONTPASS 0x0001
+#define DDI_PROP_CANSLEEP 0x0002
+
+#define DDI_SUCCESS 0
+#define DDI_FAILURE -1
+
+#define ddi_prop_lookup_string(x1, x2, x3, x4, x5) (*x5 = NULL)
+#define ddi_prop_free(x) (void)0
+#define ddi_root_node() (void)0
+
+extern int ddi_strtoul(const char *, char **, int, unsigned long *);
+extern int ddi_strtol(const char *, char **, int, long *);
+extern int ddi_strtoull(const char *, char **, int, unsigned long long *);
+extern int ddi_strtoll(const char *, char **, int, long long *);
+
+extern int ddi_copyin(const void *from, void *to, size_t len, int flags);
+extern int ddi_copyout(const void *from, void *to, size_t len, int flags);
+
+#endif /* SPL_SUNDDI_H */
diff --git a/include/spl/sys/sysmacros.h b/include/spl/sys/sysmacros.h
new file mode 100644
index 000000000..839e7fd8c
--- /dev/null
+++ b/include/spl/sys/sysmacros.h
@@ -0,0 +1,228 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <[email protected]>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _SPL_SYSMACROS_H
+#define _SPL_SYSMACROS_H
+
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/cpumask.h>
+#include <sys/debug.h>
+#include <sys/zone.h>
+#include <sys/signal.h>
+#include <asm/page.h>
+
+#ifdef HAVE_SCHED_RT_HEADER
+#include <linux/sched/rt.h>
+#endif
+
+#ifndef _KERNEL
+#define _KERNEL __KERNEL__
+#endif
+
+#define FALSE 0
+#define TRUE 1
+
+#define INT8_MAX (127)
+#define INT8_MIN (-128)
+#define UINT8_MAX (255)
+#define UINT8_MIN (0)
+
+#define INT16_MAX (32767)
+#define INT16_MIN (-32768)
+#define UINT16_MAX (65535)
+#define UINT16_MIN (0)
+
+#define INT32_MAX INT_MAX
+#define INT32_MIN INT_MIN
+#define UINT32_MAX UINT_MAX
+#define UINT32_MIN UINT_MIN
+
+#define INT64_MAX LLONG_MAX
+#define INT64_MIN LLONG_MIN
+#define UINT64_MAX ULLONG_MAX
+#define UINT64_MIN ULLONG_MIN
+
+#define NBBY 8
+
+#define MAXMSGLEN 256
+#define MAXNAMELEN 256
+#define MAXPATHLEN 4096
+#define MAXOFFSET_T LLONG_MAX
+#define MAXBSIZE 8192
+#define DEV_BSIZE 512
+#define DEV_BSHIFT 9 /* log2(DEV_BSIZE) */
+
+#define proc_pageout NULL
+#define curproc current
+#define max_ncpus num_possible_cpus()
+#define boot_ncpus num_online_cpus()
+#define CPU_SEQID smp_processor_id()
+#define _NOTE(x)
+#define is_system_labeled() 0
+
+#ifndef RLIM64_INFINITY
+#define RLIM64_INFINITY (~0ULL)
+#endif
+
+/*
+ * 0..MAX_PRIO-1: Process priority
+ * 0..MAX_RT_PRIO-1: RT priority tasks
+ * MAX_RT_PRIO..MAX_PRIO-1: SCHED_NORMAL tasks
+ *
+ * Treat shim tasks as SCHED_NORMAL tasks
+ */
+#define minclsyspri (MAX_PRIO-1)
+#define maxclsyspri (MAX_RT_PRIO)
+#define defclsyspri (DEFAULT_PRIO)
+
+#ifndef NICE_TO_PRIO
+#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
+#endif
+#ifndef PRIO_TO_NICE
+#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)
+#endif
+
+/*
+ * Missing macros
+ */
+#ifndef PAGESIZE
+#define PAGESIZE PAGE_SIZE
+#endif
+
+#ifndef PAGESHIFT
+#define PAGESHIFT PAGE_SHIFT
+#endif
+
+/* Dtrace probes do not exist in the linux kernel */
+#ifdef DTRACE_PROBE
+#undef DTRACE_PROBE
+#endif /* DTRACE_PROBE */
+#define DTRACE_PROBE(a) ((void)0)
+
+#ifdef DTRACE_PROBE1
+#undef DTRACE_PROBE1
+#endif /* DTRACE_PROBE1 */
+#define DTRACE_PROBE1(a, b, c) ((void)0)
+
+#ifdef DTRACE_PROBE2
+#undef DTRACE_PROBE2
+#endif /* DTRACE_PROBE2 */
+#define DTRACE_PROBE2(a, b, c, d, e) ((void)0)
+
+#ifdef DTRACE_PROBE3
+#undef DTRACE_PROBE3
+#endif /* DTRACE_PROBE3 */
+#define DTRACE_PROBE3(a, b, c, d, e, f, g) ((void)0)
+
+#ifdef DTRACE_PROBE4
+#undef DTRACE_PROBE4
+#endif /* DTRACE_PROBE4 */
+#define DTRACE_PROBE4(a, b, c, d, e, f, g, h, i) ((void)0)
+
+/* Missing globals */
+extern char spl_version[32];
+extern unsigned long spl_hostid;
+
+/* Missing misc functions */
+extern uint32_t zone_get_hostid(void *zone);
+extern void spl_setup(void);
+extern void spl_cleanup(void);
+
+#define highbit(x) __fls(x)
+#define lowbit(x) __ffs(x)
+
+#define highbit64(x) fls64(x)
+#define makedevice(maj, min) makedev(maj, min)
+
+/* common macros */
+#ifndef MIN
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+#endif
+#ifndef MAX
+#define MAX(a, b) ((a) < (b) ? (b) : (a))
+#endif
+#ifndef ABS
+#define ABS(a) ((a) < 0 ? -(a) : (a))
+#endif
+#ifndef DIV_ROUND_UP
+#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))
+#endif
+#ifndef roundup
+#define roundup(x, y) ((((x) + ((y) - 1)) / (y)) * (y))
+#endif
+#ifndef howmany
+#define howmany(x, y) (((x) + ((y) - 1)) / (y))
+#endif
+
+/*
+ * Compatibility macros/typedefs needed for Solaris -> Linux port
+ */
+#define P2ALIGN(x, align) ((x) & -(align))
+#define P2CROSS(x, y, align) (((x) ^ (y)) > (align) - 1)
+#define P2ROUNDUP(x, align) ((((x) - 1) | ((align) - 1)) + 1)
+#define P2PHASE(x, align) ((x) & ((align) - 1))
+#define P2NPHASE(x, align) (-(x) & ((align) - 1))
+#define ISP2(x) (((x) & ((x) - 1)) == 0)
+#define IS_P2ALIGNED(v, a) ((((uintptr_t)(v)) & ((uintptr_t)(a) - 1)) == 0)
+#define P2BOUNDARY(off, len, align) \
+ (((off) ^ ((off) + (len) - 1)) > (align) - 1)
+
+/*
+ * Typed version of the P2* macros. These macros should be used to ensure
+ * that the result is correctly calculated based on the data type of (x);
+ * that type is passed in as the last argument, regardless of the data
+ * type of the alignment. For example, if (x) is of type uint64_t,
+ * and we want to round it up to a page boundary using "PAGESIZE" as
+ * the alignment, we can do either
+ *
+ * P2ROUNDUP(x, (uint64_t)PAGESIZE)
+ * or
+ * P2ROUNDUP_TYPED(x, PAGESIZE, uint64_t)
+ */
+#define P2ALIGN_TYPED(x, align, type) \
+ ((type)(x) & -(type)(align))
+#define P2PHASE_TYPED(x, align, type) \
+ ((type)(x) & ((type)(align) - 1))
+#define P2NPHASE_TYPED(x, align, type) \
+ (-(type)(x) & ((type)(align) - 1))
+#define P2ROUNDUP_TYPED(x, align, type) \
+ ((((type)(x) - 1) | ((type)(align) - 1)) + 1)
+#define P2END_TYPED(x, align, type) \
+ (-(~(type)(x) & -(type)(align)))
+#define P2PHASEUP_TYPED(x, align, phase, type) \
+ ((type)(phase) - (((type)(phase) - (type)(x)) & -(type)(align)))
+#define P2CROSS_TYPED(x, y, align, type) \
+ (((type)(x) ^ (type)(y)) > (type)(align) - 1)
+#define P2SAMEHIGHBIT_TYPED(x, y, type) \
+ (((type)(x) ^ (type)(y)) < ((type)(x) & (type)(y)))
+
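+/*
+ * Worked example (illustrative): with a 4096-byte alignment and x = 4097
+ * the macros above evaluate as follows:
+ *
+ * P2ALIGN(4097, 4096)   == 4096
+ * P2PHASE(4097, 4096)   == 1
+ * P2NPHASE(4097, 4096)  == 4095
+ * P2ROUNDUP(4097, 4096) == 8192
+ * ISP2(4096) == 1, ISP2(4097) == 0
+ */
+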
+#if defined(_KERNEL) && !defined(_KMEMUSER) && !defined(offsetof)
+
+/* avoid any possibility of clashing with <stddef.h> version */
+
+#define offsetof(s, m) ((size_t)(&(((s *)0)->m)))
+#endif
+
+#endif /* _SPL_SYSMACROS_H */
diff --git a/include/spl/sys/systeminfo.h b/include/spl/sys/systeminfo.h
new file mode 100644
index 000000000..225569158
--- /dev/null
+++ b/include/spl/sys/systeminfo.h
@@ -0,0 +1,36 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <[email protected]>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _SPL_SYSTEMINFO_H
+#define _SPL_SYSTEMINFO_H
+
+#define HW_HOSTID_LEN 11 /* minimum buffer size needed */
+ /* to hold a decimal or hex */
+ /* hostid string */
+
+/* Supplemental definitions for Linux. */
+#define HW_HOSTID_PATH "/etc/hostid" /* binary configuration file */
+#define HW_HOSTID_MASK 0xFFFFFFFF /* significant hostid bits */
+
+#endif /* SPL_SYSTEMINFO_H */
diff --git a/include/spl/sys/taskq.h b/include/spl/sys/taskq.h
new file mode 100644
index 000000000..7353367a2
--- /dev/null
+++ b/include/spl/sys/taskq.h
@@ -0,0 +1,163 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <[email protected]>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _SPL_TASKQ_H
+#define _SPL_TASKQ_H
+
+#include <linux/module.h>
+#include <linux/gfp.h>
+#include <linux/slab.h>
+#include <linux/interrupt.h>
+#include <linux/kthread.h>
+#include <sys/types.h>
+#include <sys/thread.h>
+#include <sys/rwlock.h>
+#include <sys/wait.h>
+
+#define TASKQ_NAMELEN 31
+
+#define TASKQ_PREPOPULATE 0x00000001
+#define TASKQ_CPR_SAFE 0x00000002
+#define TASKQ_DYNAMIC 0x00000004
+#define TASKQ_THREADS_CPU_PCT 0x00000008
+#define TASKQ_DC_BATCH 0x00000010
+#define TASKQ_ACTIVE 0x80000000
+
+/*
+ * Flags for taskq_dispatch. TQ_SLEEP/TQ_NOSLEEP should be same as
+ * KM_SLEEP/KM_NOSLEEP. TQ_NOQUEUE/TQ_NOALLOC are set particularly
+ * large so as not to conflict with already used GFP_* defines.
+ */
+#define TQ_SLEEP 0x00000000
+#define TQ_NOSLEEP 0x00000001
+#define TQ_PUSHPAGE 0x00000002
+#define TQ_NOQUEUE 0x01000000
+#define TQ_NOALLOC 0x02000000
+#define TQ_NEW 0x04000000
+#define TQ_FRONT 0x08000000
+
+/*
+ * Reserved taskqid values.
+ */
+#define TASKQID_INVALID ((taskqid_t)0)
+#define TASKQID_INITIAL ((taskqid_t)1)
+
+/*
+ * spin_lock(lock) and spin_lock_nested(lock,0) are equivalent,
+ * so TQ_LOCK_DYNAMIC must not evaluate to 0
+ */
+typedef enum tq_lock_role {
+ TQ_LOCK_GENERAL = 0,
+ TQ_LOCK_DYNAMIC = 1,
+} tq_lock_role_t;
+
+typedef unsigned long taskqid_t;
+typedef void (task_func_t)(void *);
+
+typedef struct taskq {
+ spinlock_t tq_lock; /* protects taskq_t */
+ char *tq_name; /* taskq name */
+ int tq_instance; /* instance of tq_name */
+ struct list_head tq_thread_list; /* list of all threads */
+ struct list_head tq_active_list; /* list of active threads */
+ int tq_nactive; /* # of active threads */
+ int tq_nthreads; /* # of existing threads */
+ int tq_nspawn; /* # of threads being spawned */
+ int tq_maxthreads; /* # of threads maximum */
+ int tq_pri; /* priority */
+ int tq_minalloc; /* min taskq_ent_t pool size */
+ int tq_maxalloc; /* max taskq_ent_t pool size */
+ int tq_nalloc; /* cur taskq_ent_t pool size */
+ uint_t tq_flags; /* flags */
+ taskqid_t tq_next_id; /* next pend/work id */
+ taskqid_t tq_lowest_id; /* lowest pend/work id */
+ struct list_head tq_free_list; /* free taskq_ent_t's */
+ struct list_head tq_pend_list; /* pending taskq_ent_t's */
+ struct list_head tq_prio_list; /* priority taskq_ent_t's */
+ struct list_head tq_delay_list; /* delayed taskq_ent_t's */
+ struct list_head tq_taskqs; /* all taskq_t's */
+ spl_wait_queue_head_t tq_work_waitq; /* new work waitq */
+ spl_wait_queue_head_t tq_wait_waitq; /* wait waitq */
+ tq_lock_role_t tq_lock_class; /* class when taking tq_lock */
+} taskq_t;
+
+typedef struct taskq_ent {
+ spinlock_t tqent_lock;
+ spl_wait_queue_head_t tqent_waitq;
+ struct timer_list tqent_timer;
+ struct list_head tqent_list;
+ taskqid_t tqent_id;
+ task_func_t *tqent_func;
+ void *tqent_arg;
+ taskq_t *tqent_taskq;
+ uintptr_t tqent_flags;
+ unsigned long tqent_birth;
+} taskq_ent_t;
+
+#define TQENT_FLAG_PREALLOC 0x1
+#define TQENT_FLAG_CANCEL 0x2
+
+typedef struct taskq_thread {
+ struct list_head tqt_thread_list;
+ struct list_head tqt_active_list;
+ struct task_struct *tqt_thread;
+ taskq_t *tqt_tq;
+ taskqid_t tqt_id;
+ taskq_ent_t *tqt_task;
+ uintptr_t tqt_flags;
+} taskq_thread_t;
+
+/* Global system-wide dynamic task queue available for all consumers */
+extern taskq_t *system_taskq;
+/* Global dynamic task queue for long delay */
+extern taskq_t *system_delay_taskq;
+
+/* List of all taskqs */
+extern struct list_head tq_list;
+extern struct rw_semaphore tq_list_sem;
+
+extern taskqid_t taskq_dispatch(taskq_t *, task_func_t, void *, uint_t);
+extern taskqid_t taskq_dispatch_delay(taskq_t *, task_func_t, void *,
+ uint_t, clock_t);
+extern void taskq_dispatch_ent(taskq_t *, task_func_t, void *, uint_t,
+ taskq_ent_t *);
+extern int taskq_empty_ent(taskq_ent_t *);
+extern void taskq_init_ent(taskq_ent_t *);
+extern taskq_t *taskq_create(const char *, int, pri_t, int, int, uint_t);
+extern void taskq_destroy(taskq_t *);
+extern void taskq_wait_id(taskq_t *, taskqid_t);
+extern void taskq_wait_outstanding(taskq_t *, taskqid_t);
+extern void taskq_wait(taskq_t *);
+extern int taskq_cancel_id(taskq_t *, taskqid_t);
+extern int taskq_member(taskq_t *, kthread_t *);
+
+#define taskq_create_proc(name, nthreads, pri, min, max, proc, flags) \
+ taskq_create(name, nthreads, pri, min, max, flags)
+#define taskq_create_sysdc(name, nthreads, min, max, proc, dc, flags) \
+ taskq_create(name, nthreads, maxclsyspri, min, max, flags)
+
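+/*
+ * Illustrative usage sketch (my_task_func and arg are hypothetical):
+ * create a queue, dispatch work to it, then drain and destroy it.
+ *
+ * taskq_t *tq = taskq_create("my_taskq", 4, defclsyspri, 4, INT_MAX,
+ *     TASKQ_PREPOPULATE);
+ *
+ * if (taskq_dispatch(tq, my_task_func, arg, TQ_SLEEP) == TASKQID_INVALID)
+ *     ...dispatch failed...
+ *
+ * taskq_wait(tq);
+ * taskq_destroy(tq);
+ */
+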
+int spl_taskq_init(void);
+void spl_taskq_fini(void);
+
+#endif /* _SPL_TASKQ_H */
diff --git a/include/spl/sys/thread.h b/include/spl/sys/thread.h
new file mode 100644
index 000000000..3762717da
--- /dev/null
+++ b/include/spl/sys/thread.h
@@ -0,0 +1,69 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <[email protected]>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _SPL_THREAD_H
+#define _SPL_THREAD_H
+
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/spinlock.h>
+#include <linux/kthread.h>
+#include <sys/types.h>
+#include <sys/sysmacros.h>
+#include <sys/tsd.h>
+
+/*
+ * Thread interfaces
+ */
+#define TP_MAGIC 0x53535353
+
+#define TS_SLEEP TASK_INTERRUPTIBLE
+#define TS_RUN TASK_RUNNING
+#define TS_ZOMB EXIT_ZOMBIE
+#define TS_STOPPED TASK_STOPPED
+
+typedef void (*thread_func_t)(void *);
+
+/* BEGIN CSTYLED */
+#define thread_create(stk, stksize, func, arg, len, pp, state, pri) \
+ __thread_create(stk, stksize, (thread_func_t)func, \
+ #func, arg, len, pp, state, pri)
+/* END CSTYLED */
+
+#define thread_exit() __thread_exit()
+#define thread_join(t) VERIFY(0)
+#define curthread current
+#define getcomm() current->comm
+#define getpid() current->pid
+
+extern kthread_t *__thread_create(caddr_t stk, size_t stksize,
+ thread_func_t func, const char *name, void *args, size_t len, proc_t *pp,
+ int state, pri_t pri);
+extern void __thread_exit(void);
+extern struct task_struct *spl_kthread_create(int (*func)(void *),
+ void *data, const char namefmt[], ...);
+
+extern proc_t p0;
+
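+/*
+ * Illustrative usage sketch (my_thread_func and arg are hypothetical):
+ * spawn a kernel thread at the default priority; the thread terminates
+ * itself by calling thread_exit().
+ *
+ * static void
+ * my_thread_func(void *arg)
+ * {
+ *     ...do work...
+ *     thread_exit();
+ * }
+ *
+ * (void) thread_create(NULL, 0, my_thread_func, arg, 0, &p0, TS_RUN,
+ *     defclsyspri);
+ */
+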
+#endif /* _SPL_THREAD_H */
diff --git a/include/spl/sys/time.h b/include/spl/sys/time.h
new file mode 100644
index 000000000..d6aaca913
--- /dev/null
+++ b/include/spl/sys/time.h
@@ -0,0 +1,82 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <[email protected]>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _SPL_TIME_H
+#define _SPL_TIME_H
+
+#include <linux/module.h>
+#include <linux/time.h>
+#include <sys/types.h>
+#include <sys/timer.h>
+
+#if defined(CONFIG_64BIT)
+#define TIME_MAX INT64_MAX
+#define TIME_MIN INT64_MIN
+#else
+#define TIME_MAX INT32_MAX
+#define TIME_MIN INT32_MIN
+#endif
+
+#define SEC 1
+#define MILLISEC 1000
+#define MICROSEC 1000000
+#define NANOSEC 1000000000
+
+#define MSEC2NSEC(m) ((hrtime_t)(m) * (NANOSEC / MILLISEC))
+#define NSEC2MSEC(n) ((n) / (NANOSEC / MILLISEC))
+
+#define USEC2NSEC(m) ((hrtime_t)(m) * (NANOSEC / MICROSEC))
+#define NSEC2USEC(n) ((n) / (NANOSEC / MICROSEC))
+
+#define NSEC2SEC(n) ((n) / (NANOSEC / SEC))
+#define SEC2NSEC(m) ((hrtime_t)(m) * (NANOSEC / SEC))
+
+static const int hz = HZ;
+
+#define TIMESPEC_OVERFLOW(ts) \
+ ((ts)->tv_sec < TIME_MIN || (ts)->tv_sec > TIME_MAX)
+
+static inline void
+gethrestime(timestruc_t *now)
+{
+ *now = current_kernel_time();
+}
+
+static inline time_t
+gethrestime_sec(void)
+{
+ struct timespec ts;
+ ts = current_kernel_time();
+ return (ts.tv_sec);
+}
+
+static inline hrtime_t
+gethrtime(void)
+{
+ struct timespec now;
+ getrawmonotonic(&now);
+ return (((hrtime_t)now.tv_sec * NSEC_PER_SEC) + now.tv_nsec);
+}
+
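+/*
+ * Worked example (illustrative): measuring an elapsed interval with the
+ * helpers above.
+ *
+ * hrtime_t start = gethrtime();
+ * ...do work...
+ * hrtime_t elapsed_ms = NSEC2MSEC(gethrtime() - start);
+ *
+ * For reference, MSEC2NSEC(10) == 10000000 and NSEC2SEC(1500000000) == 1.
+ */
+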
+#endif /* _SPL_TIME_H */
diff --git a/include/spl/sys/timer.h b/include/spl/sys/timer.h
new file mode 100644
index 000000000..a6b134570
--- /dev/null
+++ b/include/spl/sys/timer.h
@@ -0,0 +1,75 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <[email protected]>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _SPL_TIMER_H
+#define _SPL_TIMER_H
+
+#include <linux/module.h>
+#include <linux/delay.h>
+#include <linux/sched.h>
+#include <linux/time.h>
+#include <linux/timer.h>
+
+#define lbolt ((clock_t)jiffies)
+#define lbolt64 ((int64_t)get_jiffies_64())
+
+#define ddi_get_lbolt() ((clock_t)jiffies)
+#define ddi_get_lbolt64() ((int64_t)get_jiffies_64())
+
+#define ddi_time_before(a, b) (typecheck(clock_t, a) && \
+ typecheck(clock_t, b) && \
+ ((a) - (b) < 0))
+#define ddi_time_after(a, b) ddi_time_before(b, a)
+#define ddi_time_before_eq(a, b) (!ddi_time_after(a, b))
+#define ddi_time_after_eq(a, b) ddi_time_before_eq(b, a)
+
+#define ddi_time_before64(a, b) (typecheck(int64_t, a) && \
+ typecheck(int64_t, b) && \
+ ((a) - (b) < 0))
+#define ddi_time_after64(a, b) ddi_time_before64(b, a)
+#define ddi_time_before_eq64(a, b) (!ddi_time_after64(a, b))
+#define ddi_time_after_eq64(a, b) ddi_time_before_eq64(b, a)
+
+#define delay(ticks) schedule_timeout_uninterruptible(ticks)
+
+/* usleep_range() introduced in 2.6.36 */
+#ifndef HAVE_USLEEP_RANGE
+static inline void
+usleep_range(unsigned long min, unsigned long max)
+{
+ unsigned int min_ms = min / USEC_PER_MSEC;
+
+ if (min >= MAX_UDELAY_MS)
+ msleep(min_ms);
+ else
+ udelay(min);
+}
+#endif /* HAVE_USLEEP_RANGE */
+
+#define SEC_TO_TICK(sec) ((sec) * HZ)
+#define MSEC_TO_TICK(ms) msecs_to_jiffies(ms)
+#define USEC_TO_TICK(us) usecs_to_jiffies(us)
+#define NSEC_TO_TICK(ns) usecs_to_jiffies(ns / NSEC_PER_USEC)
+
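+/*
+ * Illustrative usage sketch: block the current thread for roughly one
+ * second, or test a relative deadline expressed in ticks.
+ *
+ * delay(SEC_TO_TICK(1));
+ *
+ * clock_t deadline = ddi_get_lbolt() + MSEC_TO_TICK(500);
+ * if (ddi_time_after(ddi_get_lbolt(), deadline))
+ *     ...the deadline has expired...
+ */
+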
+#endif /* _SPL_TIMER_H */
diff --git a/include/spl/sys/tsd.h b/include/spl/sys/tsd.h
new file mode 100644
index 000000000..39a291bf3
--- /dev/null
+++ b/include/spl/sys/tsd.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright (C) 2010 Lawrence Livermore National Security, LLC.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <[email protected]>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _SPL_TSD_H
+#define _SPL_TSD_H
+
+#include <sys/types.h>
+
+#define TSD_HASH_TABLE_BITS_DEFAULT 9
+#define TSD_KEYS_MAX 32768
+#define DTOR_PID (PID_MAX_LIMIT+1)
+#define PID_KEY (TSD_KEYS_MAX+1)
+
+typedef void (*dtor_func_t)(void *);
+
+extern int tsd_set(uint_t, void *);
+extern void *tsd_get(uint_t);
+extern void *tsd_get_by_thread(uint_t, kthread_t *);
+extern void tsd_create(uint_t *, dtor_func_t);
+extern void tsd_destroy(uint_t *);
+extern void tsd_exit(void);
+
+int spl_tsd_init(void);
+void spl_tsd_fini(void);
+
+#endif /* _SPL_TSD_H */
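
A brief untested sketch of how the thread-specific data interface declared
above might be used; the key, destructor, and integer payload are
hypothetical, and kmem_zalloc()/kmem_free() from <sys/kmem.h> are assumed:

static uint_t example_tsd_key;

static void
example_tsd_dtor(void *value)
{
	kmem_free(value, sizeof (int));
}

static int
example_tsd_usage(void)
{
	int *value;

	/* Register a key and destructor, typically once at init time. */
	tsd_create(&example_tsd_key, example_tsd_dtor);

	/* Bind a value to the key for the current thread. */
	value = kmem_zalloc(sizeof (int), KM_SLEEP);
	if (tsd_set(example_tsd_key, value) != 0) {
		kmem_free(value, sizeof (int));
		return (-1);
	}

	/* Later, code running in this thread can look the value up again. */
	value = tsd_get(example_tsd_key);

	/* tsd_destroy(&example_tsd_key) would be called at fini time. */
	return (0);
}
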
diff --git a/include/spl/sys/types.h b/include/spl/sys/types.h
new file mode 100644
index 000000000..a5b478127
--- /dev/null
+++ b/include/spl/sys/types.h
@@ -0,0 +1,70 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <[email protected]>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _SPL_TYPES_H
+#define _SPL_TYPES_H
+
+#include <linux/types.h>
+
+#ifndef ULLONG_MAX
+#define ULLONG_MAX (~0ULL)
+#endif
+
+#ifndef LLONG_MAX
+#define LLONG_MAX ((long long)(~0ULL>>1))
+#endif
+
+typedef enum {
+ B_FALSE = 0,
+ B_TRUE = 1
+} boolean_t;
+
+typedef unsigned char uchar_t;
+typedef unsigned short ushort_t;
+typedef unsigned int uint_t;
+typedef unsigned long ulong_t;
+typedef unsigned long long u_longlong_t;
+typedef long long longlong_t;
+
+typedef unsigned long intptr_t;
+typedef unsigned long long rlim64_t;
+
+typedef struct task_struct kthread_t;
+typedef struct task_struct proc_t;
+
+typedef struct timespec timestruc_t;
+typedef struct timespec timespec_t;
+typedef longlong_t hrtime_t;
+
+typedef int id_t;
+typedef short pri_t;
+typedef short index_t;
+typedef longlong_t offset_t;
+typedef u_longlong_t u_offset_t;
+typedef ulong_t pgcnt_t;
+
+typedef int major_t;
+typedef int minor_t;
+
+#endif /* _SPL_TYPES_H */
diff --git a/include/spl/sys/types32.h b/include/spl/sys/types32.h
new file mode 100644
index 000000000..c60ba8c97
--- /dev/null
+++ b/include/spl/sys/types32.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <[email protected]>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _SPL_TYPES32_H
+#define _SPL_TYPES32_H
+
+#include <sys/types.h>
+
+typedef uint32_t caddr32_t;
+typedef int32_t daddr32_t;
+typedef int32_t time32_t;
+typedef uint32_t size32_t;
+
+#endif /* _SPL_TYPES32_H */
diff --git a/include/spl/sys/uio.h b/include/spl/sys/uio.h
new file mode 100644
index 000000000..64c452b8d
--- /dev/null
+++ b/include/spl/sys/uio.h
@@ -0,0 +1,106 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Copyright (c) 2015 by Chunwei Chen. All rights reserved.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <[email protected]>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _SPL_UIO_H
+#define _SPL_UIO_H
+
+#include <linux/uio.h>
+#include <linux/blkdev.h>
+#include <asm/uaccess.h>
+#include <sys/types.h>
+
+typedef struct iovec iovec_t;
+
+typedef enum uio_rw {
+ UIO_READ = 0,
+ UIO_WRITE = 1,
+} uio_rw_t;
+
+typedef enum uio_seg {
+ UIO_USERSPACE = 0,
+ UIO_SYSSPACE = 1,
+ UIO_USERISPACE = 2,
+ UIO_BVEC = 3,
+} uio_seg_t;
+
+typedef struct uio {
+ union {
+ const struct iovec *uio_iov;
+ const struct bio_vec *uio_bvec;
+ };
+ int uio_iovcnt;
+ offset_t uio_loffset;
+ uio_seg_t uio_segflg;
+ uint16_t uio_fmode;
+ uint16_t uio_extflg;
+ offset_t uio_limit;
+ ssize_t uio_resid;
+ size_t uio_skip;
+} uio_t;
+
+typedef struct aio_req {
+ uio_t *aio_uio;
+ void *aio_private;
+} aio_req_t;
+
+typedef enum xuio_type {
+ UIOTYPE_ASYNCIO,
+ UIOTYPE_ZEROCOPY,
+} xuio_type_t;
+
+
+#define UIOA_IOV_MAX 16
+
+typedef struct uioa_page_s {
+ int uioa_pfncnt;
+ void **uioa_ppp;
+ caddr_t uioa_base;
+ size_t uioa_len;
+} uioa_page_t;
+
+typedef struct xuio {
+ uio_t xu_uio;
+ enum xuio_type xu_type;
+ union {
+ struct {
+ uint32_t xu_a_state;
+ ssize_t xu_a_mbytes;
+ uioa_page_t *xu_a_lcur;
+ void **xu_a_lppp;
+ void *xu_a_hwst[4];
+ uioa_page_t xu_a_locked[UIOA_IOV_MAX];
+ } xu_aio;
+
+ struct {
+ int xu_zc_rw;
+ void *xu_zc_priv;
+ } xu_zc;
+ } xu_ext;
+} xuio_t;
+
+#define XUIO_XUZC_PRIV(xuio) xuio->xu_ext.xu_zc.xu_zc_priv
+#define XUIO_XUZC_RW(xuio) xuio->xu_ext.xu_zc.xu_zc_rw
+
+#endif /* SPL_UIO_H */
diff --git a/include/spl/sys/user.h b/include/spl/sys/user.h
new file mode 100644
index 000000000..b12cb240e
--- /dev/null
+++ b/include/spl/sys/user.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (C) 2015 Cluster Inc.
+ * Produced at ClusterHQ Inc (cf, DISCLAIMER).
+ * Written by Richard Yao <[email protected]>.
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _SPL_USER_H
+#define _SPL_USER_H
+
+/*
+ * We have uf_info_t for areleasef(). We implement areleasef() using a global
+ * linked list of all open file descriptors with the task structs referenced,
+ * so accessing the correct descriptor from areleasef() only requires knowing
+ * about the Linux task_struct. Since this is internal to our compatibility
+ * layer, we make it an opaque type.
+ *
+ * XXX: If the descriptor changes under us and we do not do a getf() between
+ * the change and using it, we would get an incorrect reference.
+ */
+
+struct uf_info;
+typedef struct uf_info uf_info_t;
+
+#define P_FINFO(x) ((uf_info_t *)x)
+
+#endif /* SPL_USER_H */
diff --git a/include/spl/sys/vfs.h b/include/spl/sys/vfs.h
new file mode 100644
index 000000000..0d5e1d51d
--- /dev/null
+++ b/include/spl/sys/vfs.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <[email protected]>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _SPL_ZFS_H
+#define _SPL_ZFS_H
+
+#include <linux/mount.h>
+#include <linux/fs.h>
+#include <linux/dcache.h>
+#include <linux/statfs.h>
+#include <linux/xattr.h>
+#include <linux/security.h>
+#include <linux/seq_file.h>
+
+#define MAXFIDSZ 64
+
+typedef struct spl_fid {
+ union {
+ long fid_pad;
+ struct {
+ ushort_t len; /* length of data in bytes */
+ char data[MAXFIDSZ]; /* data (variable len) */
+ } _fid;
+ } un;
+} fid_t;
+
+#define fid_len un._fid.len
+#define fid_data un._fid.data
+
+#endif /* SPL_ZFS_H */
diff --git a/include/spl/sys/vmem.h b/include/spl/sys/vmem.h
new file mode 100644
index 000000000..a9b12eeb9
--- /dev/null
+++ b/include/spl/sys/vmem.h
@@ -0,0 +1,109 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <[email protected]>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _SPL_VMEM_H
+#define _SPL_VMEM_H
+
+#include <sys/kmem.h>
+#include <linux/sched.h>
+#include <linux/vmalloc.h>
+
+typedef struct vmem { } vmem_t;
+
+extern vmem_t *heap_arena;
+extern vmem_t *zio_alloc_arena;
+extern vmem_t *zio_arena;
+
+extern size_t vmem_size(vmem_t *vmp, int typemask);
+
+/*
+ * Memory allocation interfaces
+ */
+#define VMEM_ALLOC 0x01
+#define VMEM_FREE 0x02
+
+#ifndef VMALLOC_TOTAL
+#define VMALLOC_TOTAL (VMALLOC_END - VMALLOC_START)
+#endif
+
+/*
+ * vmem_* is an interface to a low level arena-based memory allocator on
+ * Illumos that is used to allocate virtual address space. The kmem SLAB
+ * allocator allocates slabs from it. Then the generic allocation functions
+ * kmem_{alloc,zalloc,free}() are layered on top of SLAB allocators.
+ *
+ * On Linux, the primary means of doing allocations is via kmalloc(), which
+ * is similarly layered on top of something called the buddy allocator. The
+ * buddy allocator is not available to kernel modules; it uses physical
+ * memory addresses rather than virtual memory addresses and is prone to
+ * fragmentation.
+ *
+ * Linux sets aside a relatively small address space for in-kernel virtual
+ * memory from which allocations can be done using vmalloc(). It might seem
+ * like a good idea to use vmalloc() to implement something similar to
+ * Illumos' allocator. However, this has the following problems:
+ *
+ * 1. Page directory table allocations are hard coded to use GFP_KERNEL.
+ * Consequently, any KM_PUSHPAGE or KM_NOSLEEP allocations done using
+ * vmalloc() will not have proper semantics.
+ *
+ * 2. Address space exhaustion is a real issue on 32-bit platforms where
+ * only a few 100MB are available. The kernel will handle it by spinning
+ * when it runs out of address space.
+ *
+ * 3. All vmalloc() allocations and frees are protected by a single global
+ * lock which serializes all allocations.
+ *
+ * 4. Accessing /proc/meminfo and /proc/vmallocinfo will iterate the entire
+ * list. The former will sum the allocations while the latter will print
+ * them to user space in a way that user space can keep the lock held
+ * indefinitely. When the total number of mapped allocations is large
+ * (several 100,000) a large amount of time will be spent waiting on locks.
+ *
+ * 5. Linux has a wait_on_bit() locking primitive that assumes physical
+ *    memory is used; it simply does not work on virtual memory.  Certain
+ * Linux structures (e.g. the superblock) use them and might be embedded
+ * into a structure from Illumos. This makes using Linux virtual memory
+ * unsafe in certain situations.
+ *
+ * It follows that we cannot obtain identical semantics to those on Illumos.
+ * Consequently, we implement the kmem_{alloc,zalloc,free}() functions in
+ * such a way that they can be used as drop-in replacements for small vmem_*
+ * allocations (8MB in size or smaller) and map vmem_{alloc,zalloc,free}()
+ * to them.
+ */
+
+#define vmem_alloc(sz, fl) spl_vmem_alloc((sz), (fl), __func__, __LINE__)
+#define vmem_zalloc(sz, fl) spl_vmem_zalloc((sz), (fl), __func__, __LINE__)
+#define vmem_free(ptr, sz) spl_vmem_free((ptr), (sz))
+#define vmem_qcache_reap(ptr) ((void)0)
+
+extern void *spl_vmem_alloc(size_t sz, int fl, const char *func, int line);
+extern void *spl_vmem_zalloc(size_t sz, int fl, const char *func, int line);
+extern void spl_vmem_free(const void *ptr, size_t sz);
+
+int spl_vmem_init(void);
+void spl_vmem_fini(void);
+
+#endif /* _SPL_VMEM_H */
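
A short sketch of the vmem_* wrappers above in use for a large virtually
contiguous buffer; the 4MB size is an arbitrary example and the snippet is
untested:

static void
example_vmem_usage(void)
{
	const size_t size = 4 * 1024 * 1024;	/* well under the 8MB guidance */
	void *buf;

	/* KM_SLEEP comes from <sys/kmem.h>, which this header includes. */
	buf = vmem_zalloc(size, KM_SLEEP);
	if (buf == NULL)
		return;

	/* ... use buf as ordinary virtually contiguous kernel memory ... */

	/* The original size must be passed back when freeing. */
	vmem_free(buf, size);
}
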
diff --git a/include/spl/sys/vmsystm.h b/include/spl/sys/vmsystm.h
new file mode 100644
index 000000000..2b48fe0e3
--- /dev/null
+++ b/include/spl/sys/vmsystm.h
@@ -0,0 +1,84 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <[email protected]>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _SPL_VMSYSTM_H
+#define _SPL_VMSYSTM_H
+
+#include <linux/mmzone.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/highmem.h>
+#include <linux/vmalloc.h>
+#include <sys/types.h>
+#include <asm/uaccess.h>
+
+#define membar_producer() smp_wmb()
+#define physmem totalram_pages
+#define freemem (nr_free_pages() + \
+ global_page_state(NR_INACTIVE_FILE) + \
+ global_page_state(NR_INACTIVE_ANON) + \
+ global_page_state(NR_SLAB_RECLAIMABLE))
+
+#define xcopyin(from, to, size) copy_from_user(to, from, size)
+#define xcopyout(from, to, size) copy_to_user(to, from, size)
+
+static __inline__ int
+copyin(const void *from, void *to, size_t len)
+{
+ /* On error copyin routine returns -1 */
+ if (xcopyin(from, to, len))
+ return (-1);
+
+ return (0);
+}
+
+static __inline__ int
+copyout(const void *from, void *to, size_t len)
+{
+ /* On error copyout routine returns -1 */
+ if (xcopyout(from, to, len))
+ return (-1);
+
+ return (0);
+}
+
+static __inline__ int
+copyinstr(const void *from, void *to, size_t len, size_t *done)
+{
+ size_t rc;
+
+ if (len == 0)
+ return (-ENAMETOOLONG);
+
+ /* XXX: Should return ENAMETOOLONG if 'strlen(from) > len' */
+
+ memset(to, 0, len);
+ rc = copyin(from, to, len - 1);
+ if (done != NULL)
+ *done = rc;
+
+ return (0);
+}
+
+#endif /* SPL_VMSYSTM_H */
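
A small hypothetical example of copyin()/copyout() moving a structure across
the user/kernel boundary; the structure and ioctl-style entry point are
invented for illustration:

struct example_args {
	uint64_t value;
};

static int
example_copy_args(void __user *uarg)
{
	struct example_args args;

	/* copyin()/copyout() return -1 on fault and 0 on success. */
	if (copyin(uarg, &args, sizeof (args)) != 0)
		return (-EFAULT);

	args.value++;

	if (copyout(&args, uarg, sizeof (args)) != 0)
		return (-EFAULT);

	return (0);
}
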
diff --git a/include/spl/sys/vnode.h b/include/spl/sys/vnode.h
new file mode 100644
index 000000000..a3f7828e7
--- /dev/null
+++ b/include/spl/sys/vnode.h
@@ -0,0 +1,204 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <[email protected]>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _SPL_VNODE_H
+#define _SPL_VNODE_H
+
+#include <linux/module.h>
+#include <linux/syscalls.h>
+#include <linux/fcntl.h>
+#include <linux/buffer_head.h>
+#include <linux/dcache.h>
+#include <linux/namei.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/fs_struct.h>
+#include <linux/mount.h>
+#include <sys/kmem.h>
+#include <sys/mutex.h>
+#include <sys/types.h>
+#include <sys/time.h>
+#include <sys/uio.h>
+#include <sys/user.h>
+
+/*
+ * Prior to linux-2.6.33 only O_DSYNC semantics were implemented and
+ * they used the O_SYNC flag.  As of linux-2.6.33 this behavior
+ * was properly split into O_SYNC and O_DSYNC respectively.
+ */
+#ifndef O_DSYNC
+#define O_DSYNC O_SYNC
+#endif
+
+#define FREAD 1
+#define FWRITE 2
+#define FCREAT O_CREAT
+#define FTRUNC O_TRUNC
+#define FOFFMAX O_LARGEFILE
+#define FSYNC O_SYNC
+#define FDSYNC O_DSYNC
+#define FRSYNC O_SYNC
+#define FEXCL O_EXCL
+#define FDIRECT O_DIRECT
+#define FAPPEND O_APPEND
+
+#define FNODSYNC 0x10000 /* fsync pseudo flag */
+#define FNOFOLLOW 0x20000 /* don't follow symlinks */
+
+#define F_FREESP 11 /* Free file space */
+
+
+/*
+ * The vnode AT_ flags are mapped to the Linux ATTR_* flags.
+ * This allows them to be used safely with an iattr structure.
+ * The AT_XVATTR flag has been added and mapped to the upper
+ * bit range to avoid conflicting with the standard Linux set.
+ */
+#undef AT_UID
+#undef AT_GID
+
+#define AT_MODE ATTR_MODE
+#define AT_UID ATTR_UID
+#define AT_GID ATTR_GID
+#define AT_SIZE ATTR_SIZE
+#define AT_ATIME ATTR_ATIME
+#define AT_MTIME ATTR_MTIME
+#define AT_CTIME ATTR_CTIME
+
+#define ATTR_XVATTR (1 << 31)
+#define AT_XVATTR ATTR_XVATTR
+
+#define ATTR_IATTR_MASK (ATTR_MODE | ATTR_UID | ATTR_GID | ATTR_SIZE | \
+ ATTR_ATIME | ATTR_MTIME | ATTR_CTIME | ATTR_FILE)
+
+#define CRCREAT 0x01
+#define RMFILE 0x02
+
+#define B_INVAL 0x01
+#define B_TRUNC 0x02
+
+#define LOOKUP_DIR 0x01
+#define LOOKUP_XATTR 0x02
+#define CREATE_XATTR_DIR 0x04
+#define ATTR_NOACLCHECK 0x20
+
+typedef enum vtype {
+ VNON = 0,
+ VREG = 1,
+ VDIR = 2,
+ VBLK = 3,
+ VCHR = 4,
+ VLNK = 5,
+ VFIFO = 6,
+ VDOOR = 7,
+ VPROC = 8,
+ VSOCK = 9,
+ VPORT = 10,
+ VBAD = 11
+} vtype_t;
+
+typedef struct vattr {
+ enum vtype va_type; /* vnode type */
+ uint_t va_mask; /* attribute bit-mask */
+ ushort_t va_mode; /* acc mode */
+ uid_t va_uid; /* owner uid */
+ gid_t va_gid; /* owner gid */
+ long va_fsid; /* fs id */
+ long va_nodeid; /* node # */
+ uint32_t va_nlink; /* # links */
+ uint64_t va_size; /* file size */
+ struct timespec va_atime; /* last acc */
+ struct timespec va_mtime; /* last mod */
+ struct timespec va_ctime; /* last chg */
+ dev_t va_rdev; /* dev */
+ uint64_t va_nblocks; /* space used */
+ uint32_t va_blksize; /* block size */
+ uint32_t va_seq; /* sequence */
+ struct dentry *va_dentry; /* dentry to wire */
+} vattr_t;
+
+typedef struct vnode {
+ struct file *v_file;
+ kmutex_t v_lock; /* protects vnode fields */
+ uint_t v_flag; /* vnode flags (see below) */
+ uint_t v_count; /* reference count */
+ void *v_data; /* private data for fs */
+ struct vfs *v_vfsp; /* ptr to containing VFS */
+ struct stdata *v_stream; /* associated stream */
+ enum vtype v_type; /* vnode type */
+ dev_t v_rdev; /* device (VCHR, VBLK) */
+ gfp_t v_gfp_mask; /* original mapping gfp mask */
+} vnode_t;
+
+typedef struct vn_file {
+ int f_fd; /* linux fd for lookup */
+ struct task_struct *f_task; /* linux task this fd belongs to */
+ struct file *f_file; /* linux file struct */
+ atomic_t f_ref; /* ref count */
+ kmutex_t f_lock; /* struct lock */
+ loff_t f_offset; /* offset */
+ vnode_t *f_vnode; /* vnode */
+ struct list_head f_list; /* list referenced file_t's */
+} file_t;
+
+extern vnode_t *vn_alloc(int flag);
+void vn_free(vnode_t *vp);
+extern vtype_t vn_mode_to_vtype(mode_t);
+extern mode_t vn_vtype_to_mode(vtype_t);
+extern int vn_open(const char *path, uio_seg_t seg, int flags, int mode,
+ vnode_t **vpp, int x1, void *x2);
+extern int vn_openat(const char *path, uio_seg_t seg, int flags, int mode,
+ vnode_t **vpp, int x1, void *x2, vnode_t *vp, int fd);
+extern int vn_rdwr(uio_rw_t uio, vnode_t *vp, void *addr, ssize_t len,
+ offset_t off, uio_seg_t seg, int x1, rlim64_t x2,
+ void *x3, ssize_t *residp);
+extern int vn_close(vnode_t *vp, int flags, int x1, int x2, void *x3, void *x4);
+extern int vn_seek(vnode_t *vp, offset_t o, offset_t *op, void *ct);
+
+extern int vn_getattr(vnode_t *vp, vattr_t *vap, int flags, void *x3, void *x4);
+extern int vn_fsync(vnode_t *vp, int flags, void *x3, void *x4);
+extern int vn_space(vnode_t *vp, int cmd, struct flock *bfp, int flag,
+ offset_t offset, void *x6, void *x7);
+extern file_t *vn_getf(int fd);
+extern void vn_releasef(int fd);
+extern void vn_areleasef(int fd, uf_info_t *fip);
+extern int vn_set_pwd(const char *filename);
+
+int spl_vn_init(void);
+void spl_vn_fini(void);
+
+#define VOP_CLOSE vn_close
+#define VOP_SEEK vn_seek
+#define VOP_GETATTR vn_getattr
+#define VOP_FSYNC vn_fsync
+#define VOP_SPACE vn_space
+#define VOP_PUTPAGE(vp, o, s, f, x1, x2) ((void)0)
+#define vn_is_readonly(vp) 0
+#define getf vn_getf
+#define releasef vn_releasef
+#define areleasef vn_areleasef
+
+extern vnode_t *rootdir;
+
+#endif /* SPL_VNODE_H */
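
A minimal untested sketch of reading a file through the vn_* interface
declared above; the unused x1/x2/x3 placeholder arguments are passed as zero:

static int
example_read_file(const char *path, char *buf, ssize_t len)
{
	vnode_t *vp = NULL;
	ssize_t resid = 0;
	int rc;

	rc = vn_open(path, UIO_SYSSPACE, FREAD, 0, &vp, 0, NULL);
	if (rc != 0)
		return (rc);

	/* Read up to len bytes from offset 0 into a kernel buffer. */
	rc = vn_rdwr(UIO_READ, vp, buf, len, 0, UIO_SYSSPACE, 0, 0,
	    NULL, &resid);

	(void) vn_close(vp, FREAD, 0, 0, NULL, NULL);

	/* On success, len - resid bytes were actually read. */
	return (rc);
}
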
diff --git a/include/spl/sys/wait.h b/include/spl/sys/wait.h
new file mode 100644
index 000000000..5311ff8b9
--- /dev/null
+++ b/include/spl/sys/wait.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright (C) 2007-2014 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <[email protected]>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _SPL_WAIT_H
+#define _SPL_WAIT_H
+
+#include <linux/sched.h>
+#include <linux/wait.h>
+
+#ifndef HAVE_WAIT_ON_BIT_ACTION
+#define spl_wait_on_bit(word, bit, mode) wait_on_bit(word, bit, mode)
+#else
+
+static inline int
+spl_bit_wait(void *word)
+{
+ schedule();
+ return (0);
+}
+
+#define spl_wait_on_bit(word, bit, mode) \
+ wait_on_bit(word, bit, spl_bit_wait, mode)
+
+#endif /* HAVE_WAIT_ON_BIT_ACTION */
+
+#ifdef HAVE_WAIT_QUEUE_ENTRY_T
+typedef wait_queue_head_t spl_wait_queue_head_t;
+typedef wait_queue_entry_t spl_wait_queue_entry_t;
+#else
+typedef wait_queue_head_t spl_wait_queue_head_t;
+typedef wait_queue_t spl_wait_queue_entry_t;
+#endif
+
+#endif /* SPL_WAIT_H */
diff --git a/include/spl/sys/zmod.h b/include/spl/sys/zmod.h
new file mode 100644
index 000000000..95c1a3ed7
--- /dev/null
+++ b/include/spl/sys/zmod.h
@@ -0,0 +1,78 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <[email protected]>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ *
+ *
+ * z_compress_level/z_uncompress are nearly identical copies of the
+ * compress2/uncompress functions provided by the official zlib package
+ * available at http://zlib.net/.  The only changes made were to slightly
+ * adapt the functions called to match the linux kernel implementation
+ * of zlib. The full zlib license follows:
+ *
+ * zlib.h -- interface of the 'zlib' general purpose compression library
+ * version 1.2.5, April 19th, 2010
+ *
+ * Copyright (C) 1995-2010 Jean-loup Gailly and Mark Adler
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ *
+ * Jean-loup Gailly
+ * Mark Adler
+ */
+
+#ifndef _SPL_ZMOD_H
+#define _SPL_ZMOD_H
+
+#include <sys/types.h>
+#include <linux/zlib.h>
+
+#ifdef HAVE_2ARGS_ZLIB_DEFLATE_WORKSPACESIZE
+#define spl_zlib_deflate_workspacesize(wb, ml) \
+ zlib_deflate_workspacesize(wb, ml)
+#else
+#define spl_zlib_deflate_workspacesize(wb, ml) \
+ zlib_deflate_workspacesize()
+#endif /* HAVE_2ARGS_ZLIB_DEFLATE_WORKSPACESIZE */
+
+extern int z_compress_level(void *dest, size_t *destLen, const void *source,
+ size_t sourceLen, int level);
+extern int z_uncompress(void *dest, size_t *destLen, const void *source,
+ size_t sourceLen);
+
+int spl_zlib_init(void);
+void spl_zlib_fini(void);
+
+#endif /* SPL_ZMOD_H */
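
A short untested sketch of a compress/decompress round trip with the
wrappers above; buffer management is left to the caller and the compression
level of 6 is only an example:

static int
example_zlib_roundtrip(void *src, size_t srclen, void *dst, size_t dstlen)
{
	size_t clen = dstlen;	/* in: capacity of dst, out: compressed size */
	size_t dlen = srclen;	/* in: capacity of src, out: uncompressed size */
	int rc;

	rc = z_compress_level(dst, &clen, src, srclen, 6);
	if (rc != Z_OK)
		return (rc);

	/* Decompress from dst back into src and expect the original length. */
	rc = z_uncompress(src, &dlen, dst, clen);
	if (rc != Z_OK)
		return (rc);

	return (dlen == srclen ? 0 : -1);
}
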
diff --git a/include/spl/sys/zone.h b/include/spl/sys/zone.h
new file mode 100644
index 000000000..b2efd13b8
--- /dev/null
+++ b/include/spl/sys/zone.h
@@ -0,0 +1,36 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <[email protected]>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _SPL_ZONE_H
+#define _SPL_ZONE_H
+
+#include <sys/byteorder.h>
+
+#define GLOBAL_ZONEID 0
+
+#define zone_dataset_visible(x, y) (1)
+#define crgetzoneid(x) (GLOBAL_ZONEID)
+#define INGLOBALZONE(z) (1)
+
+#endif /* SPL_ZONE_H */
diff --git a/man/man5/spl-module-parameters.5 b/man/man5/spl-module-parameters.5
new file mode 100644
index 000000000..30d9fc754
--- /dev/null
+++ b/man/man5/spl-module-parameters.5
@@ -0,0 +1,357 @@
+'\" te
+.\"
+.\" Copyright 2013 Turbo Fredriksson <[email protected]>. All rights reserved.
+.\"
+.TH SPL-MODULE-PARAMETERS 5 "Oct 28, 2017"
+.SH NAME
+spl\-module\-parameters \- SPL module parameters
+.SH DESCRIPTION
+.sp
+.LP
+Description of the different parameters to the SPL module.
+
+.SS "Module parameters"
+.sp
+.LP
+
+.sp
+.ne 2
+.na
+\fBspl_kmem_cache_expire\fR (uint)
+.ad
+.RS 12n
+Cache expiration is part of default Illumos cache behavior. The idea is
+that objects in magazines which have not been recently accessed should be
+returned to the slabs periodically.  This is known as cache aging and,
+when enabled, objects will typically be returned after 15 seconds.
+.sp
+On the other hand Linux slabs are designed to never move objects back to
+the slabs unless there is memory pressure. This is possible because under
+Linux the cache will be notified when memory is low and objects can be
+released.
+.sp
+By default only the Linux method is enabled. It has been shown to improve
+responsiveness on low memory systems and not negatively impact the performance
+of systems with more memory. This policy may be changed by setting the
+\fBspl_kmem_cache_expire\fR bit mask as follows; both policies may be enabled
+concurrently.
+.sp
+0x01 - Aging (Illumos), 0x02 - Low memory (Linux)
+.sp
+Default value: \fB0x02\fR
+.RE
+
+.sp
+.ne 2
+.na
+\fBspl_kmem_cache_kmem_threads\fR (uint)
+.ad
+.RS 12n
+The number of threads created for the spl_kmem_cache task queue. This task
+queue is responsible for allocating new slabs for use by the kmem caches.
+For the majority of systems and workloads only a small number of threads are
+required.
+.sp
+Default value: \fB4\fR
+.RE
+
+.sp
+.ne 2
+.na
+\fBspl_kmem_cache_reclaim\fR (uint)
+.ad
+.RS 12n
+When this is set it prevents Linux from being able to rapidly reclaim all the
+memory held by the kmem caches. This may be useful in circumstances where
+it's preferable that Linux reclaim memory from some other subsystem first.
+Setting this will increase the likelihood of out of memory events on a
+memory constrained system.
+.sp
+Default value: \fB0\fR
+.RE
+
+.sp
+.ne 2
+.na
+\fBspl_kmem_cache_obj_per_slab\fR (uint)
+.ad
+.RS 12n
+The preferred number of objects per slab in the cache. In general, a larger
+value will increase the caches memory footprint while decreasing the time
+required to perform an allocation. Conversely, a smaller value will minimize
+the footprint and improve cache reclaim time but individual allocations may
+take longer.
+.sp
+Default value: \fB8\fR
+.RE
+
+.sp
+.ne 2
+.na
+\fBspl_kmem_cache_obj_per_slab_min\fR (uint)
+.ad
+.RS 12n
+The minimum number of objects allowed per slab. Normally slabs will contain
+\fBspl_kmem_cache_obj_per_slab\fR objects but for caches that contain very
+large objects it's desirable to only have a few, or even just one, object per
+slab.
+.sp
+Default value: \fB1\fR
+.RE
+
+.sp
+.ne 2
+.na
+\fBspl_kmem_cache_max_size\fR (uint)
+.ad
+.RS 12n
+The maximum size of a kmem cache slab in MiB. This effectively limits
+the maximum cache object size to \fBspl_kmem_cache_max_size\fR /
+\fBspl_kmem_cache_obj_per_slab\fR. Caches may not be created with
+objects sized larger than this limit.
+.sp
+Default value: \fB32 (64-bit) or 4 (32-bit)\fR
+.RE
+
+.sp
+.ne 2
+.na
+\fBspl_kmem_cache_slab_limit\fR (uint)
+.ad
+.RS 12n
+For small objects the Linux slab allocator should be used to make the most
+efficient use of the memory. However, large objects are not supported by
+the Linux slab and therefore the SPL implementation is preferred. This
+value is used to determine the cutoff between a small and large object.
+.sp
+Objects of \fBspl_kmem_cache_slab_limit\fR or smaller will be allocated
+using the Linux slab allocator; larger objects use the SPL allocator.  A
+cutoff of 16K was determined to be optimal for architectures using 4K pages.
+.sp
+Default value: \fB16,384\fR
+.RE
+
+.sp
+.ne 2
+.na
+\fBspl_kmem_cache_kmem_limit\fR (uint)
+.ad
+.RS 12n
+Depending on the size of a cache object it may be backed by kmalloc()'d
+or vmalloc()'d memory. This is because the size of the required allocation
+greatly impacts the best way to allocate the memory.
+.sp
+When objects are small and only a small number of memory pages need to be
+allocated, ideally just one, then kmalloc() is very efficient. However,
+when allocating multiple pages with kmalloc() it gets increasingly expensive
+because the pages must be physically contiguous.
+.sp
+For this reason we shift to vmalloc() for slabs of large objects, which
+removes the need for contiguous pages.
+all cases because there is significant locking overhead involved. This
+function takes a single global lock over the entire virtual address range
+which serializes all allocations. Using slightly different allocation
+functions for small and large objects allows us to handle a wide range of
+object sizes.
+.sp
+The \fBspl_kmem_cache_kmem_limit\fR value is used to determine this cutoff
+size.  One quarter of the PAGE_SIZE is used as the default value because
+\fBspl_kmem_cache_obj_per_slab\fR defaults to 16. This means that at
+most we will need to allocate four contiguous pages.
+.sp
+Default value: \fBPAGE_SIZE/4\fR
+.RE
+
+.sp
+.ne 2
+.na
+\fBspl_kmem_alloc_warn\fR (uint)
+.ad
+.RS 12n
+As a general rule kmem_alloc() allocations should be small, preferably
+just a few pages since they must be physically contiguous.  Therefore, a
+rate limited warning will be printed to the console for any kmem_alloc()
+which exceeds a reasonable threshold.
+.sp
+The default warning threshold is set to eight pages but capped at 32K to
+accommodate systems using large pages. This value was selected to be small
+enough to ensure the largest allocations are quickly noticed and fixed,
+but large enough to avoid logging any warnings when an allocation size is
+larger than optimal but not a serious concern.  Since this value is tunable,
+developers are encouraged to set it lower when testing so any new largish
+allocations are quickly caught. These warnings may be disabled by setting
+the threshold to zero.
+.sp
+Default value: \fB32,768\fR
+.RE
+
+.sp
+.ne 2
+.na
+\fBspl_kmem_alloc_max\fR (uint)
+.ad
+.RS 12n
+Large kmem_alloc() allocations will fail if they exceed KMALLOC_MAX_SIZE.
+Allocations which are marginally smaller than this limit may succeed but
+should still be avoided due to the expense of locating a contiguous range
+of free pages. Therefore, a maximum kmem size with reasonable safely
+margin of 4x is set. Kmem_alloc() allocations larger than this maximum
+will quickly fail. Vmem_alloc() allocations less than or equal to this
+value will use kmalloc(), but shift to vmalloc() when exceeding this value.
+.sp
+Default value: \fBKMALLOC_MAX_SIZE/4\fR
+.RE
+
+.sp
+.ne 2
+.na
+\fBspl_kmem_cache_magazine_size\fR (uint)
+.ad
+.RS 12n
+Cache magazines are an optimization designed to minimize the cost of
+allocating memory. They do this by keeping a per-cpu cache of recently
+freed objects, which can then be reallocated without taking a lock. This
+can improve performance on highly contended caches. However, because
+objects in magazines will prevent otherwise empty slabs from being
+immediately released, this may not be ideal for low memory machines.
+.sp
+For this reason \fBspl_kmem_cache_magazine_size\fR can be used to set a
+maximum magazine size. When this value is set to 0 the magazine size will
+be automatically determined based on the object size. Otherwise magazines
+will be limited to 2-256 objects per magazine (i.e. per CPU).  Magazines
+may never be entirely disabled in this implementation.
+.sp
+Default value: \fB0\fR
+.RE
+
+.sp
+.ne 2
+.na
+\fBspl_hostid\fR (ulong)
+.ad
+.RS 12n
+The system hostid.  When set, this can be used to uniquely identify a system.
+By default this value is set to zero which indicates the hostid is disabled.
+It can be explicitly enabled by placing a unique non-zero value in
+\fB/etc/hostid\fR.
+.sp
+Default value: \fB0\fR
+.RE
+
+.sp
+.ne 2
+.na
+\fBspl_hostid_path\fR (charp)
+.ad
+.RS 12n
+The expected path to locate the system hostid when specified. This value
+may be overridden for non-standard configurations.
+.sp
+Default value: \fB/etc/hostid\fR
+.RE
+
+.sp
+.ne 2
+.na
+\fBspl_panic_halt\fR (uint)
+.ad
+.RS 12n
+Cause a kernel panic on assertion failures. When not enabled, the thread is
+halted to facilitate further debugging.
+.sp
+Set to a non-zero value to enable.
+.sp
+Default value: \fB0\fR
+.RE
+
+.sp
+.ne 2
+.na
+\fBspl_taskq_kick\fR (uint)
+.ad
+.RS 12n
+Kick stuck taskqs to spawn threads.  When a non-zero value is written, all
+taskqs are scanned and any with a pending task more than 5 seconds old is
+kicked to spawn more threads.  This can be used if you find a rare deadlock
+occurs because one or more taskqs didn't spawn a thread when they should have.
+.sp
+Default value: \fB0\fR
+.RE
+
+.sp
+.ne 2
+.na
+\fBspl_taskq_thread_bind\fR (int)
+.ad
+.RS 12n
+Bind taskq threads to specific CPUs. When enabled all taskq threads will
+be distributed evenly over the available CPUs. By default, this behavior
+is disabled to allow the Linux scheduler the maximum flexibility to determine
+where a thread should run.
+.sp
+Default value: \fB0\fR
+.RE
+
+.sp
+.ne 2
+.na
+\fBspl_taskq_thread_dynamic\fR (int)
+.ad
+.RS 12n
+Allow dynamic taskqs. When enabled taskqs which set the TASKQ_DYNAMIC flag
+will by default create only a single thread. New threads will be created on
+demand up to a maximum allowed number to facilitate the completion of
+outstanding tasks. Threads which are no longer needed will be promptly
+destroyed. By default this behavior is enabled but it can be disabled to
+aid performance analysis or troubleshooting.
+.sp
+Default value: \fB1\fR
+.RE
+
+.sp
+.ne 2
+.na
+\fBspl_taskq_thread_priority\fR (int)
+.ad
+.RS 12n
+Allow newly created taskq threads to set a non-default scheduler priority.
+When enabled the priority specified when a taskq is created will be applied
+to all threads created by that taskq. When disabled all threads will use
+the default Linux kernel thread priority. By default, this behavior is
+enabled.
+.sp
+Default value: \fB1\fR
+.RE
+
+.sp
+.ne 2
+.na
+\fBspl_taskq_thread_sequential\fR (int)
+.ad
+.RS 12n
+The number of items a taskq worker thread must handle without interruption
+before requesting a new worker thread be spawned. This is used to control
+how quickly taskqs ramp up the number of threads processing the queue.
+Because Linux thread creation and destruction are relatively inexpensive, a
+small default value has been selected. This means that normally threads will
+be created aggressively which is desirable. Increasing this value will
+result in a slower thread creation rate which may be preferable for some
+configurations.
+.sp
+Default value: \fB4\fR
+.RE
+
+.sp
+.ne 2
+.na
+\fBspl_max_show_tasks\fR (uint)
+.ad
+.RS 12n
+The maximum number of tasks per pending list in each taskq shown in
+/proc/spl/{taskq,taskq-all}.  Write 0 to turn off the limit.  The proc file
+walks the lists with the lock held, so without limiting the output, reading it
+could cause a lockup if a list grows too large.  "(truncated)" will be shown
+if the list is larger than the limit.
+.sp
+Default value: \fB512\fR
+.RE
diff --git a/module/spl/THIRDPARTYLICENSE.gplv2 b/module/spl/THIRDPARTYLICENSE.gplv2
new file mode 100644
index 000000000..d159169d1
--- /dev/null
+++ b/module/spl/THIRDPARTYLICENSE.gplv2
@@ -0,0 +1,339 @@
+ GNU GENERAL PUBLIC LICENSE
+ Version 2, June 1991
+
+ Copyright (C) 1989, 1991 Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+ Preamble
+
+ The licenses for most software are designed to take away your
+freedom to share and change it. By contrast, the GNU General Public
+License is intended to guarantee your freedom to share and change free
+software--to make sure the software is free for all its users. This
+General Public License applies to most of the Free Software
+Foundation's software and to any other program whose authors commit to
+using it. (Some other Free Software Foundation software is covered by
+the GNU Lesser General Public License instead.) You can apply it to
+your programs, too.
+
+ When we speak of free software, we are referring to freedom, not
+price. Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+this service if you wish), that you receive source code or can get it
+if you want it, that you can change the software or use pieces of it
+in new free programs; and that you know you can do these things.
+
+ To protect your rights, we need to make restrictions that forbid
+anyone to deny you these rights or to ask you to surrender the rights.
+These restrictions translate to certain responsibilities for you if you
+distribute copies of the software, or if you modify it.
+
+ For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must give the recipients all the rights that
+you have. You must make sure that they, too, receive or can get the
+source code. And you must show them these terms so they know their
+rights.
+
+ We protect your rights with two steps: (1) copyright the software, and
+(2) offer you this license which gives you legal permission to copy,
+distribute and/or modify the software.
+
+ Also, for each author's protection and ours, we want to make certain
+that everyone understands that there is no warranty for this free
+software. If the software is modified by someone else and passed on, we
+want its recipients to know that what they have is not the original, so
+that any problems introduced by others will not reflect on the original
+authors' reputations.
+
+ Finally, any free program is threatened constantly by software
+patents. We wish to avoid the danger that redistributors of a free
+program will individually obtain patent licenses, in effect making the
+program proprietary. To prevent this, we have made it clear that any
+patent must be licensed for everyone's free use or not licensed at all.
+
+ The precise terms and conditions for copying, distribution and
+modification follow.
+
+ GNU GENERAL PUBLIC LICENSE
+ TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+ 0. This License applies to any program or other work which contains
+a notice placed by the copyright holder saying it may be distributed
+under the terms of this General Public License. The "Program", below,
+refers to any such program or work, and a "work based on the Program"
+means either the Program or any derivative work under copyright law:
+that is to say, a work containing the Program or a portion of it,
+either verbatim or with modifications and/or translated into another
+language. (Hereinafter, translation is included without limitation in
+the term "modification".) Each licensee is addressed as "you".
+
+Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope. The act of
+running the Program is not restricted, and the output from the Program
+is covered only if its contents constitute a work based on the
+Program (independent of having been made by running the Program).
+Whether that is true depends on what the Program does.
+
+ 1. You may copy and distribute verbatim copies of the Program's
+source code as you receive it, in any medium, provided that you
+conspicuously and appropriately publish on each copy an appropriate
+copyright notice and disclaimer of warranty; keep intact all the
+notices that refer to this License and to the absence of any warranty;
+and give any other recipients of the Program a copy of this License
+along with the Program.
+
+You may charge a fee for the physical act of transferring a copy, and
+you may at your option offer warranty protection in exchange for a fee.
+
+ 2. You may modify your copy or copies of the Program or any portion
+of it, thus forming a work based on the Program, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+ a) You must cause the modified files to carry prominent notices
+ stating that you changed the files and the date of any change.
+
+ b) You must cause any work that you distribute or publish, that in
+ whole or in part contains or is derived from the Program or any
+ part thereof, to be licensed as a whole at no charge to all third
+ parties under the terms of this License.
+
+ c) If the modified program normally reads commands interactively
+ when run, you must cause it, when started running for such
+ interactive use in the most ordinary way, to print or display an
+ announcement including an appropriate copyright notice and a
+ notice that there is no warranty (or else, saying that you provide
+ a warranty) and that users may redistribute the program under
+ these conditions, and telling the user how to view a copy of this
+ License. (Exception: if the Program itself is interactive but
+ does not normally print such an announcement, your work based on
+ the Program is not required to print an announcement.)
+
+These requirements apply to the modified work as a whole. If
+identifiable sections of that work are not derived from the Program,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works. But when you
+distribute the same sections as part of a whole which is a work based
+on the Program, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Program.
+
+In addition, mere aggregation of another work not based on the Program
+with the Program (or with a work based on the Program) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+ 3. You may copy and distribute the Program (or a work based on it,
+under Section 2) in object code or executable form under the terms of
+Sections 1 and 2 above provided that you also do one of the following:
+
+ a) Accompany it with the complete corresponding machine-readable
+ source code, which must be distributed under the terms of Sections
+ 1 and 2 above on a medium customarily used for software interchange; or,
+
+ b) Accompany it with a written offer, valid for at least three
+ years, to give any third party, for a charge no more than your
+ cost of physically performing source distribution, a complete
+ machine-readable copy of the corresponding source code, to be
+ distributed under the terms of Sections 1 and 2 above on a medium
+ customarily used for software interchange; or,
+
+ c) Accompany it with the information you received as to the offer
+ to distribute corresponding source code. (This alternative is
+ allowed only for noncommercial distribution and only if you
+ received the program in object code or executable form with such
+ an offer, in accord with Subsection b above.)
+
+The source code for a work means the preferred form of the work for
+making modifications to it. For an executable work, complete source
+code means all the source code for all modules it contains, plus any
+associated interface definition files, plus the scripts used to
+control compilation and installation of the executable. However, as a
+special exception, the source code distributed need not include
+anything that is normally distributed (in either source or binary
+form) with the major components (compiler, kernel, and so on) of the
+operating system on which the executable runs, unless that component
+itself accompanies the executable.
+
+If distribution of executable or object code is made by offering
+access to copy from a designated place, then offering equivalent
+access to copy the source code from the same place counts as
+distribution of the source code, even though third parties are not
+compelled to copy the source along with the object code.
+
+ 4. You may not copy, modify, sublicense, or distribute the Program
+except as expressly provided under this License. Any attempt
+otherwise to copy, modify, sublicense or distribute the Program is
+void, and will automatically terminate your rights under this License.
+However, parties who have received copies, or rights, from you under
+this License will not have their licenses terminated so long as such
+parties remain in full compliance.
+
+ 5. You are not required to accept this License, since you have not
+signed it. However, nothing else grants you permission to modify or
+distribute the Program or its derivative works. These actions are
+prohibited by law if you do not accept this License. Therefore, by
+modifying or distributing the Program (or any work based on the
+Program), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Program or works based on it.
+
+ 6. Each time you redistribute the Program (or any work based on the
+Program), the recipient automatically receives a license from the
+original licensor to copy, distribute or modify the Program subject to
+these terms and conditions. You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties to
+this License.
+
+ 7. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License. If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Program at all. For example, if a patent
+license would not permit royalty-free redistribution of the Program by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Program.
+
+If any portion of this section is held invalid or unenforceable under
+any particular circumstance, the balance of the section is intended to
+apply and the section as a whole is intended to apply in other
+circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system, which is
+implemented by public license practices. Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+
+ 8. If the distribution and/or use of the Program is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Program under this License
+may add an explicit geographical distribution limitation excluding
+those countries, so that distribution is permitted only in or among
+countries not thus excluded. In such case, this License incorporates
+the limitation as if written in the body of this License.
+
+ 9. The Free Software Foundation may publish revised and/or new versions
+of the General Public License from time to time. Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+Each version is given a distinguishing version number. If the Program
+specifies a version number of this License which applies to it and "any
+later version", you have the option of following the terms and conditions
+either of that version or of any later version published by the Free
+Software Foundation. If the Program does not specify a version number of
+this License, you may choose any version ever published by the Free Software
+Foundation.
+
+ 10. If you wish to incorporate parts of the Program into other free
+programs whose distribution conditions are different, write to the author
+to ask for permission. For software which is copyrighted by the Free
+Software Foundation, write to the Free Software Foundation; we sometimes
+make exceptions for this. Our decision will be guided by the two goals
+of preserving the free status of all derivatives of our free software and
+of promoting the sharing and reuse of software generally.
+
+ NO WARRANTY
+
+ 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
+FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
+OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
+PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
+OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS
+TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE
+PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
+REPAIR OR CORRECTION.
+
+ 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
+REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
+INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
+OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
+TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
+YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
+PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGES.
+
+ END OF TERMS AND CONDITIONS
+
+ How to Apply These Terms to Your New Programs
+
+ If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+ To do so, attach the following notices to the program. It is safest
+to attach them to the start of each source file to most effectively
+convey the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+ <one line to give the program's name and a brief idea of what it does.>
+ Copyright (C) <year> <name of author>
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License along
+ with this program; if not, write to the Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+Also add information on how to contact you by electronic and paper mail.
+
+If the program is interactive, make it output a short notice like this
+when it starts in an interactive mode:
+
+ Gnomovision version 69, Copyright (C) year name of author
+ Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+ This is free software, and you are welcome to redistribute it
+ under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License. Of course, the commands you use may
+be called something other than `show w' and `show c'; they could even be
+mouse-clicks or menu items--whatever suits your program.
+
+You should also get your employer (if you work as a programmer) or your
+school, if any, to sign a "copyright disclaimer" for the program, if
+necessary. Here is a sample; alter the names:
+
+ Yoyodyne, Inc., hereby disclaims all copyright interest in the program
+ `Gnomovision' (which makes passes at compilers) written by James Hacker.
+
+ <signature of Ty Coon>, 1 April 1989
+ Ty Coon, President of Vice
+
+This General Public License does not permit incorporating your program into
+proprietary programs. If your program is a subroutine library, you may
+consider it more useful to permit linking proprietary applications with the
+library. If this is what you want to do, use the GNU Lesser General
+Public License instead of this License.
diff --git a/module/spl/THIRDPARTYLICENSE.gplv2.descrip b/module/spl/THIRDPARTYLICENSE.gplv2.descrip
new file mode 100644
index 000000000..78535a8ee
--- /dev/null
+++ b/module/spl/THIRDPARTYLICENSE.gplv2.descrip
@@ -0,0 +1 @@
+COMPATIBILITY LAYER FOR OPENZFS ON LINUX
diff --git a/module/spl/spl-atomic.c b/module/spl/spl-atomic.c
new file mode 100644
index 000000000..47ed1886e
--- /dev/null
+++ b/module/spl/spl-atomic.c
@@ -0,0 +1,36 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <[email protected]>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Solaris Porting Layer (SPL) Atomic Implementation.
+ */
+
+#include <sys/atomic.h>
+
+#ifdef ATOMIC_SPINLOCK
+/* Global atomic lock declarations */
+DEFINE_SPINLOCK(atomic32_lock);
+DEFINE_SPINLOCK(atomic64_lock);
+
+EXPORT_SYMBOL(atomic32_lock);
+EXPORT_SYMBOL(atomic64_lock);
+#endif /* ATOMIC_SPINLOCK */
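A hedged sketch (illustrative, not from the merged sources) of how a 64-bit atomic operation can be emulated with the atomic64_lock exported above when the architecture lacks native 64-bit atomics; the real definitions live in include/spl/sys/atomic.h and may differ in detail, and example_atomic_add_64() is a hypothetical name:

    #include <linux/spinlock.h>
    #include <sys/types.h>

    extern spinlock_t atomic64_lock;	/* exported by spl-atomic.c above */

    /* Emulate a 64-bit atomic add by taking the global lock with IRQs off. */
    static inline uint64_t
    example_atomic_add_64(volatile uint64_t *target, uint64_t delta)
    {
    	unsigned long flags;
    	uint64_t newval;

    	spin_lock_irqsave(&atomic64_lock, flags);
    	*target += delta;
    	newval = *target;
    	spin_unlock_irqrestore(&atomic64_lock, flags);

    	return (newval);
    }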
diff --git a/module/spl/spl-condvar.c b/module/spl/spl-condvar.c
new file mode 100644
index 000000000..f0060bbdc
--- /dev/null
+++ b/module/spl/spl-condvar.c
@@ -0,0 +1,410 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <[email protected]>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Solaris Porting Layer (SPL) Condition Variable Implementation.
+ */
+
+#include <sys/condvar.h>
+#include <sys/time.h>
+#include <linux/hrtimer.h>
+
+void
+__cv_init(kcondvar_t *cvp, char *name, kcv_type_t type, void *arg)
+{
+ ASSERT(cvp);
+ ASSERT(name == NULL);
+ ASSERT(type == CV_DEFAULT);
+ ASSERT(arg == NULL);
+
+ cvp->cv_magic = CV_MAGIC;
+ init_waitqueue_head(&cvp->cv_event);
+ init_waitqueue_head(&cvp->cv_destroy);
+ atomic_set(&cvp->cv_waiters, 0);
+ atomic_set(&cvp->cv_refs, 1);
+ cvp->cv_mutex = NULL;
+}
+EXPORT_SYMBOL(__cv_init);
+
+static int
+cv_destroy_wakeup(kcondvar_t *cvp)
+{
+ if (!atomic_read(&cvp->cv_waiters) && !atomic_read(&cvp->cv_refs)) {
+ ASSERT(cvp->cv_mutex == NULL);
+ ASSERT(!waitqueue_active(&cvp->cv_event));
+ return (1);
+ }
+
+ return (0);
+}
+
+void
+__cv_destroy(kcondvar_t *cvp)
+{
+ ASSERT(cvp);
+ ASSERT(cvp->cv_magic == CV_MAGIC);
+
+ cvp->cv_magic = CV_DESTROY;
+ atomic_dec(&cvp->cv_refs);
+
+ /* Block until all waiters are woken and references dropped. */
+ while (cv_destroy_wakeup(cvp) == 0)
+ wait_event_timeout(cvp->cv_destroy, cv_destroy_wakeup(cvp), 1);
+
+ ASSERT3P(cvp->cv_mutex, ==, NULL);
+ ASSERT3S(atomic_read(&cvp->cv_refs), ==, 0);
+ ASSERT3S(atomic_read(&cvp->cv_waiters), ==, 0);
+ ASSERT3S(waitqueue_active(&cvp->cv_event), ==, 0);
+}
+EXPORT_SYMBOL(__cv_destroy);
+
+static void
+cv_wait_common(kcondvar_t *cvp, kmutex_t *mp, int state, int io)
+{
+ DEFINE_WAIT(wait);
+ kmutex_t *m;
+
+ ASSERT(cvp);
+ ASSERT(mp);
+ ASSERT(cvp->cv_magic == CV_MAGIC);
+ ASSERT(mutex_owned(mp));
+ atomic_inc(&cvp->cv_refs);
+
+ m = ACCESS_ONCE(cvp->cv_mutex);
+ if (!m)
+ m = xchg(&cvp->cv_mutex, mp);
+ /* Ensure the same mutex is used by all callers */
+ ASSERT(m == NULL || m == mp);
+
+ prepare_to_wait_exclusive(&cvp->cv_event, &wait, state);
+ atomic_inc(&cvp->cv_waiters);
+
+ /*
+	 * The mutex should be dropped after prepare_to_wait(); this
+	 * ensures we're linked into the waiters list and avoids the
+ * race where 'cvp->cv_waiters > 0' but the list is empty.
+ */
+ mutex_exit(mp);
+ if (io)
+ io_schedule();
+ else
+ schedule();
+
+	/* No more waiters; a different mutex could now be used */
+ if (atomic_dec_and_test(&cvp->cv_waiters)) {
+ /*
+ * This is set without any lock, so it's racy. But this is
+ * just for debug anyway, so make it best-effort
+ */
+ cvp->cv_mutex = NULL;
+ wake_up(&cvp->cv_destroy);
+ }
+
+ finish_wait(&cvp->cv_event, &wait);
+ atomic_dec(&cvp->cv_refs);
+
+ /*
+	 * Hold the mutex after we release the cvp; otherwise we could
+	 * deadlock with a thread holding the mutex and calling cv_destroy().
+ */
+ mutex_enter(mp);
+}
+
+void
+__cv_wait(kcondvar_t *cvp, kmutex_t *mp)
+{
+ cv_wait_common(cvp, mp, TASK_UNINTERRUPTIBLE, 0);
+}
+EXPORT_SYMBOL(__cv_wait);
+
+void
+__cv_wait_io(kcondvar_t *cvp, kmutex_t *mp)
+{
+ cv_wait_common(cvp, mp, TASK_UNINTERRUPTIBLE, 1);
+}
+EXPORT_SYMBOL(__cv_wait_io);
+
+void
+__cv_wait_sig(kcondvar_t *cvp, kmutex_t *mp)
+{
+ cv_wait_common(cvp, mp, TASK_INTERRUPTIBLE, 0);
+}
+EXPORT_SYMBOL(__cv_wait_sig);
+
+#if defined(HAVE_IO_SCHEDULE_TIMEOUT)
+#define spl_io_schedule_timeout(t) io_schedule_timeout(t)
+#else
+static void
+__cv_wakeup(unsigned long data)
+{
+ wake_up_process((struct task_struct *)data);
+}
+
+static long
+spl_io_schedule_timeout(long time_left)
+{
+ long expire_time = jiffies + time_left;
+ struct timer_list timer;
+
+ init_timer(&timer);
+ setup_timer(&timer, __cv_wakeup, (unsigned long)current);
+ timer.expires = expire_time;
+ add_timer(&timer);
+
+ io_schedule();
+
+ del_timer_sync(&timer);
+ time_left = expire_time - jiffies;
+
+ return (time_left < 0 ? 0 : time_left);
+}
+#endif
+
+/*
+ * 'expire_time' argument is an absolute wall clock time in jiffies.
+ * Return value is time left (expire_time - now) or -1 if timeout occurred.
+ */
+static clock_t
+__cv_timedwait_common(kcondvar_t *cvp, kmutex_t *mp, clock_t expire_time,
+ int state, int io)
+{
+ DEFINE_WAIT(wait);
+ kmutex_t *m;
+ clock_t time_left;
+
+ ASSERT(cvp);
+ ASSERT(mp);
+ ASSERT(cvp->cv_magic == CV_MAGIC);
+ ASSERT(mutex_owned(mp));
+
+ /* XXX - Does not handle jiffie wrap properly */
+ time_left = expire_time - jiffies;
+ if (time_left <= 0)
+ return (-1);
+
+ atomic_inc(&cvp->cv_refs);
+ m = ACCESS_ONCE(cvp->cv_mutex);
+ if (!m)
+ m = xchg(&cvp->cv_mutex, mp);
+ /* Ensure the same mutex is used by all callers */
+ ASSERT(m == NULL || m == mp);
+
+ prepare_to_wait_exclusive(&cvp->cv_event, &wait, state);
+ atomic_inc(&cvp->cv_waiters);
+
+ /*
+	 * The mutex should be dropped after prepare_to_wait(); this
+	 * ensures we're linked into the waiters list and avoids the
+ * race where 'cvp->cv_waiters > 0' but the list is empty.
+ */
+ mutex_exit(mp);
+ if (io)
+ time_left = spl_io_schedule_timeout(time_left);
+ else
+ time_left = schedule_timeout(time_left);
+
+	/* No more waiters; a different mutex could now be used */
+ if (atomic_dec_and_test(&cvp->cv_waiters)) {
+ /*
+ * This is set without any lock, so it's racy. But this is
+ * just for debug anyway, so make it best-effort
+ */
+ cvp->cv_mutex = NULL;
+ wake_up(&cvp->cv_destroy);
+ }
+
+ finish_wait(&cvp->cv_event, &wait);
+ atomic_dec(&cvp->cv_refs);
+
+ /*
+	 * Hold the mutex after we release the cvp; otherwise we could
+	 * deadlock with a thread holding the mutex and calling cv_destroy().
+ */
+ mutex_enter(mp);
+ return (time_left > 0 ? time_left : -1);
+}
+
+clock_t
+__cv_timedwait(kcondvar_t *cvp, kmutex_t *mp, clock_t exp_time)
+{
+ return (__cv_timedwait_common(cvp, mp, exp_time,
+ TASK_UNINTERRUPTIBLE, 0));
+}
+EXPORT_SYMBOL(__cv_timedwait);
+
+clock_t
+__cv_timedwait_io(kcondvar_t *cvp, kmutex_t *mp, clock_t exp_time)
+{
+ return (__cv_timedwait_common(cvp, mp, exp_time,
+ TASK_UNINTERRUPTIBLE, 1));
+}
+EXPORT_SYMBOL(__cv_timedwait_io);
+
+clock_t
+__cv_timedwait_sig(kcondvar_t *cvp, kmutex_t *mp, clock_t exp_time)
+{
+ return (__cv_timedwait_common(cvp, mp, exp_time,
+ TASK_INTERRUPTIBLE, 0));
+}
+EXPORT_SYMBOL(__cv_timedwait_sig);
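A small hedged sketch (not from the merged sources) of the absolute-deadline semantics documented above. It assumes the cv_timedwait() convenience macro from sys/condvar.h maps to __cv_timedwait() and that ddi_get_lbolt() from sys/timer.h returns the current time in jiffies; the example_state type and field names are hypothetical:

    #include <sys/condvar.h>
    #include <sys/mutex.h>
    #include <sys/timer.h>

    typedef struct example_state {
    	kmutex_t	s_lock;
    	kcondvar_t	s_cv;
    	int		s_ready;
    } example_state_t;

    /* Returns 0 if signaled, ETIMEDOUT if the one second deadline expired. */
    static int
    example_wait_ready(example_state_t *sp)
    {
    	int error = 0;

    	mutex_enter(&sp->s_lock);
    	while (!sp->s_ready) {
    		/* Deadline is absolute: current jiffies plus one second. */
    		if (cv_timedwait(&sp->s_cv, &sp->s_lock,
    		    ddi_get_lbolt() + HZ) == -1) {
    			error = ETIMEDOUT;
    			break;
    		}
    	}
    	mutex_exit(&sp->s_lock);

    	return (error);
    }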
+
+/*
+ * 'expire_time' argument is an absolute clock time in nanoseconds.
+ * Return value is time left (expire_time - now) or -1 if timeout occurred.
+ */
+static clock_t
+__cv_timedwait_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t expire_time,
+ int state)
+{
+ DEFINE_WAIT(wait);
+ kmutex_t *m;
+ hrtime_t time_left;
+ ktime_t ktime_left;
+
+ ASSERT(cvp);
+ ASSERT(mp);
+ ASSERT(cvp->cv_magic == CV_MAGIC);
+ ASSERT(mutex_owned(mp));
+
+ time_left = expire_time - gethrtime();
+ if (time_left <= 0)
+ return (-1);
+
+ atomic_inc(&cvp->cv_refs);
+ m = ACCESS_ONCE(cvp->cv_mutex);
+ if (!m)
+ m = xchg(&cvp->cv_mutex, mp);
+ /* Ensure the same mutex is used by all callers */
+ ASSERT(m == NULL || m == mp);
+
+ prepare_to_wait_exclusive(&cvp->cv_event, &wait, state);
+ atomic_inc(&cvp->cv_waiters);
+
+ /*
+	 * The mutex should be dropped after prepare_to_wait(); this
+	 * ensures we're linked into the waiters list and avoids the
+ * race where 'cvp->cv_waiters > 0' but the list is empty.
+ */
+ mutex_exit(mp);
+ /*
+	 * Allow a 100 us range to give the kernel an opportunity to
+	 * coalesce interrupts.
+ */
+ ktime_left = ktime_set(0, time_left);
+ schedule_hrtimeout_range(&ktime_left, 100 * NSEC_PER_USEC,
+ HRTIMER_MODE_REL);
+
+	/* No more waiters; a different mutex could now be used */
+ if (atomic_dec_and_test(&cvp->cv_waiters)) {
+ /*
+ * This is set without any lock, so it's racy. But this is
+ * just for debug anyway, so make it best-effort
+ */
+ cvp->cv_mutex = NULL;
+ wake_up(&cvp->cv_destroy);
+ }
+
+ finish_wait(&cvp->cv_event, &wait);
+ atomic_dec(&cvp->cv_refs);
+
+ mutex_enter(mp);
+ time_left = expire_time - gethrtime();
+ return (time_left > 0 ? NSEC_TO_TICK(time_left) : -1);
+}
+
+/*
+ * Compatibility wrapper for the cv_timedwait_hires() Illumos interface.
+ */
+static clock_t
+cv_timedwait_hires_common(kcondvar_t *cvp, kmutex_t *mp, hrtime_t tim,
+ hrtime_t res, int flag, int state)
+{
+ if (res > 1) {
+ /*
+ * Align expiration to the specified resolution.
+ */
+ if (flag & CALLOUT_FLAG_ROUNDUP)
+ tim += res - 1;
+ tim = (tim / res) * res;
+ }
+
+ if (!(flag & CALLOUT_FLAG_ABSOLUTE))
+ tim += gethrtime();
+
+ return (__cv_timedwait_hires(cvp, mp, tim, state));
+}
+
+clock_t
+cv_timedwait_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t tim, hrtime_t res,
+ int flag)
+{
+ return (cv_timedwait_hires_common(cvp, mp, tim, res, flag,
+ TASK_UNINTERRUPTIBLE));
+}
+EXPORT_SYMBOL(cv_timedwait_hires);
+
+clock_t
+cv_timedwait_sig_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t tim,
+ hrtime_t res, int flag)
+{
+ return (cv_timedwait_hires_common(cvp, mp, tim, res, flag,
+ TASK_INTERRUPTIBLE));
+}
+EXPORT_SYMBOL(cv_timedwait_sig_hires);
+
+void
+__cv_signal(kcondvar_t *cvp)
+{
+ ASSERT(cvp);
+ ASSERT(cvp->cv_magic == CV_MAGIC);
+ atomic_inc(&cvp->cv_refs);
+
+ /*
+ * All waiters are added with WQ_FLAG_EXCLUSIVE so only one
+	 * waiter will be set runnable with each call to wake_up().
+	 * Additionally, wake_up() holds a spin_lock associated with
+ * the wait queue to ensure we don't race waking up processes.
+ */
+ if (atomic_read(&cvp->cv_waiters) > 0)
+ wake_up(&cvp->cv_event);
+
+ atomic_dec(&cvp->cv_refs);
+}
+EXPORT_SYMBOL(__cv_signal);
+
+void
+__cv_broadcast(kcondvar_t *cvp)
+{
+ ASSERT(cvp);
+ ASSERT(cvp->cv_magic == CV_MAGIC);
+ atomic_inc(&cvp->cv_refs);
+
+ /*
+	 * wake_up_all() will wake up all waiters, even those which
+ * have the WQ_FLAG_EXCLUSIVE flag set.
+ */
+ if (atomic_read(&cvp->cv_waiters) > 0)
+ wake_up_all(&cvp->cv_event);
+
+ atomic_dec(&cvp->cv_refs);
+}
+EXPORT_SYMBOL(__cv_broadcast);
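A hedged sketch of the intended producer/consumer usage of the interfaces implemented above, assuming the cv_*() and mutex_*() convenience macros from sys/condvar.h and sys/mutex.h resolve to the __cv_*() functions in this file; q_lock, q_cv, and q_ready are hypothetical names:

    #include <sys/mutex.h>
    #include <sys/condvar.h>

    static kmutex_t q_lock;
    static kcondvar_t q_cv;
    static int q_ready;

    static void
    queue_setup(void)
    {
    	mutex_init(&q_lock, NULL, MUTEX_DEFAULT, NULL);
    	cv_init(&q_cv, NULL, CV_DEFAULT, NULL);
    }

    /* Consumer: sleep until the producer signals that data is ready. */
    static void
    queue_wait(void)
    {
    	mutex_enter(&q_lock);
    	while (!q_ready)
    		cv_wait(&q_cv, &q_lock);	/* drops and re-acquires q_lock */
    	q_ready = 0;
    	mutex_exit(&q_lock);
    }

    /* Producer: publish data and wake one exclusive waiter. */
    static void
    queue_post(void)
    {
    	mutex_enter(&q_lock);
    	q_ready = 1;
    	cv_signal(&q_cv);
    	mutex_exit(&q_lock);
    }

    static void
    queue_teardown(void)
    {
    	cv_destroy(&q_cv);
    	mutex_destroy(&q_lock);
    }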
diff --git a/module/spl/spl-cred.c b/module/spl/spl-cred.c
new file mode 100644
index 000000000..ea3e903f9
--- /dev/null
+++ b/module/spl/spl-cred.c
@@ -0,0 +1,200 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <[email protected]>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Solaris Porting Layer (SPL) Credential Implementation.
+ */
+
+#include <sys/cred.h>
+
+static int
+#ifdef HAVE_KUIDGID_T
+cr_groups_search(const struct group_info *group_info, kgid_t grp)
+#else
+cr_groups_search(const struct group_info *group_info, gid_t grp)
+#endif
+{
+ unsigned int left, right, mid;
+ int cmp;
+
+ if (!group_info)
+ return (0);
+
+ left = 0;
+ right = group_info->ngroups;
+ while (left < right) {
+ mid = (left + right) / 2;
+ cmp = KGID_TO_SGID(grp) -
+ KGID_TO_SGID(GROUP_AT(group_info, mid));
+
+ if (cmp > 0)
+ left = mid + 1;
+ else if (cmp < 0)
+ right = mid;
+ else
+ return (1);
+ }
+ return (0);
+}
+
+/* Hold a reference on the credential */
+void
+crhold(cred_t *cr)
+{
+ (void) get_cred((const cred_t *)cr);
+}
+
+/* Free a reference on the credential */
+void
+crfree(cred_t *cr)
+{
+ put_cred((const cred_t *)cr);
+}
+
+/* Return the number of supplemental groups */
+int
+crgetngroups(const cred_t *cr)
+{
+ struct group_info *gi;
+ int rc;
+
+ gi = cr->group_info;
+ rc = gi->ngroups;
+#ifndef HAVE_GROUP_INFO_GID
+ /*
+ * For Linux <= 4.8,
+	 * crgetgroups() will only return gi->blocks[0], which contains only
+ * the first NGROUPS_PER_BLOCK groups.
+ */
+ if (rc > NGROUPS_PER_BLOCK) {
+ WARN_ON_ONCE(1);
+ rc = NGROUPS_PER_BLOCK;
+ }
+#endif
+ return (rc);
+}
+
+/*
+ * Return an array of supplemental gids. The returned address is safe
+ * to use as long as the caller has taken a reference with crhold().
+ *
+ * Linux 4.9 API change, group_info changed from 2d array via ->blocks to 1d
+ * array via ->gid.
+ */
+gid_t *
+crgetgroups(const cred_t *cr)
+{
+ struct group_info *gi;
+ gid_t *gids = NULL;
+
+ gi = cr->group_info;
+#ifdef HAVE_GROUP_INFO_GID
+ gids = KGIDP_TO_SGIDP(gi->gid);
+#else
+ if (gi->nblocks > 0)
+ gids = KGIDP_TO_SGIDP(gi->blocks[0]);
+#endif
+ return (gids);
+}
+
+/* Check if the passed gid is available in supplied credential. */
+int
+groupmember(gid_t gid, const cred_t *cr)
+{
+ struct group_info *gi;
+ int rc;
+
+ gi = cr->group_info;
+ rc = cr_groups_search(gi, SGID_TO_KGID(gid));
+
+ return (rc);
+}
+
+/* Return the effective user id */
+uid_t
+crgetuid(const cred_t *cr)
+{
+ return (KUID_TO_SUID(cr->euid));
+}
+
+/* Return the real user id */
+uid_t
+crgetruid(const cred_t *cr)
+{
+ return (KUID_TO_SUID(cr->uid));
+}
+
+/* Return the saved user id */
+uid_t
+crgetsuid(const cred_t *cr)
+{
+ return (KUID_TO_SUID(cr->suid));
+}
+
+/* Return the filesystem user id */
+uid_t
+crgetfsuid(const cred_t *cr)
+{
+ return (KUID_TO_SUID(cr->fsuid));
+}
+
+/* Return the effective group id */
+gid_t
+crgetgid(const cred_t *cr)
+{
+ return (KGID_TO_SGID(cr->egid));
+}
+
+/* Return the real group id */
+gid_t
+crgetrgid(const cred_t *cr)
+{
+ return (KGID_TO_SGID(cr->gid));
+}
+
+/* Return the saved group id */
+gid_t
+crgetsgid(const cred_t *cr)
+{
+ return (KGID_TO_SGID(cr->sgid));
+}
+
+/* Return the filesystem group id */
+gid_t
+crgetfsgid(const cred_t *cr)
+{
+ return (KGID_TO_SGID(cr->fsgid));
+}
+
+EXPORT_SYMBOL(crhold);
+EXPORT_SYMBOL(crfree);
+EXPORT_SYMBOL(crgetuid);
+EXPORT_SYMBOL(crgetruid);
+EXPORT_SYMBOL(crgetsuid);
+EXPORT_SYMBOL(crgetfsuid);
+EXPORT_SYMBOL(crgetgid);
+EXPORT_SYMBOL(crgetrgid);
+EXPORT_SYMBOL(crgetsgid);
+EXPORT_SYMBOL(crgetfsgid);
+EXPORT_SYMBOL(crgetngroups);
+EXPORT_SYMBOL(crgetgroups);
+EXPORT_SYMBOL(groupmember);
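A hedged sketch (not from the merged sources) of how a caller might use the credential accessors exported above; the example_* function names are hypothetical:

    #include <sys/cred.h>

    /*
     * Check whether 'cr' grants membership in 'gid', either as the effective
     * group or via the supplemental group list walked by groupmember() above.
     */
    static int
    example_gid_allowed(const cred_t *cr, gid_t gid)
    {
    	if (crgetgid(cr) == gid)
    		return (1);

    	return (groupmember(gid, cr));
    }

    /*
     * Copy the supplemental gids into a caller-supplied buffer; the crhold()
     * reference keeps the array returned by crgetgroups() valid while copying.
     */
    static int
    example_copy_groups(cred_t *cr, gid_t *buf, int bufcnt)
    {
    	gid_t *gids;
    	int i, n;

    	crhold(cr);
    	n = crgetngroups(cr);
    	gids = crgetgroups(cr);

    	for (i = 0; i < n && i < bufcnt; i++)
    		buf[i] = gids[i];

    	crfree(cr);

    	return (i);
    }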
diff --git a/module/spl/spl-err.c b/module/spl/spl-err.c
new file mode 100644
index 000000000..6b71296e8
--- /dev/null
+++ b/module/spl/spl-err.c
@@ -0,0 +1,133 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <[email protected]>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Solaris Porting Layer (SPL) Error Implementation.
+ */
+
+#include <sys/sysmacros.h>
+#include <sys/cmn_err.h>
+#include <linux/ratelimit.h>
+
+/*
+ * It is often useful to actually have the panic crash the node so you
+ * can then get notified of the event, get the crashdump for later
+ * analysis and other such goodies.
+ * But we would still default to the current default of not to do that.
+ */
+/* BEGIN CSTYLED */
+unsigned int spl_panic_halt;
+module_param(spl_panic_halt, uint, 0644);
+MODULE_PARM_DESC(spl_panic_halt, "Cause kernel panic on assertion failures");
+/* END CSTYLED */
+
+/*
+ * Limit the number of stack traces dumped to not more than 5 every
+ * 60 seconds to prevent denial-of-service attacks from debug code.
+ */
+DEFINE_RATELIMIT_STATE(dumpstack_ratelimit_state, 60 * HZ, 5);
+
+void
+spl_dumpstack(void)
+{
+ if (__ratelimit(&dumpstack_ratelimit_state)) {
+ printk("Showing stack for process %d\n", current->pid);
+ dump_stack();
+ }
+}
+EXPORT_SYMBOL(spl_dumpstack);
+
+int
+spl_panic(const char *file, const char *func, int line, const char *fmt, ...)
+{
+ const char *newfile;
+ char msg[MAXMSGLEN];
+ va_list ap;
+
+ newfile = strrchr(file, '/');
+ if (newfile != NULL)
+ newfile = newfile + 1;
+ else
+ newfile = file;
+
+ va_start(ap, fmt);
+ (void) vsnprintf(msg, sizeof (msg), fmt, ap);
+ va_end(ap);
+
+ printk(KERN_EMERG "%s", msg);
+ printk(KERN_EMERG "PANIC at %s:%d:%s()\n", newfile, line, func);
+ if (spl_panic_halt)
+ panic("%s", msg);
+
+ spl_dumpstack();
+
+ /* Halt the thread to facilitate further debugging */
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ while (1)
+ schedule();
+
+ /* Unreachable */
+ return (1);
+}
+EXPORT_SYMBOL(spl_panic);
+
+void
+vcmn_err(int ce, const char *fmt, va_list ap)
+{
+ char msg[MAXMSGLEN];
+
+ vsnprintf(msg, MAXMSGLEN - 1, fmt, ap);
+
+ switch (ce) {
+ case CE_IGNORE:
+ break;
+ case CE_CONT:
+ printk("%s", msg);
+ break;
+ case CE_NOTE:
+ printk(KERN_NOTICE "NOTICE: %s\n", msg);
+ break;
+ case CE_WARN:
+ printk(KERN_WARNING "WARNING: %s\n", msg);
+ break;
+ case CE_PANIC:
+ printk(KERN_EMERG "PANIC: %s\n", msg);
+ spl_dumpstack();
+
+ /* Halt the thread to facilitate further debugging */
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ while (1)
+ schedule();
+ }
+} /* vcmn_err() */
+EXPORT_SYMBOL(vcmn_err);
+
+void
+cmn_err(int ce, const char *fmt, ...)
+{
+ va_list ap;
+
+ va_start(ap, fmt);
+ vcmn_err(ce, fmt, ap);
+ va_end(ap);
+} /* cmn_err() */
+EXPORT_SYMBOL(cmn_err);
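An illustrative, hedged usage sketch (not part of the merge) of the Solaris-style severity levels handled by vcmn_err() above; CE_CONT continues a prior message, CE_NOTE and CE_WARN add a prefix, and CE_PANIC dumps the stack and halts the calling thread:

    #include <sys/cmn_err.h>

    static void
    example_logging(int errors)
    {
    	cmn_err(CE_NOTE, "example: scrub started");

    	if (errors > 0)
    		cmn_err(CE_WARN, "example: %d checksum errors repaired",
    		    errors);
    }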
diff --git a/module/spl/spl-generic.c b/module/spl/spl-generic.c
new file mode 100644
index 000000000..b38fe254c
--- /dev/null
+++ b/module/spl/spl-generic.c
@@ -0,0 +1,775 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <[email protected]>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Solaris Porting Layer (SPL) Generic Implementation.
+ */
+
+#include <sys/sysmacros.h>
+#include <sys/systeminfo.h>
+#include <sys/vmsystm.h>
+#include <sys/kobj.h>
+#include <sys/kmem.h>
+#include <sys/kmem_cache.h>
+#include <sys/vmem.h>
+#include <sys/mutex.h>
+#include <sys/rwlock.h>
+#include <sys/taskq.h>
+#include <sys/tsd.h>
+#include <sys/zmod.h>
+#include <sys/debug.h>
+#include <sys/proc.h>
+#include <sys/kstat.h>
+#include <sys/file.h>
+#include <linux/ctype.h>
+#include <sys/disp.h>
+#include <sys/random.h>
+#include <sys/strings.h>
+#include <linux/kmod.h>
+
+char spl_version[32] = "SPL v" SPL_META_VERSION "-" SPL_META_RELEASE;
+EXPORT_SYMBOL(spl_version);
+
+/* BEGIN CSTYLED */
+unsigned long spl_hostid = 0;
+EXPORT_SYMBOL(spl_hostid);
+module_param(spl_hostid, ulong, 0644);
+MODULE_PARM_DESC(spl_hostid, "The system hostid.");
+/* END CSTYLED */
+
+proc_t p0;
+EXPORT_SYMBOL(p0);
+
+/*
+ * Xorshift Pseudo Random Number Generator based on work by Sebastiano Vigna
+ *
+ * "Further scramblings of Marsaglia's xorshift generators"
+ * http://vigna.di.unimi.it/ftp/papers/xorshiftplus.pdf
+ *
+ * random_get_pseudo_bytes() is an API function on Illumos whose sole purpose
+ * is to provide bytes containing random numbers. It is mapped to /dev/urandom
+ * on Illumos, which uses a "FIPS 186-2 algorithm". No user of the SPL's
+ * random_get_pseudo_bytes() needs bytes that are of cryptographic quality, so
+ * we can implement it using a fast PRNG that we seed using Linux' actual
+ * equivalent to random_get_pseudo_bytes(). We do this by providing each CPU
+ * with an independent seed so that all calls to random_get_pseudo_bytes() are
+ * free of atomic instructions.
+ *
+ * A consequence of using a fast PRNG is that using random_get_pseudo_bytes()
+ * to generate words larger than 128 bits will paradoxically be limited to
+ * `2^128 - 1` possibilities. This is because we have a sequence of `2^128 - 1`
+ * 128-bit words and selecting the first will implicitly select the second. If
+ * a caller finds this behavior undesirable, random_get_bytes() should be used
+ * instead.
+ *
+ * XXX: Linux interrupt handlers that trigger within the critical section
+ * formed by `s[1] = xp[1];` and `xp[0] = s[0];` and call this function will
+ * see the same numbers. Nothing in the code currently calls this in an
+ * interrupt handler, so this is considered to be okay. If that becomes a
+ * problem, we could create a set of per-cpu variables for interrupt handlers
+ * and use them when in_interrupt() from linux/preempt_mask.h evaluates to
+ * true.
+ */
+static DEFINE_PER_CPU(uint64_t[2], spl_pseudo_entropy);
+
+/*
+ * spl_rand_next()/spl_rand_jump() are copied from the following CC-0 licensed
+ * file:
+ *
+ * http://xorshift.di.unimi.it/xorshift128plus.c
+ */
+
+static inline uint64_t
+spl_rand_next(uint64_t *s)
+{
+ uint64_t s1 = s[0];
+ const uint64_t s0 = s[1];
+ s[0] = s0;
+ s1 ^= s1 << 23; // a
+ s[1] = s1 ^ s0 ^ (s1 >> 18) ^ (s0 >> 5); // b, c
+ return (s[1] + s0);
+}
+
+static inline void
+spl_rand_jump(uint64_t *s)
+{
+ static const uint64_t JUMP[] =
+ { 0x8a5cd789635d2dff, 0x121fd2155c472f96 };
+
+ uint64_t s0 = 0;
+ uint64_t s1 = 0;
+ int i, b;
+ for (i = 0; i < sizeof (JUMP) / sizeof (*JUMP); i++)
+ for (b = 0; b < 64; b++) {
+ if (JUMP[i] & 1ULL << b) {
+ s0 ^= s[0];
+ s1 ^= s[1];
+ }
+ (void) spl_rand_next(s);
+ }
+
+ s[0] = s0;
+ s[1] = s1;
+}
+
+int
+random_get_pseudo_bytes(uint8_t *ptr, size_t len)
+{
+ uint64_t *xp, s[2];
+
+ ASSERT(ptr);
+
+ xp = get_cpu_var(spl_pseudo_entropy);
+
+ s[0] = xp[0];
+ s[1] = xp[1];
+
+ while (len) {
+ union {
+ uint64_t ui64;
+ uint8_t byte[sizeof (uint64_t)];
+		} entropy;
+ int i = MIN(len, sizeof (uint64_t));
+
+ len -= i;
+ entropy.ui64 = spl_rand_next(s);
+
+ while (i--)
+ *ptr++ = entropy.byte[i];
+ }
+
+ xp[0] = s[0];
+ xp[1] = s[1];
+
+ put_cpu_var(spl_pseudo_entropy);
+
+ return (0);
+}
+
+
+EXPORT_SYMBOL(random_get_pseudo_bytes);
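A hedged example (not from the merged sources) showing how a caller would draw a pseudo-random 64-bit value from the per-cpu generator above; as noted, these bytes are not of cryptographic quality and random_get_bytes() should be used where that matters. The example_random_u64() name is hypothetical:

    #include <sys/random.h>

    static uint64_t
    example_random_u64(void)
    {
    	uint64_t value;

    	(void) random_get_pseudo_bytes((uint8_t *)&value, sizeof (value));

    	return (value);
    }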
+
+#if BITS_PER_LONG == 32
+/*
+ * Support 64/64 => 64 division on a 32-bit platform. While the kernel
+ * provides a div64_u64() function for this we do not use it because the
+ * implementation is flawed. There are cases which return incorrect
+ * results as late as linux-2.6.35. Until this is fixed upstream the
+ * spl must provide its own implementation.
+ *
+ * This implementation is a slightly modified version of the algorithm
+ * proposed by the book 'Hacker's Delight'. The original source can be
+ * found here and is available for use without restriction.
+ *
+ * http://www.hackersdelight.org/HDcode/newCode/divDouble.c
+ */
+
+/*
+ * Calculate the number of leading zeros for a 64-bit value.
+ */
+static int
+nlz64(uint64_t x)
+{
+ register int n = 0;
+
+ if (x == 0)
+ return (64);
+
+ if (x <= 0x00000000FFFFFFFFULL) { n = n + 32; x = x << 32; }
+ if (x <= 0x0000FFFFFFFFFFFFULL) { n = n + 16; x = x << 16; }
+ if (x <= 0x00FFFFFFFFFFFFFFULL) { n = n + 8; x = x << 8; }
+ if (x <= 0x0FFFFFFFFFFFFFFFULL) { n = n + 4; x = x << 4; }
+ if (x <= 0x3FFFFFFFFFFFFFFFULL) { n = n + 2; x = x << 2; }
+ if (x <= 0x7FFFFFFFFFFFFFFFULL) { n = n + 1; }
+
+ return (n);
+}
+
+/*
+ * Newer kernels have a div_u64() function but we define our own
+ * to simplify portability between kernel versions.
+ */
+static inline uint64_t
+__div_u64(uint64_t u, uint32_t v)
+{
+ (void) do_div(u, v);
+ return (u);
+}
+
+/*
+ * Implementation of 64-bit unsigned division for 32-bit machines.
+ *
+ * First the procedure takes care of the case in which the divisor is a
+ * 32-bit quantity. There are two subcases: (1) If the left half of the
+ * dividend is less than the divisor, one execution of do_div() is all that
+ * is required (overflow is not possible). (2) Otherwise it does two
+ * divisions, using the grade school method.
+ */
+uint64_t
+__udivdi3(uint64_t u, uint64_t v)
+{
+ uint64_t u0, u1, v1, q0, q1, k;
+ int n;
+
+ if (v >> 32 == 0) { // If v < 2**32:
+ if (u >> 32 < v) { // If u/v cannot overflow,
+ return (__div_u64(u, v)); // just do one division.
+ } else { // If u/v would overflow:
+ u1 = u >> 32; // Break u into two halves.
+ u0 = u & 0xFFFFFFFF;
+ q1 = __div_u64(u1, v); // First quotient digit.
+ k = u1 - q1 * v; // First remainder, < v.
+ u0 += (k << 32);
+			q0 = __div_u64(u0, v);	// Second quotient digit.
+ return ((q1 << 32) + q0);
+ }
+ } else { // If v >= 2**32:
+ n = nlz64(v); // 0 <= n <= 31.
+ v1 = (v << n) >> 32; // Normalize divisor, MSB is 1.
+ u1 = u >> 1; // To ensure no overflow.
+ q1 = __div_u64(u1, v1); // Get quotient from
+ q0 = (q1 << n) >> 31; // Undo normalization and
+ // division of u by 2.
+ if (q0 != 0) // Make q0 correct or
+ q0 = q0 - 1; // too small by 1.
+ if ((u - q0 * v) >= v)
+ q0 = q0 + 1; // Now q0 is correct.
+
+ return (q0);
+ }
+}
+EXPORT_SYMBOL(__udivdi3);
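On 32-bit builds the compiler lowers ordinary 64-by-64 divisions into calls to __udivdi3(), so exporting the symbol here lets modules use plain C division without linking against libgcc. A trivial, hedged illustration (the function name is hypothetical):

    /* The division below compiles to a __udivdi3() call on 32-bit kernels. */
    static uint64_t
    example_bytes_to_blocks(uint64_t bytes, uint64_t blocksize)
    {
    	return (bytes / blocksize);
    }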
+
+/* BEGIN CSTYLED */
+#ifndef abs64
+#define abs64(x) ({ uint64_t t = (x) >> 63; ((x) ^ t) - t; })
+#endif
+/* END CSTYLED */
+
+/*
+ * Implementation of 64-bit signed division for 32-bit machines.
+ */
+int64_t
+__divdi3(int64_t u, int64_t v)
+{
+ int64_t q, t;
+ q = __udivdi3(abs64(u), abs64(v));
+ t = (u ^ v) >> 63; // If u, v have different
+ return ((q ^ t) - t); // signs, negate q.
+}
+EXPORT_SYMBOL(__divdi3);
+
+/*
+ * Implementation of 64-bit unsigned modulo for 32-bit machines.
+ */
+uint64_t
+__umoddi3(uint64_t dividend, uint64_t divisor)
+{
+ return (dividend - (divisor * __udivdi3(dividend, divisor)));
+}
+EXPORT_SYMBOL(__umoddi3);
+
+/*
+ * Implementation of 64-bit unsigned division/modulo for 32-bit machines.
+ */
+uint64_t
+__udivmoddi4(uint64_t n, uint64_t d, uint64_t *r)
+{
+ uint64_t q = __udivdi3(n, d);
+ if (r)
+ *r = n - d * q;
+ return (q);
+}
+EXPORT_SYMBOL(__udivmoddi4);
+
+/*
+ * Implementation of 64-bit signed division/modulo for 32-bit machines.
+ */
+int64_t
+__divmoddi4(int64_t n, int64_t d, int64_t *r)
+{
+ int64_t q, rr;
+ boolean_t nn = B_FALSE;
+ boolean_t nd = B_FALSE;
+ if (n < 0) {
+ nn = B_TRUE;
+ n = -n;
+ }
+ if (d < 0) {
+ nd = B_TRUE;
+ d = -d;
+ }
+
+ q = __udivmoddi4(n, d, (uint64_t *)&rr);
+
+ if (nn != nd)
+ q = -q;
+ if (nn)
+ rr = -rr;
+ if (r)
+ *r = rr;
+ return (q);
+}
+EXPORT_SYMBOL(__divmoddi4);
+
+#if defined(__arm) || defined(__arm__)
+/*
+ * Implementation of 64-bit (un)signed division for 32-bit arm machines.
+ *
+ * Run-time ABI for the ARM Architecture (page 20). A pair of (unsigned)
+ * long longs is returned in {{r0, r1}, {r2,r3}}, the quotient in {r0, r1},
+ * and the remainder in {r2, r3}. The return type is specifically left
+ * set to 'void' to ensure the compiler does not overwrite these registers
+ * during the return. All results are in registers as per ABI
+ */
+void
+__aeabi_uldivmod(uint64_t u, uint64_t v)
+{
+ uint64_t res;
+ uint64_t mod;
+
+ res = __udivdi3(u, v);
+ mod = __umoddi3(u, v);
+ {
+ register uint32_t r0 asm("r0") = (res & 0xFFFFFFFF);
+ register uint32_t r1 asm("r1") = (res >> 32);
+ register uint32_t r2 asm("r2") = (mod & 0xFFFFFFFF);
+ register uint32_t r3 asm("r3") = (mod >> 32);
+
+ /* BEGIN CSTYLED */
+ asm volatile(""
+ : "+r"(r0), "+r"(r1), "+r"(r2),"+r"(r3) /* output */
+ : "r"(r0), "r"(r1), "r"(r2), "r"(r3)); /* input */
+ /* END CSTYLED */
+
+ return; /* r0; */
+ }
+}
+EXPORT_SYMBOL(__aeabi_uldivmod);
+
+void
+__aeabi_ldivmod(int64_t u, int64_t v)
+{
+ int64_t res;
+ uint64_t mod;
+
+ res = __divdi3(u, v);
+ mod = __umoddi3(u, v);
+ {
+ register uint32_t r0 asm("r0") = (res & 0xFFFFFFFF);
+ register uint32_t r1 asm("r1") = (res >> 32);
+ register uint32_t r2 asm("r2") = (mod & 0xFFFFFFFF);
+ register uint32_t r3 asm("r3") = (mod >> 32);
+
+ /* BEGIN CSTYLED */
+ asm volatile(""
+ : "+r"(r0), "+r"(r1), "+r"(r2),"+r"(r3) /* output */
+ : "r"(r0), "r"(r1), "r"(r2), "r"(r3)); /* input */
+ /* END CSTYLED */
+
+ return; /* r0; */
+ }
+}
+EXPORT_SYMBOL(__aeabi_ldivmod);
+#endif /* __arm || __arm__ */
+#endif /* BITS_PER_LONG */
+
+/*
+ * NOTE: The strtoxx behavior is solely based on my reading of the Solaris
+ * ddi_strtol(9F) man page. I have not verified the behavior of these
+ * functions against their Solaris counterparts. It is possible that I
+ * may have misinterpreted the man page or the man page is incorrect.
+ */
+int ddi_strtoul(const char *, char **, int, unsigned long *);
+int ddi_strtol(const char *, char **, int, long *);
+int ddi_strtoull(const char *, char **, int, unsigned long long *);
+int ddi_strtoll(const char *, char **, int, long long *);
+
+#define define_ddi_strtoux(type, valtype) \
+int ddi_strtou##type(const char *str, char **endptr, \
+ int base, valtype *result) \
+{ \
+ valtype last_value, value = 0; \
+ char *ptr = (char *)str; \
+ int flag = 1, digit; \
+ \
+ if (strlen(ptr) == 0) \
+ return (EINVAL); \
+ \
+ /* Auto-detect base based on prefix */ \
+ if (!base) { \
+ if (str[0] == '0') { \
+ if (tolower(str[1]) == 'x' && isxdigit(str[2])) { \
+ base = 16; /* hex */ \
+ ptr += 2; \
+			} else if (str[1] >= '0' && str[1] < '8') { \
+ base = 8; /* octal */ \
+ ptr += 1; \
+ } else { \
+ return (EINVAL); \
+ } \
+ } else { \
+ base = 10; /* decimal */ \
+ } \
+ } \
+ \
+ while (1) { \
+ if (isdigit(*ptr)) \
+ digit = *ptr - '0'; \
+ else if (isalpha(*ptr)) \
+ digit = tolower(*ptr) - 'a' + 10; \
+ else \
+ break; \
+ \
+ if (digit >= base) \
+ break; \
+ \
+ last_value = value; \
+ value = value * base + digit; \
+ if (last_value > value) /* Overflow */ \
+ return (ERANGE); \
+ \
+ flag = 1; \
+ ptr++; \
+ } \
+ \
+ if (flag) \
+ *result = value; \
+ \
+ if (endptr) \
+ *endptr = (char *)(flag ? ptr : str); \
+ \
+ return (0); \
+} \
+
+#define define_ddi_strtox(type, valtype) \
+int ddi_strto##type(const char *str, char **endptr, \
+ int base, valtype *result) \
+{ \
+ int rc; \
+ \
+ if (*str == '-') { \
+ rc = ddi_strtou##type(str + 1, endptr, base, result); \
+ if (!rc) { \
+ if (*endptr == str + 1) \
+ *endptr = (char *)str; \
+ else \
+ *result = -*result; \
+ } \
+ } else { \
+ rc = ddi_strtou##type(str, endptr, base, result); \
+ } \
+ \
+ return (rc); \
+}
+
+define_ddi_strtoux(l, unsigned long)
+define_ddi_strtox(l, long)
+define_ddi_strtoux(ll, unsigned long long)
+define_ddi_strtox(ll, long long)
+
+EXPORT_SYMBOL(ddi_strtoul);
+EXPORT_SYMBOL(ddi_strtol);
+EXPORT_SYMBOL(ddi_strtoll);
+EXPORT_SYMBOL(ddi_strtoull);
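A hedged example of the calling convention implemented above, assuming the prototypes are exposed through sys/sunddi.h: the parsed value is returned through 'result' and the return code is 0, EINVAL, or ERANGE, unlike libc strtoul() which returns the value directly. The example_parse_limit() name is hypothetical:

    #include <sys/sunddi.h>
    #include <linux/errno.h>

    static int
    example_parse_limit(const char *str, unsigned long *result)
    {
    	char *end;
    	int error;

    	/* Base 0 auto-detects a 0x (hex) or leading 0 (octal) prefix. */
    	error = ddi_strtoul(str, &end, 0, result);
    	if (error != 0 || *end != '\0')
    		return (EINVAL);

    	return (0);
    }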
+
+int
+ddi_copyin(const void *from, void *to, size_t len, int flags)
+{
+ /* Fake ioctl() issued by kernel, 'from' is a kernel address */
+ if (flags & FKIOCTL) {
+ memcpy(to, from, len);
+ return (0);
+ }
+
+ return (copyin(from, to, len));
+}
+EXPORT_SYMBOL(ddi_copyin);
+
+int
+ddi_copyout(const void *from, void *to, size_t len, int flags)
+{
+ /* Fake ioctl() issued by kernel, 'from' is a kernel address */
+ if (flags & FKIOCTL) {
+ memcpy(to, from, len);
+ return (0);
+ }
+
+ return (copyout(from, to, len));
+}
+EXPORT_SYMBOL(ddi_copyout);
+
+/*
+ * Read the unique system identifier from the /etc/hostid file.
+ *
+ * The behavior of /usr/bin/hostid on Linux systems with the
+ * regular eglibc and coreutils is:
+ *
+ * 1. Generate the value if the /etc/hostid file does not exist
+ * or if the /etc/hostid file is less than four bytes in size.
+ *
+ * 2. If the /etc/hostid file is at least 4 bytes, then return
+ * the first four bytes [0..3] in native endian order.
+ *
+ * 3. Always ignore bytes [4..] if they exist in the file.
+ *
+ * Only the first four bytes are significant, even on systems that
+ * have a 64-bit word size.
+ *
+ * See:
+ *
+ * eglibc: sysdeps/unix/sysv/linux/gethostid.c
+ * coreutils: src/hostid.c
+ *
+ * Notes:
+ *
+ * The /etc/hostid file on Solaris is a text file that often reads:
+ *
+ * # DO NOT EDIT
+ * "0123456789"
+ *
+ * Directly copying this file to Linux results in a constant
+ * hostid of 4f442023 because the default comment constitutes
+ * the first four bytes of the file.
+ *
+ */
+
+char *spl_hostid_path = HW_HOSTID_PATH;
+module_param(spl_hostid_path, charp, 0444);
+MODULE_PARM_DESC(spl_hostid_path, "The system hostid file (/etc/hostid)");
+
+static int
+hostid_read(uint32_t *hostid)
+{
+ uint64_t size;
+ struct _buf *file;
+ uint32_t value = 0;
+ int error;
+
+ file = kobj_open_file(spl_hostid_path);
+ if (file == (struct _buf *)-1)
+ return (ENOENT);
+
+ error = kobj_get_filesize(file, &size);
+ if (error) {
+ kobj_close_file(file);
+ return (error);
+ }
+
+ if (size < sizeof (HW_HOSTID_MASK)) {
+ kobj_close_file(file);
+ return (EINVAL);
+ }
+
+ /*
+ * Read directly into the variable like eglibc does.
+ * Short reads are okay; native behavior is preserved.
+ */
+ error = kobj_read_file(file, (char *)&value, sizeof (value), 0);
+ if (error < 0) {
+ kobj_close_file(file);
+ return (EIO);
+ }
+
+ /* Mask down to 32 bits like coreutils does. */
+ *hostid = (value & HW_HOSTID_MASK);
+ kobj_close_file(file);
+
+ return (0);
+}
+
+/*
+ * Return the system hostid. Preferentially use the spl_hostid module option
+ * when set, otherwise use the value in the /etc/hostid file.
+ */
+uint32_t
+zone_get_hostid(void *zone)
+{
+ uint32_t hostid;
+
+ ASSERT3P(zone, ==, NULL);
+
+ if (spl_hostid != 0)
+ return ((uint32_t)(spl_hostid & HW_HOSTID_MASK));
+
+ if (hostid_read(&hostid) == 0)
+ return (hostid);
+
+ return (0);
+}
+EXPORT_SYMBOL(zone_get_hostid);
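A hedged sketch (not from the merged sources) of how callers obtain the 32-bit hostid: passing NULL selects the global zone, and a non-zero spl_hostid module parameter takes precedence over the contents of spl_hostid_path (/etc/hostid). The example_report_hostid() name is hypothetical:

    #include <linux/kernel.h>
    #include <sys/types.h>

    static void
    example_report_hostid(void)
    {
    	uint32_t hostid = zone_get_hostid(NULL);

    	if (hostid == 0)
    		printk(KERN_NOTICE "example: no hostid configured\n");
    	else
    		printk(KERN_NOTICE "example: hostid %08x\n", hostid);
    }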
+
+static int
+spl_kvmem_init(void)
+{
+ int rc = 0;
+
+ rc = spl_kmem_init();
+ if (rc)
+ return (rc);
+
+ rc = spl_vmem_init();
+ if (rc) {
+ spl_kmem_fini();
+ return (rc);
+ }
+
+ return (rc);
+}
+
+/*
+ * We initialize the random number generator with 128 bits of entropy from the
+ * system random number generator. In the improbable case that we have a zero
+ * seed, we fall back to the system jiffies, unless it is also zero, in which
+ * case we use a preprogrammed seed. We step forward by 2^64 iterations to
+ * initialize each of the per-cpu seeds so that the sequences generated on each
+ * CPU are guaranteed to never overlap in practice.
+ */
+static void __init
+spl_random_init(void)
+{
+ uint64_t s[2];
+ int i;
+
+ get_random_bytes(s, sizeof (s));
+
+ if (s[0] == 0 && s[1] == 0) {
+ if (jiffies != 0) {
+ s[0] = jiffies;
+ s[1] = ~0 - jiffies;
+ } else {
+ (void) memcpy(s, "improbable seed", sizeof (s));
+ }
+ printk("SPL: get_random_bytes() returned 0 "
+ "when generating random seed. Setting initial seed to "
+ "0x%016llx%016llx.", cpu_to_be64(s[0]), cpu_to_be64(s[1]));
+ }
+
+ for_each_possible_cpu(i) {
+ uint64_t *wordp = per_cpu(spl_pseudo_entropy, i);
+
+ spl_rand_jump(s);
+
+ wordp[0] = s[0];
+ wordp[1] = s[1];
+ }
+}
+
+static void
+spl_kvmem_fini(void)
+{
+ spl_vmem_fini();
+ spl_kmem_fini();
+}
+
+static int __init
+spl_init(void)
+{
+ int rc = 0;
+
+ bzero(&p0, sizeof (proc_t));
+ spl_random_init();
+
+ if ((rc = spl_kvmem_init()))
+ goto out1;
+
+ if ((rc = spl_mutex_init()))
+ goto out2;
+
+ if ((rc = spl_rw_init()))
+ goto out3;
+
+ if ((rc = spl_tsd_init()))
+ goto out4;
+
+ if ((rc = spl_taskq_init()))
+ goto out5;
+
+ if ((rc = spl_kmem_cache_init()))
+ goto out6;
+
+ if ((rc = spl_vn_init()))
+ goto out7;
+
+ if ((rc = spl_proc_init()))
+ goto out8;
+
+ if ((rc = spl_kstat_init()))
+ goto out9;
+
+ if ((rc = spl_zlib_init()))
+ goto out10;
+
+ printk(KERN_NOTICE "SPL: Loaded module v%s-%s%s\n", SPL_META_VERSION,
+ SPL_META_RELEASE, SPL_DEBUG_STR);
+ return (rc);
+
+out10:
+ spl_kstat_fini();
+out9:
+ spl_proc_fini();
+out8:
+ spl_vn_fini();
+out7:
+ spl_kmem_cache_fini();
+out6:
+ spl_taskq_fini();
+out5:
+ spl_tsd_fini();
+out4:
+ spl_rw_fini();
+out3:
+ spl_mutex_fini();
+out2:
+ spl_kvmem_fini();
+out1:
+ printk(KERN_NOTICE "SPL: Failed to Load Solaris Porting Layer "
+ "v%s-%s%s, rc = %d\n", SPL_META_VERSION, SPL_META_RELEASE,
+ SPL_DEBUG_STR, rc);
+
+ return (rc);
+}
+
+static void __exit
+spl_fini(void)
+{
+ printk(KERN_NOTICE "SPL: Unloaded module v%s-%s%s\n",
+ SPL_META_VERSION, SPL_META_RELEASE, SPL_DEBUG_STR);
+ spl_zlib_fini();
+ spl_kstat_fini();
+ spl_proc_fini();
+ spl_vn_fini();
+ spl_kmem_cache_fini();
+ spl_taskq_fini();
+ spl_tsd_fini();
+ spl_rw_fini();
+ spl_mutex_fini();
+ spl_kvmem_fini();
+}
+
+module_init(spl_init);
+module_exit(spl_fini);
+
+MODULE_DESCRIPTION("Solaris Porting Layer");
+MODULE_AUTHOR(SPL_META_AUTHOR);
+MODULE_LICENSE(SPL_META_LICENSE);
+MODULE_VERSION(SPL_META_VERSION "-" SPL_META_RELEASE);
diff --git a/module/spl/spl-kmem-cache.c b/module/spl/spl-kmem-cache.c
new file mode 100644
index 000000000..5492c6a46
--- /dev/null
+++ b/module/spl/spl-kmem-cache.c
@@ -0,0 +1,1769 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <[email protected]>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <sys/kmem.h>
+#include <sys/kmem_cache.h>
+#include <sys/shrinker.h>
+#include <sys/taskq.h>
+#include <sys/timer.h>
+#include <sys/vmem.h>
+#include <sys/wait.h>
+#include <linux/slab.h>
+#include <linux/swap.h>
+#include <linux/prefetch.h>
+
+/*
+ * Within the scope of this file the kmem_cache_* definitions are
+ * removed to allow access to the real Linux slab allocator.
+ */
+#undef kmem_cache_destroy
+#undef kmem_cache_create
+#undef kmem_cache_alloc
+#undef kmem_cache_free
+
+
+/*
+ * Linux 3.16 replaced smp_mb__{before,after}_{atomic,clear}_{dec,inc,bit}()
+ * with smp_mb__{before,after}_atomic() because they were redundant. This is
+ * only used inside our SLAB allocator, so we implement an internal wrapper
+ * here to give us smp_mb__{before,after}_atomic() on older kernels.
+ */
+#ifndef smp_mb__before_atomic
+#define smp_mb__before_atomic(x) smp_mb__before_clear_bit(x)
+#endif
+
+#ifndef smp_mb__after_atomic
+#define smp_mb__after_atomic(x) smp_mb__after_clear_bit(x)
+#endif
+
+/*
+ * Cache expiration was implemented because it was part of the default Solaris
+ * kmem_cache behavior. The idea is that per-cpu objects which haven't been
+ * accessed in several seconds should be returned to the cache. On the other
+ * hand Linux slabs never move objects back to the slabs unless there is
+ * memory pressure on the system. By default the Linux method is enabled
+ * because it has been shown to improve responsiveness on low memory systems.
+ * This policy may be changed by setting KMC_EXPIRE_AGE or KMC_EXPIRE_MEM.
+ */
+/* BEGIN CSTYLED */
+unsigned int spl_kmem_cache_expire = KMC_EXPIRE_MEM;
+EXPORT_SYMBOL(spl_kmem_cache_expire);
+module_param(spl_kmem_cache_expire, uint, 0644);
+MODULE_PARM_DESC(spl_kmem_cache_expire, "By age (0x1) or low memory (0x2)");
+
+/*
+ * Cache magazines are an optimization designed to minimize the cost of
+ * allocating memory. They do this by keeping a per-cpu cache of recently
+ * freed objects, which can then be reallocated without taking a lock. This
+ * can improve performance on highly contended caches. However, because
+ * objects in magazines will prevent otherwise empty slabs from being
+ * immediately released this may not be ideal for low memory machines.
+ *
+ * For this reason spl_kmem_cache_magazine_size can be used to set a maximum
+ * magazine size. When this value is set to 0 the magazine size will be
+ * automatically determined based on the object size. Otherwise magazines
+ * will be limited to 2-256 objects per magazine (i.e., per cpu). Magazines
+ * may never be entirely disabled in this implementation.
+ */
+unsigned int spl_kmem_cache_magazine_size = 0;
+module_param(spl_kmem_cache_magazine_size, uint, 0444);
+MODULE_PARM_DESC(spl_kmem_cache_magazine_size,
+ "Default magazine size (2-256), set automatically (0)");
+
+/*
+ * The default behavior is to report the number of objects remaining in the
+ * cache. This allows the Linux VM to repeatedly reclaim objects from the
+ * cache when memory is low to satisfy other memory allocations. Alternatively,
+ * setting this value to KMC_RECLAIM_ONCE limits how aggressively the cache
+ * is reclaimed. This may increase the likelihood of out of memory events.
+ */
+unsigned int spl_kmem_cache_reclaim = 0 /* KMC_RECLAIM_ONCE */;
+module_param(spl_kmem_cache_reclaim, uint, 0644);
+MODULE_PARM_DESC(spl_kmem_cache_reclaim, "Single reclaim pass (0x1)");
+
+unsigned int spl_kmem_cache_obj_per_slab = SPL_KMEM_CACHE_OBJ_PER_SLAB;
+module_param(spl_kmem_cache_obj_per_slab, uint, 0644);
+MODULE_PARM_DESC(spl_kmem_cache_obj_per_slab, "Number of objects per slab");
+
+unsigned int spl_kmem_cache_obj_per_slab_min = SPL_KMEM_CACHE_OBJ_PER_SLAB_MIN;
+module_param(spl_kmem_cache_obj_per_slab_min, uint, 0644);
+MODULE_PARM_DESC(spl_kmem_cache_obj_per_slab_min,
+ "Minimal number of objects per slab");
+
+unsigned int spl_kmem_cache_max_size = SPL_KMEM_CACHE_MAX_SIZE;
+module_param(spl_kmem_cache_max_size, uint, 0644);
+MODULE_PARM_DESC(spl_kmem_cache_max_size, "Maximum size of slab in MB");
+
+/*
+ * For small objects the Linux slab allocator should be used to make the most
+ * efficient use of the memory. However, large objects are not supported by
+ * the Linux slab and therefore the SPL implementation is preferred. A cutoff
+ * of 16K was determined to be optimal for architectures using 4K pages.
+ */
+#if PAGE_SIZE == 4096
+unsigned int spl_kmem_cache_slab_limit = 16384;
+#else
+unsigned int spl_kmem_cache_slab_limit = 0;
+#endif
+module_param(spl_kmem_cache_slab_limit, uint, 0644);
+MODULE_PARM_DESC(spl_kmem_cache_slab_limit,
+ "Objects less than N bytes use the Linux slab");
+
+/*
+ * This value defaults to a threshold designed to avoid allocations which
+ * have been deemed costly by the kernel.
+ */
+unsigned int spl_kmem_cache_kmem_limit =
+ ((1 << (PAGE_ALLOC_COSTLY_ORDER - 1)) * PAGE_SIZE) /
+ SPL_KMEM_CACHE_OBJ_PER_SLAB;
+module_param(spl_kmem_cache_kmem_limit, uint, 0644);
+MODULE_PARM_DESC(spl_kmem_cache_kmem_limit,
+ "Objects less than N bytes use the kmalloc");
+
+/*
+ * The number of threads available to allocate new slabs for caches. This
+ * should not need to be tuned but it is available for performance analysis.
+ */
+unsigned int spl_kmem_cache_kmem_threads = 4;
+module_param(spl_kmem_cache_kmem_threads, uint, 0444);
+MODULE_PARM_DESC(spl_kmem_cache_kmem_threads,
+ "Number of spl_kmem_cache threads");
+/* END CSTYLED */
+
+/*
+ * Slab allocation interfaces
+ *
+ * While the Linux slab implementation was inspired by the Solaris
+ * implementation I cannot use it to emulate the Solaris APIs. I
+ * require two features which are not provided by the Linux slab.
+ *
+ * 1) Constructors AND destructors. Recent versions of the Linux
+ * kernel have removed support for destructors. This is a deal
+ * breaker for the SPL which contains particularly expensive
+ *    initializers for mutexes, condition variables, etc.  We also
+ *    require a minimal level of cleanup for these data types, unlike
+ *    many Linux data types which do not need to be explicitly destroyed.
+ *
+ * 2) Virtual address space backed slab. Callers of the Solaris slab
+ *    expect it to work well for both small and very large allocations.
+ * Because of memory fragmentation the Linux slab which is backed
+ * by kmalloc'ed memory performs very badly when confronted with
+ * large numbers of large allocations. Basing the slab on the
+ * virtual address space removes the need for contiguous pages
+ *    and greatly improves performance for large allocations.
+ *
+ * For these reasons, the SPL has its own slab implementation with
+ * the needed features. It is not as highly optimized as either the
+ * Solaris or Linux slabs, but it should get me most of what is
+ * needed until it can be optimized or obsoleted by another approach.
+ *
+ * One serious concern I do have about this method is the relatively
+ * small virtual address space on 32bit arches. This will seriously
+ * constrain the size of the slab caches and their performance.
+ */
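A hedged sketch (not from the merged sources) of the consumer-facing, Solaris-style cache API whose implementation begins below, assuming the kmem_cache_* macros in sys/kmem_cache.h map onto the spl_kmem_cache_* functions in this file; the example_node type and helpers are hypothetical:

    #include <sys/kmem.h>
    #include <sys/kmem_cache.h>
    #include <sys/mutex.h>

    typedef struct example_node {
    	kmutex_t	en_lock;
    	int		en_state;
    } example_node_t;

    /*
     * Expensive per-object setup and teardown is exactly why constructors
     * and destructors (feature 1 above) are required.
     */
    static int
    example_node_ctor(void *buf, void *priv, int kmflags)
    {
    	example_node_t *en = buf;

    	mutex_init(&en->en_lock, NULL, MUTEX_DEFAULT, NULL);
    	en->en_state = 0;

    	return (0);
    }

    static void
    example_node_dtor(void *buf, void *priv)
    {
    	example_node_t *en = buf;

    	mutex_destroy(&en->en_lock);
    }

    static kmem_cache_t *example_cache;

    static void
    example_cache_setup(void)
    {
    	example_cache = kmem_cache_create("example_node_cache",
    	    sizeof (example_node_t), 0, example_node_ctor,
    	    example_node_dtor, NULL, NULL, NULL, 0);
    }

    static void
    example_cache_use(void)
    {
    	example_node_t *en = kmem_cache_alloc(example_cache, KM_SLEEP);

    	/* ... use 'en' ... */
    	kmem_cache_free(example_cache, en);
    }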
+
+struct list_head spl_kmem_cache_list; /* List of caches */
+struct rw_semaphore spl_kmem_cache_sem; /* Cache list lock */
+taskq_t *spl_kmem_cache_taskq; /* Task queue for ageing / reclaim */
+
+static void spl_cache_shrink(spl_kmem_cache_t *skc, void *obj);
+
+SPL_SHRINKER_CALLBACK_FWD_DECLARE(spl_kmem_cache_generic_shrinker);
+SPL_SHRINKER_DECLARE(spl_kmem_cache_shrinker,
+ spl_kmem_cache_generic_shrinker, KMC_DEFAULT_SEEKS);
+
+static void *
+kv_alloc(spl_kmem_cache_t *skc, int size, int flags)
+{
+ gfp_t lflags = kmem_flags_convert(flags);
+ void *ptr;
+
+ if (skc->skc_flags & KMC_KMEM) {
+ ASSERT(ISP2(size));
+ ptr = (void *)__get_free_pages(lflags, get_order(size));
+ } else {
+ ptr = __vmalloc(size, lflags | __GFP_HIGHMEM, PAGE_KERNEL);
+ }
+
+ /* Resulting allocated memory will be page aligned */
+ ASSERT(IS_P2ALIGNED(ptr, PAGE_SIZE));
+
+ return (ptr);
+}
+
+static void
+kv_free(spl_kmem_cache_t *skc, void *ptr, int size)
+{
+ ASSERT(IS_P2ALIGNED(ptr, PAGE_SIZE));
+
+ /*
+ * The Linux direct reclaim path uses this out of band value to
+ * determine if forward progress is being made. Normally this is
+ * incremented by kmem_freepages() which is part of the various
+ * Linux slab implementations. However, since we are using none
+ * of that infrastructure we are responsible for incrementing it.
+ */
+ if (current->reclaim_state)
+ current->reclaim_state->reclaimed_slab += size >> PAGE_SHIFT;
+
+ if (skc->skc_flags & KMC_KMEM) {
+ ASSERT(ISP2(size));
+ free_pages((unsigned long)ptr, get_order(size));
+ } else {
+ vfree(ptr);
+ }
+}
+
+/*
+ * Required space for each aligned sks.
+ */
+static inline uint32_t
+spl_sks_size(spl_kmem_cache_t *skc)
+{
+ return (P2ROUNDUP_TYPED(sizeof (spl_kmem_slab_t),
+ skc->skc_obj_align, uint32_t));
+}
+
+/*
+ * Required space for each aligned object.
+ */
+static inline uint32_t
+spl_obj_size(spl_kmem_cache_t *skc)
+{
+ uint32_t align = skc->skc_obj_align;
+
+ return (P2ROUNDUP_TYPED(skc->skc_obj_size, align, uint32_t) +
+ P2ROUNDUP_TYPED(sizeof (spl_kmem_obj_t), align, uint32_t));
+}
+
+/*
+ * Look up the spl_kmem_obj_t for an object, given a pointer to that object.
+ */
+static inline spl_kmem_obj_t *
+spl_sko_from_obj(spl_kmem_cache_t *skc, void *obj)
+{
+ return (obj + P2ROUNDUP_TYPED(skc->skc_obj_size,
+ skc->skc_obj_align, uint32_t));
+}
+
+/*
+ * Required space for each offslab object, taking into account alignment
+ * restrictions and the power-of-two requirement of kv_alloc().
+ */
+static inline uint32_t
+spl_offslab_size(spl_kmem_cache_t *skc)
+{
+ return (1UL << (fls64(spl_obj_size(skc)) + 1));
+}
+
+/*
+ * It's important that we pack the spl_kmem_obj_t structure and the
+ * actual objects into one large address space to minimize the number
+ * of calls to the allocator. It is far better to do a few large
+ * allocations and then subdivide it ourselves. Now which allocator
+ * we use requires balancing a few trade offs.
+ *
+ * For small objects we use kmem_alloc() because as long as you are
+ * only requesting a small number of pages (ideally just one) its cheap.
+ * However, when you start requesting multiple pages with kmem_alloc()
+ * it gets increasingly expensive since it requires contiguous pages.
+ * For this reason we shift to vmem_alloc() for slabs of large objects
+ * which removes the need for contiguous pages. We do not use
+ * vmem_alloc() in all cases because there is significant locking
+ * overhead in __get_vm_area_node(). This function takes a single
+ * global lock when acquiring an available virtual address range which
+ * serializes all vmem_alloc()'s for all slab caches. Using slightly
+ * different allocation functions for small and large objects should
+ * give us the best of both worlds.
+ *
+ * KMC_ONSLAB KMC_OFFSLAB
+ *
+ * +------------------------+ +-----------------+
+ * | spl_kmem_slab_t --+-+ | | spl_kmem_slab_t |---+-+
+ * | skc_obj_size <-+ | | +-----------------+ | |
+ * | spl_kmem_obj_t | | | |
+ * | skc_obj_size <---+ | +-----------------+ | |
+ * | spl_kmem_obj_t | | | skc_obj_size | <-+ |
+ * | ... v | | spl_kmem_obj_t | |
+ * +------------------------+ +-----------------+ v
+ */
+static spl_kmem_slab_t *
+spl_slab_alloc(spl_kmem_cache_t *skc, int flags)
+{
+ spl_kmem_slab_t *sks;
+ spl_kmem_obj_t *sko, *n;
+ void *base, *obj;
+ uint32_t obj_size, offslab_size = 0;
+ int i, rc = 0;
+
+ base = kv_alloc(skc, skc->skc_slab_size, flags);
+ if (base == NULL)
+ return (NULL);
+
+ sks = (spl_kmem_slab_t *)base;
+ sks->sks_magic = SKS_MAGIC;
+ sks->sks_objs = skc->skc_slab_objs;
+ sks->sks_age = jiffies;
+ sks->sks_cache = skc;
+ INIT_LIST_HEAD(&sks->sks_list);
+ INIT_LIST_HEAD(&sks->sks_free_list);
+ sks->sks_ref = 0;
+ obj_size = spl_obj_size(skc);
+
+ if (skc->skc_flags & KMC_OFFSLAB)
+ offslab_size = spl_offslab_size(skc);
+
+ for (i = 0; i < sks->sks_objs; i++) {
+ if (skc->skc_flags & KMC_OFFSLAB) {
+ obj = kv_alloc(skc, offslab_size, flags);
+ if (!obj) {
+ rc = -ENOMEM;
+ goto out;
+ }
+ } else {
+ obj = base + spl_sks_size(skc) + (i * obj_size);
+ }
+
+ ASSERT(IS_P2ALIGNED(obj, skc->skc_obj_align));
+ sko = spl_sko_from_obj(skc, obj);
+ sko->sko_addr = obj;
+ sko->sko_magic = SKO_MAGIC;
+ sko->sko_slab = sks;
+ INIT_LIST_HEAD(&sko->sko_list);
+ list_add_tail(&sko->sko_list, &sks->sks_free_list);
+ }
+
+out:
+ if (rc) {
+ if (skc->skc_flags & KMC_OFFSLAB)
+ list_for_each_entry_safe(sko,
+ n, &sks->sks_free_list, sko_list) {
+ kv_free(skc, sko->sko_addr, offslab_size);
+ }
+
+ kv_free(skc, base, skc->skc_slab_size);
+ sks = NULL;
+ }
+
+ return (sks);
+}
+
+/*
+ * Remove a slab from the complete or partial list. This must be called
+ * with 'skc->skc_lock' held, but the actual free must be performed
+ * outside the lock to prevent deadlocking on vmem addresses.
+ */
+static void
+spl_slab_free(spl_kmem_slab_t *sks,
+ struct list_head *sks_list, struct list_head *sko_list)
+{
+ spl_kmem_cache_t *skc;
+
+ ASSERT(sks->sks_magic == SKS_MAGIC);
+ ASSERT(sks->sks_ref == 0);
+
+ skc = sks->sks_cache;
+ ASSERT(skc->skc_magic == SKC_MAGIC);
+
+ /*
+ * Update slab/objects counters in the cache, then remove the
+ * slab from the skc->skc_partial_list. Finally add the slab
+ * and all its objects into the private work lists where the
+ * destructors will be called and the memory freed to the system.
+ */
+ skc->skc_obj_total -= sks->sks_objs;
+ skc->skc_slab_total--;
+ list_del(&sks->sks_list);
+ list_add(&sks->sks_list, sks_list);
+ list_splice_init(&sks->sks_free_list, sko_list);
+}
+
+/*
+ * Reclaim empty slabs at the end of the partial list.
+ */
+static void
+spl_slab_reclaim(spl_kmem_cache_t *skc)
+{
+ spl_kmem_slab_t *sks, *m;
+ spl_kmem_obj_t *sko, *n;
+ LIST_HEAD(sks_list);
+ LIST_HEAD(sko_list);
+ uint32_t size = 0;
+
+ /*
+ * Empty slabs and objects must be moved to a private list so they
+ * can be safely freed outside the spin lock. All empty slabs are
+ * at the end of skc->skc_partial_list, therefore once a non-empty
+ * slab is found we can stop scanning.
+ */
+ spin_lock(&skc->skc_lock);
+ list_for_each_entry_safe_reverse(sks, m,
+ &skc->skc_partial_list, sks_list) {
+
+ if (sks->sks_ref > 0)
+ break;
+
+ spl_slab_free(sks, &sks_list, &sko_list);
+ }
+ spin_unlock(&skc->skc_lock);
+
+ /*
+ * The following two loops ensure all the object destructors are
+ * run, any offslab objects are freed, and the slabs themselves
+ * are freed. This is all done outside the skc->skc_lock since
+ * this allows the destructor to sleep, and allows us to perform
+ * a conditional reschedule when freeing a large number of
+ * objects and slabs back to the system.
+ */
+ if (skc->skc_flags & KMC_OFFSLAB)
+ size = spl_offslab_size(skc);
+
+ list_for_each_entry_safe(sko, n, &sko_list, sko_list) {
+ ASSERT(sko->sko_magic == SKO_MAGIC);
+
+ if (skc->skc_flags & KMC_OFFSLAB)
+ kv_free(skc, sko->sko_addr, size);
+ }
+
+ list_for_each_entry_safe(sks, m, &sks_list, sks_list) {
+ ASSERT(sks->sks_magic == SKS_MAGIC);
+ kv_free(skc, sks, skc->skc_slab_size);
+ }
+}
+
+static spl_kmem_emergency_t *
+spl_emergency_search(struct rb_root *root, void *obj)
+{
+ struct rb_node *node = root->rb_node;
+ spl_kmem_emergency_t *ske;
+ unsigned long address = (unsigned long)obj;
+
+ while (node) {
+ ske = container_of(node, spl_kmem_emergency_t, ske_node);
+
+ if (address < ske->ske_obj)
+ node = node->rb_left;
+ else if (address > ske->ske_obj)
+ node = node->rb_right;
+ else
+ return (ske);
+ }
+
+ return (NULL);
+}
+
+static int
+spl_emergency_insert(struct rb_root *root, spl_kmem_emergency_t *ske)
+{
+ struct rb_node **new = &(root->rb_node), *parent = NULL;
+ spl_kmem_emergency_t *ske_tmp;
+ unsigned long address = ske->ske_obj;
+
+ while (*new) {
+ ske_tmp = container_of(*new, spl_kmem_emergency_t, ske_node);
+
+ parent = *new;
+ if (address < ske_tmp->ske_obj)
+ new = &((*new)->rb_left);
+ else if (address > ske_tmp->ske_obj)
+ new = &((*new)->rb_right);
+ else
+ return (0);
+ }
+
+ rb_link_node(&ske->ske_node, parent, new);
+ rb_insert_color(&ske->ske_node, root);
+
+ return (1);
+}
+
+/*
+ * Allocate a single emergency object and track it in a red black tree.
+ */
+static int
+spl_emergency_alloc(spl_kmem_cache_t *skc, int flags, void **obj)
+{
+ gfp_t lflags = kmem_flags_convert(flags);
+ spl_kmem_emergency_t *ske;
+ int order = get_order(skc->skc_obj_size);
+ int empty;
+
+ /* Last chance, use a partial slab if one now exists */
+ spin_lock(&skc->skc_lock);
+ empty = list_empty(&skc->skc_partial_list);
+ spin_unlock(&skc->skc_lock);
+ if (!empty)
+ return (-EEXIST);
+
+ ske = kmalloc(sizeof (*ske), lflags);
+ if (ske == NULL)
+ return (-ENOMEM);
+
+ ske->ske_obj = __get_free_pages(lflags, order);
+ if (ske->ske_obj == 0) {
+ kfree(ske);
+ return (-ENOMEM);
+ }
+
+ spin_lock(&skc->skc_lock);
+ empty = spl_emergency_insert(&skc->skc_emergency_tree, ske);
+ if (likely(empty)) {
+ skc->skc_obj_total++;
+ skc->skc_obj_emergency++;
+ if (skc->skc_obj_emergency > skc->skc_obj_emergency_max)
+ skc->skc_obj_emergency_max = skc->skc_obj_emergency;
+ }
+ spin_unlock(&skc->skc_lock);
+
+ if (unlikely(!empty)) {
+ free_pages(ske->ske_obj, order);
+ kfree(ske);
+ return (-EINVAL);
+ }
+
+ *obj = (void *)ske->ske_obj;
+
+ return (0);
+}
+
+/*
+ * Locate the passed object in the red black tree and free it.
+ */
+static int
+spl_emergency_free(spl_kmem_cache_t *skc, void *obj)
+{
+ spl_kmem_emergency_t *ske;
+ int order = get_order(skc->skc_obj_size);
+
+ spin_lock(&skc->skc_lock);
+ ske = spl_emergency_search(&skc->skc_emergency_tree, obj);
+ if (ske) {
+ rb_erase(&ske->ske_node, &skc->skc_emergency_tree);
+ skc->skc_obj_emergency--;
+ skc->skc_obj_total--;
+ }
+ spin_unlock(&skc->skc_lock);
+
+ if (ske == NULL)
+ return (-ENOENT);
+
+ free_pages(ske->ske_obj, order);
+ kfree(ske);
+
+ return (0);
+}
+
+/*
+ * Release objects from the per-cpu magazine back to their slab. The flush
+ * argument contains the max number of entries to remove from the magazine.
+ */
+static void
+__spl_cache_flush(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flush)
+{
+ int i, count = MIN(flush, skm->skm_avail);
+
+ ASSERT(skc->skc_magic == SKC_MAGIC);
+ ASSERT(skm->skm_magic == SKM_MAGIC);
+
+ for (i = 0; i < count; i++)
+ spl_cache_shrink(skc, skm->skm_objs[i]);
+
+ skm->skm_avail -= count;
+ memmove(skm->skm_objs, &(skm->skm_objs[count]),
+ sizeof (void *) * skm->skm_avail);
+}
+
+static void
+spl_cache_flush(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flush)
+{
+ spin_lock(&skc->skc_lock);
+ __spl_cache_flush(skc, skm, flush);
+ spin_unlock(&skc->skc_lock);
+}
+
+static void
+spl_magazine_age(void *data)
+{
+ spl_kmem_cache_t *skc = (spl_kmem_cache_t *)data;
+ spl_kmem_magazine_t *skm = skc->skc_mag[smp_processor_id()];
+
+ ASSERT(skm->skm_magic == SKM_MAGIC);
+ ASSERT(skm->skm_cpu == smp_processor_id());
+ ASSERT(irqs_disabled());
+
+ /* There are no available objects or they are too young to age out */
+ if ((skm->skm_avail == 0) ||
+ time_before(jiffies, skm->skm_age + skc->skc_delay * HZ))
+ return;
+
+ /*
+ * Because we're executing in interrupt context we may have
+ * interrupted the holder of this lock. To avoid a potential
+ * deadlock return if the lock is contended.
+ */
+ if (!spin_trylock(&skc->skc_lock))
+ return;
+
+ __spl_cache_flush(skc, skm, skm->skm_refill);
+ spin_unlock(&skc->skc_lock);
+}
+
+/*
+ * Called regularly to keep a downward pressure on the cache.
+ *
+ * Objects older than skc->skc_delay seconds in the per-cpu magazines will
+ * be returned to their slabs. This is done to prevent idle magazines from
+ * holding memory which could be better used elsewhere. The delay is
+ * present to prevent thrashing the magazine.
+ *
+ * The newly released objects may result in empty partial slabs. Those
+ * slabs should be released to the system. Otherwise moving the objects
+ * out of the magazines is just wasted work.
+ */
+static void
+spl_cache_age(void *data)
+{
+ spl_kmem_cache_t *skc = (spl_kmem_cache_t *)data;
+ taskqid_t id = 0;
+
+ ASSERT(skc->skc_magic == SKC_MAGIC);
+
+ /* Dynamically disabled at run time */
+ if (!(spl_kmem_cache_expire & KMC_EXPIRE_AGE))
+ return;
+
+ atomic_inc(&skc->skc_ref);
+
+ if (!(skc->skc_flags & KMC_NOMAGAZINE))
+ on_each_cpu(spl_magazine_age, skc, 1);
+
+ spl_slab_reclaim(skc);
+
+ while (!test_bit(KMC_BIT_DESTROY, &skc->skc_flags) && !id) {
+ id = taskq_dispatch_delay(
+ spl_kmem_cache_taskq, spl_cache_age, skc, TQ_SLEEP,
+ ddi_get_lbolt() + skc->skc_delay / 3 * HZ);
+
+ /* Destroy issued after dispatch immediately cancel it */
+ if (test_bit(KMC_BIT_DESTROY, &skc->skc_flags) && id)
+ taskq_cancel_id(spl_kmem_cache_taskq, id);
+ }
+
+ spin_lock(&skc->skc_lock);
+ skc->skc_taskqid = id;
+ spin_unlock(&skc->skc_lock);
+
+ atomic_dec(&skc->skc_ref);
+}
+
+/*
+ * Size a slab based on the size of each aligned object plus spl_kmem_obj_t.
+ * When on-slab we want to target spl_kmem_cache_obj_per_slab. However,
+ * for very small objects we may end up with more than this so as not
+ * to waste space in the minimal allocation of a single page. For
+ * very large objects we may use as few as spl_kmem_cache_obj_per_slab_min;
+ * any lower than this and the sizing fails.
+ */
+static int
+spl_slab_size(spl_kmem_cache_t *skc, uint32_t *objs, uint32_t *size)
+{
+ uint32_t sks_size, obj_size, max_size, tgt_size, tgt_objs;
+
+ if (skc->skc_flags & KMC_OFFSLAB) {
+ tgt_objs = spl_kmem_cache_obj_per_slab;
+ tgt_size = P2ROUNDUP(sizeof (spl_kmem_slab_t), PAGE_SIZE);
+
+ if ((skc->skc_flags & KMC_KMEM) &&
+ (spl_obj_size(skc) > (SPL_MAX_ORDER_NR_PAGES * PAGE_SIZE)))
+ return (-ENOSPC);
+ } else {
+ sks_size = spl_sks_size(skc);
+ obj_size = spl_obj_size(skc);
+ max_size = (spl_kmem_cache_max_size * 1024 * 1024);
+ tgt_size = (spl_kmem_cache_obj_per_slab * obj_size + sks_size);
+
+ /*
+ * KMC_KMEM slabs are allocated by __get_free_pages() which
+ * rounds up to the nearest order. Knowing this the size
+ * should be rounded up to the next power of two with a hard
+ * maximum defined by the maximum allowed allocation order.
+ */
+ if (skc->skc_flags & KMC_KMEM) {
+ max_size = SPL_MAX_ORDER_NR_PAGES * PAGE_SIZE;
+ tgt_size = MIN(max_size,
+ PAGE_SIZE * (1 << MAX(get_order(tgt_size) - 1, 1)));
+ }
+
+ if (tgt_size <= max_size) {
+ tgt_objs = (tgt_size - sks_size) / obj_size;
+ } else {
+ tgt_objs = (max_size - sks_size) / obj_size;
+ tgt_size = (tgt_objs * obj_size) + sks_size;
+ }
+ }
+
+ if (tgt_objs == 0)
+ return (-ENOSPC);
+
+ *objs = tgt_objs;
+ *size = tgt_size;
+
+ return (0);
+}
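
As a sizing sketch with assumed values (not measured from a real cache), take a KMC_VMEM cache where spl_obj_size() = 4160, spl_sks_size() = 64, and spl_kmem_cache_obj_per_slab = 8:

	tgt_size = 8 * 4160 + 64 = 33344 bytes
	tgt_objs = (33344 - 64) / 4160 = 8

Since this is below the assumed 32 MiB default max_size cap the slab holds the full 8 objects; a KMC_KMEM cache would first round tgt_size to a power-of-two page order and recompute the object count from that rounded size.
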
+
+/*
+ * Make a guess at reasonable per-cpu magazine size based on the size of
+ * each object and the cost of caching N of them in each magazine. Long
+ * term this should really adapt based on an observed usage heuristic.
+ */
+static int
+spl_magazine_size(spl_kmem_cache_t *skc)
+{
+ uint32_t obj_size = spl_obj_size(skc);
+ int size;
+
+ if (spl_kmem_cache_magazine_size > 0)
+ return (MAX(MIN(spl_kmem_cache_magazine_size, 256), 2));
+
+ /* Per-magazine sizes below assume a 4KiB page size */
+ if (obj_size > (PAGE_SIZE * 256))
+ size = 4; /* Minimum 4MiB per-magazine */
+ else if (obj_size > (PAGE_SIZE * 32))
+ size = 16; /* Minimum 2MiB per-magazine */
+ else if (obj_size > (PAGE_SIZE))
+ size = 64; /* Minimum 256KiB per-magazine */
+ else if (obj_size > (PAGE_SIZE / 4))
+ size = 128; /* Minimum 128KiB per-magazine */
+ else
+ size = 256;
+
+ return (size);
+}
+
+/*
+ * Allocate a per-cpu magazine to associate with a specific core.
+ */
+static spl_kmem_magazine_t *
+spl_magazine_alloc(spl_kmem_cache_t *skc, int cpu)
+{
+ spl_kmem_magazine_t *skm;
+ int size = sizeof (spl_kmem_magazine_t) +
+ sizeof (void *) * skc->skc_mag_size;
+
+ skm = kmalloc_node(size, GFP_KERNEL, cpu_to_node(cpu));
+ if (skm) {
+ skm->skm_magic = SKM_MAGIC;
+ skm->skm_avail = 0;
+ skm->skm_size = skc->skc_mag_size;
+ skm->skm_refill = skc->skc_mag_refill;
+ skm->skm_cache = skc;
+ skm->skm_age = jiffies;
+ skm->skm_cpu = cpu;
+ }
+
+ return (skm);
+}
+
+/*
+ * Free a per-cpu magazine associated with a specific core.
+ */
+static void
+spl_magazine_free(spl_kmem_magazine_t *skm)
+{
+ ASSERT(skm->skm_magic == SKM_MAGIC);
+ ASSERT(skm->skm_avail == 0);
+ kfree(skm);
+}
+
+/*
+ * Create all per-cpu magazines of reasonable sizes.
+ */
+static int
+spl_magazine_create(spl_kmem_cache_t *skc)
+{
+ int i;
+
+ if (skc->skc_flags & KMC_NOMAGAZINE)
+ return (0);
+
+ skc->skc_mag = kzalloc(sizeof (spl_kmem_magazine_t *) *
+ num_possible_cpus(), kmem_flags_convert(KM_SLEEP));
+ skc->skc_mag_size = spl_magazine_size(skc);
+ skc->skc_mag_refill = (skc->skc_mag_size + 1) / 2;
+
+ for_each_possible_cpu(i) {
+ skc->skc_mag[i] = spl_magazine_alloc(skc, i);
+ if (!skc->skc_mag[i]) {
+ for (i--; i >= 0; i--)
+ spl_magazine_free(skc->skc_mag[i]);
+
+ kfree(skc->skc_mag);
+ return (-ENOMEM);
+ }
+ }
+
+ return (0);
+}
+
+/*
+ * Destroy all per-cpu magazines.
+ */
+static void
+spl_magazine_destroy(spl_kmem_cache_t *skc)
+{
+ spl_kmem_magazine_t *skm;
+ int i;
+
+ if (skc->skc_flags & KMC_NOMAGAZINE)
+ return;
+
+ for_each_possible_cpu(i) {
+ skm = skc->skc_mag[i];
+ spl_cache_flush(skc, skm, skm->skm_avail);
+ spl_magazine_free(skm);
+ }
+
+ kfree(skc->skc_mag);
+}
+
+/*
+ * Create an object cache based on the following arguments:
+ * name cache name
+ * size cache object size
+ * align cache object alignment
+ * ctor cache object constructor
+ * dtor cache object destructor
+ * reclaim cache object reclaim
+ * priv cache private data for ctor/dtor/reclaim
+ * vmp unused must be NULL
+ * flags
+ * KMC_NOTOUCH Disable cache object aging (unsupported)
+ * KMC_NODEBUG Disable debugging (unsupported)
+ * KMC_NOHASH Disable hashing (unsupported)
+ * KMC_QCACHE Disable qcache (unsupported)
+ * KMC_NOMAGAZINE Enabled for kmem/vmem, Disabled for Linux slab
+ * KMC_KMEM Force kmem backed cache
+ * KMC_VMEM Force vmem backed cache
+ * KMC_SLAB Force Linux slab backed cache
+ * KMC_OFFSLAB Locate objects off the slab
+ */
+spl_kmem_cache_t *
+spl_kmem_cache_create(char *name, size_t size, size_t align,
+ spl_kmem_ctor_t ctor, spl_kmem_dtor_t dtor, spl_kmem_reclaim_t reclaim,
+ void *priv, void *vmp, int flags)
+{
+ gfp_t lflags = kmem_flags_convert(KM_SLEEP);
+ spl_kmem_cache_t *skc;
+ int rc;
+
+ /*
+ * Unsupported flags
+ */
+ ASSERT0(flags & KMC_NOMAGAZINE);
+ ASSERT0(flags & KMC_NOHASH);
+ ASSERT0(flags & KMC_QCACHE);
+ ASSERT(vmp == NULL);
+
+ might_sleep();
+
+ skc = kzalloc(sizeof (*skc), lflags);
+ if (skc == NULL)
+ return (NULL);
+
+ skc->skc_magic = SKC_MAGIC;
+ skc->skc_name_size = strlen(name) + 1;
+ skc->skc_name = (char *)kmalloc(skc->skc_name_size, lflags);
+ if (skc->skc_name == NULL) {
+ kfree(skc);
+ return (NULL);
+ }
+ strncpy(skc->skc_name, name, skc->skc_name_size);
+
+ skc->skc_ctor = ctor;
+ skc->skc_dtor = dtor;
+ skc->skc_reclaim = reclaim;
+ skc->skc_private = priv;
+ skc->skc_vmp = vmp;
+ skc->skc_linux_cache = NULL;
+ skc->skc_flags = flags;
+ skc->skc_obj_size = size;
+ skc->skc_obj_align = SPL_KMEM_CACHE_ALIGN;
+ skc->skc_delay = SPL_KMEM_CACHE_DELAY;
+ skc->skc_reap = SPL_KMEM_CACHE_REAP;
+ atomic_set(&skc->skc_ref, 0);
+
+ INIT_LIST_HEAD(&skc->skc_list);
+ INIT_LIST_HEAD(&skc->skc_complete_list);
+ INIT_LIST_HEAD(&skc->skc_partial_list);
+ skc->skc_emergency_tree = RB_ROOT;
+ spin_lock_init(&skc->skc_lock);
+ init_waitqueue_head(&skc->skc_waitq);
+ skc->skc_slab_fail = 0;
+ skc->skc_slab_create = 0;
+ skc->skc_slab_destroy = 0;
+ skc->skc_slab_total = 0;
+ skc->skc_slab_alloc = 0;
+ skc->skc_slab_max = 0;
+ skc->skc_obj_total = 0;
+ skc->skc_obj_alloc = 0;
+ skc->skc_obj_max = 0;
+ skc->skc_obj_deadlock = 0;
+ skc->skc_obj_emergency = 0;
+ skc->skc_obj_emergency_max = 0;
+
+ /*
+ * Verify the requested alignment restriction is sane.
+ */
+ if (align) {
+ VERIFY(ISP2(align));
+ VERIFY3U(align, >=, SPL_KMEM_CACHE_ALIGN);
+ VERIFY3U(align, <=, PAGE_SIZE);
+ skc->skc_obj_align = align;
+ }
+
+ /*
+ * When no specific type of slab is requested (kmem, vmem, or
+ * linuxslab) then select a cache type based on the object size
+ * and default tunables.
+ */
+ if (!(skc->skc_flags & (KMC_KMEM | KMC_VMEM | KMC_SLAB))) {
+
+ /*
+ * Objects smaller than spl_kmem_cache_slab_limit can
+ * use the Linux slab for better space-efficiency. By
+ * default this functionality is disabled until its
+ * performance characteristics are fully understood.
+ */
+ if (spl_kmem_cache_slab_limit &&
+ size <= (size_t)spl_kmem_cache_slab_limit)
+ skc->skc_flags |= KMC_SLAB;
+
+ /*
+ * Small objects, less than spl_kmem_cache_kmem_limit per
+ * object should use kmem because their slabs are small.
+ */
+ else if (spl_obj_size(skc) <= spl_kmem_cache_kmem_limit)
+ skc->skc_flags |= KMC_KMEM;
+
+ /*
+ * All other objects are considered large and are placed
+ * on vmem backed slabs.
+ */
+ else
+ skc->skc_flags |= KMC_VMEM;
+ }
+
+ /*
+ * Given the type of slab allocate the required resources.
+ */
+ if (skc->skc_flags & (KMC_KMEM | KMC_VMEM)) {
+ rc = spl_slab_size(skc,
+ &skc->skc_slab_objs, &skc->skc_slab_size);
+ if (rc)
+ goto out;
+
+ rc = spl_magazine_create(skc);
+ if (rc)
+ goto out;
+ } else {
+ unsigned long slabflags = 0;
+
+ if (size > (SPL_MAX_KMEM_ORDER_NR_PAGES * PAGE_SIZE)) {
+ rc = EINVAL;
+ goto out;
+ }
+
+#if defined(SLAB_USERCOPY)
+ /*
+ * Required for PAX-enabled kernels if the slab is to be
+ * used for copying between user and kernel space.
+ */
+ slabflags |= SLAB_USERCOPY;
+#endif
+
+#if defined(HAVE_KMEM_CACHE_CREATE_USERCOPY)
+ /*
+ * Newer grsec patchset uses kmem_cache_create_usercopy()
+ * instead of SLAB_USERCOPY flag
+ */
+ skc->skc_linux_cache = kmem_cache_create_usercopy(
+ skc->skc_name, size, align, slabflags, 0, size, NULL);
+#else
+ skc->skc_linux_cache = kmem_cache_create(
+ skc->skc_name, size, align, slabflags, NULL);
+#endif
+ if (skc->skc_linux_cache == NULL) {
+ rc = ENOMEM;
+ goto out;
+ }
+
+#if defined(HAVE_KMEM_CACHE_ALLOCFLAGS)
+ skc->skc_linux_cache->allocflags |= __GFP_COMP;
+#elif defined(HAVE_KMEM_CACHE_GFPFLAGS)
+ skc->skc_linux_cache->gfpflags |= __GFP_COMP;
+#endif
+ skc->skc_flags |= KMC_NOMAGAZINE;
+ }
+
+ if (spl_kmem_cache_expire & KMC_EXPIRE_AGE)
+ skc->skc_taskqid = taskq_dispatch_delay(spl_kmem_cache_taskq,
+ spl_cache_age, skc, TQ_SLEEP,
+ ddi_get_lbolt() + skc->skc_delay / 3 * HZ);
+
+ down_write(&spl_kmem_cache_sem);
+ list_add_tail(&skc->skc_list, &spl_kmem_cache_list);
+ up_write(&spl_kmem_cache_sem);
+
+ return (skc);
+out:
+ kfree(skc->skc_name);
+ kfree(skc);
+ return (NULL);
+}
+EXPORT_SYMBOL(spl_kmem_cache_create);
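
A minimal sketch of creating such a cache directly; the type and callback names are hypothetical, the constructor/destructor prototypes follow the spl_kmem_ctor_t/spl_kmem_dtor_t signatures used by the calls above, and most consumers would go through the kmem_cache_create() wrapper macro instead:

	typedef struct my_obj {
		uint64_t	mo_id;
	} my_obj_t;

	static int
	my_obj_ctor(void *buf, void *priv, int flags)
	{
		memset(buf, 0, sizeof (my_obj_t));	/* start from a known state */
		return (0);
	}

	static void
	my_obj_dtor(void *buf, void *priv)
	{
		/* nothing to tear down for this trivial object */
	}

	static spl_kmem_cache_t *my_obj_cache;

	static int
	my_obj_cache_init(void)
	{
		/* Default alignment, no reclaim callback, backing picked by size */
		my_obj_cache = spl_kmem_cache_create("my_obj_cache",
		    sizeof (my_obj_t), 0, my_obj_ctor, my_obj_dtor,
		    NULL, NULL, NULL, 0);

		return (my_obj_cache ? 0 : -ENOMEM);
	}
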
+
+/*
+ * Register a move callback for cache defragmentation.
+ * XXX: Unimplemented but harmless to stub out for now.
+ */
+void
+spl_kmem_cache_set_move(spl_kmem_cache_t *skc,
+ kmem_cbrc_t (move)(void *, void *, size_t, void *))
+{
+ ASSERT(move != NULL);
+}
+EXPORT_SYMBOL(spl_kmem_cache_set_move);
+
+/*
+ * Destroy a cache and all objects associated with the cache.
+ */
+void
+spl_kmem_cache_destroy(spl_kmem_cache_t *skc)
+{
+ DECLARE_WAIT_QUEUE_HEAD(wq);
+ taskqid_t id;
+
+ ASSERT(skc->skc_magic == SKC_MAGIC);
+ ASSERT(skc->skc_flags & (KMC_KMEM | KMC_VMEM | KMC_SLAB));
+
+ down_write(&spl_kmem_cache_sem);
+ list_del_init(&skc->skc_list);
+ up_write(&spl_kmem_cache_sem);
+
+ /* Cancel and wait for any pending delayed tasks */
+ VERIFY(!test_and_set_bit(KMC_BIT_DESTROY, &skc->skc_flags));
+
+ spin_lock(&skc->skc_lock);
+ id = skc->skc_taskqid;
+ spin_unlock(&skc->skc_lock);
+
+ taskq_cancel_id(spl_kmem_cache_taskq, id);
+
+ /*
+ * Wait until all current callers complete; this is mainly
+ * to catch the case where a low memory situation triggers a
+ * cache reaping action which races with this destroy.
+ */
+ wait_event(wq, atomic_read(&skc->skc_ref) == 0);
+
+ if (skc->skc_flags & (KMC_KMEM | KMC_VMEM)) {
+ spl_magazine_destroy(skc);
+ spl_slab_reclaim(skc);
+ } else {
+ ASSERT(skc->skc_flags & KMC_SLAB);
+ kmem_cache_destroy(skc->skc_linux_cache);
+ }
+
+ spin_lock(&skc->skc_lock);
+
+ /*
+ * Validate there are no objects in use and free all the
+ * spl_kmem_slab_t, spl_kmem_obj_t, and object buffers.
+ */
+ ASSERT3U(skc->skc_slab_alloc, ==, 0);
+ ASSERT3U(skc->skc_obj_alloc, ==, 0);
+ ASSERT3U(skc->skc_slab_total, ==, 0);
+ ASSERT3U(skc->skc_obj_total, ==, 0);
+ ASSERT3U(skc->skc_obj_emergency, ==, 0);
+ ASSERT(list_empty(&skc->skc_complete_list));
+
+ spin_unlock(&skc->skc_lock);
+
+ kfree(skc->skc_name);
+ kfree(skc);
+}
+EXPORT_SYMBOL(spl_kmem_cache_destroy);
+
+/*
+ * Allocate an object from a slab attached to the cache. This is used to
+ * repopulate the per-cpu magazine caches in batches when they run low.
+ */
+static void *
+spl_cache_obj(spl_kmem_cache_t *skc, spl_kmem_slab_t *sks)
+{
+ spl_kmem_obj_t *sko;
+
+ ASSERT(skc->skc_magic == SKC_MAGIC);
+ ASSERT(sks->sks_magic == SKS_MAGIC);
+
+ sko = list_entry(sks->sks_free_list.next, spl_kmem_obj_t, sko_list);
+ ASSERT(sko->sko_magic == SKO_MAGIC);
+ ASSERT(sko->sko_addr != NULL);
+
+ /* Remove from sks_free_list */
+ list_del_init(&sko->sko_list);
+
+ sks->sks_age = jiffies;
+ sks->sks_ref++;
+ skc->skc_obj_alloc++;
+
+ /* Track max obj usage statistics */
+ if (skc->skc_obj_alloc > skc->skc_obj_max)
+ skc->skc_obj_max = skc->skc_obj_alloc;
+
+ /* Track max slab usage statistics */
+ if (sks->sks_ref == 1) {
+ skc->skc_slab_alloc++;
+
+ if (skc->skc_slab_alloc > skc->skc_slab_max)
+ skc->skc_slab_max = skc->skc_slab_alloc;
+ }
+
+ return (sko->sko_addr);
+}
+
+/*
+ * Generic slab allocation function to be run by the global work queues.
+ * It is responsible for allocating a new slab, linking it in to the list
+ * of partial slabs, and then waking any waiters.
+ */
+static int
+__spl_cache_grow(spl_kmem_cache_t *skc, int flags)
+{
+ spl_kmem_slab_t *sks;
+
+ fstrans_cookie_t cookie = spl_fstrans_mark();
+ sks = spl_slab_alloc(skc, flags);
+ spl_fstrans_unmark(cookie);
+
+ spin_lock(&skc->skc_lock);
+ if (sks) {
+ skc->skc_slab_total++;
+ skc->skc_obj_total += sks->sks_objs;
+ list_add_tail(&sks->sks_list, &skc->skc_partial_list);
+
+ smp_mb__before_atomic();
+ clear_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags);
+ smp_mb__after_atomic();
+ wake_up_all(&skc->skc_waitq);
+ }
+ spin_unlock(&skc->skc_lock);
+
+ return (sks == NULL ? -ENOMEM : 0);
+}
+
+static void
+spl_cache_grow_work(void *data)
+{
+ spl_kmem_alloc_t *ska = (spl_kmem_alloc_t *)data;
+ spl_kmem_cache_t *skc = ska->ska_cache;
+
+ (void) __spl_cache_grow(skc, ska->ska_flags);
+
+ atomic_dec(&skc->skc_ref);
+ smp_mb__before_atomic();
+ clear_bit(KMC_BIT_GROWING, &skc->skc_flags);
+ smp_mb__after_atomic();
+
+ kfree(ska);
+}
+
+/*
+ * Returns non-zero when a new slab should be available.
+ */
+static int
+spl_cache_grow_wait(spl_kmem_cache_t *skc)
+{
+ return (!test_bit(KMC_BIT_GROWING, &skc->skc_flags));
+}
+
+/*
+ * No available objects on any slabs, create a new slab. Note that this
+ * functionality is disabled for KMC_SLAB caches which are backed by the
+ * Linux slab.
+ */
+static int
+spl_cache_grow(spl_kmem_cache_t *skc, int flags, void **obj)
+{
+ int remaining, rc = 0;
+
+ ASSERT0(flags & ~KM_PUBLIC_MASK);
+ ASSERT(skc->skc_magic == SKC_MAGIC);
+ ASSERT((skc->skc_flags & KMC_SLAB) == 0);
+ might_sleep();
+ *obj = NULL;
+
+ /*
+ * Before allocating a new slab wait for any reaping to complete and
+ * then return so the local magazine can be rechecked for new objects.
+ */
+ if (test_bit(KMC_BIT_REAPING, &skc->skc_flags)) {
+ rc = spl_wait_on_bit(&skc->skc_flags, KMC_BIT_REAPING,
+ TASK_UNINTERRUPTIBLE);
+ return (rc ? rc : -EAGAIN);
+ }
+
+ /*
+ * To reduce the overhead of context switches and improve NUMA locality,
+ * we first try to allocate a new slab in the current process context
+ * with the KM_NOSLEEP flag. If that fails, the allocation is dispatched
+ * to the taskq to be performed asynchronously.
+ *
+ * However, this can't be applied to KMC_VMEM caches due to a bug where
+ * __vmalloc() doesn't honor the gfp flags in page table allocation.
+ */
+ if (!(skc->skc_flags & KMC_VMEM)) {
+ rc = __spl_cache_grow(skc, flags | KM_NOSLEEP);
+ if (rc == 0)
+ return (0);
+ }
+
+ /*
+ * This is handled by dispatching a work request to the global work
+ * queue. This allows us to asynchronously allocate a new slab while
+ * retaining the ability to safely fall back to smaller synchronous
+ * allocations to ensure forward progress is always maintained.
+ */
+ if (test_and_set_bit(KMC_BIT_GROWING, &skc->skc_flags) == 0) {
+ spl_kmem_alloc_t *ska;
+
+ ska = kmalloc(sizeof (*ska), kmem_flags_convert(flags));
+ if (ska == NULL) {
+ clear_bit_unlock(KMC_BIT_GROWING, &skc->skc_flags);
+ smp_mb__after_atomic();
+ wake_up_all(&skc->skc_waitq);
+ return (-ENOMEM);
+ }
+
+ atomic_inc(&skc->skc_ref);
+ ska->ska_cache = skc;
+ ska->ska_flags = flags;
+ taskq_init_ent(&ska->ska_tqe);
+ taskq_dispatch_ent(spl_kmem_cache_taskq,
+ spl_cache_grow_work, ska, 0, &ska->ska_tqe);
+ }
+
+ /*
+ * The goal here is to only detect the rare case where a virtual slab
+ * allocation has deadlocked. We must be careful to minimize the use
+ * of emergency objects which are more expensive to track. Therefore,
+ * we set a very long timeout for the asynchronous allocation and if
+ * the timeout is reached the cache is flagged as deadlocked. From
+ * this point only new emergency objects will be allocated until the
+ * asynchronous allocation completes and clears the deadlocked flag.
+ */
+ if (test_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags)) {
+ rc = spl_emergency_alloc(skc, flags, obj);
+ } else {
+ remaining = wait_event_timeout(skc->skc_waitq,
+ spl_cache_grow_wait(skc), HZ / 10);
+
+ if (!remaining) {
+ spin_lock(&skc->skc_lock);
+ if (test_bit(KMC_BIT_GROWING, &skc->skc_flags)) {
+ set_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags);
+ skc->skc_obj_deadlock++;
+ }
+ spin_unlock(&skc->skc_lock);
+ }
+
+ rc = -ENOMEM;
+ }
+
+ return (rc);
+}
+
+/*
+ * Refill a per-cpu magazine with objects from the slabs for this cache.
+ * Ideally the magazine can be repopulated using existing objects which have
+ * been released, however if we are unable to locate enough free objects new
+ * slabs of objects will be created. On success NULL is returned, otherwise
+ * the address of a single emergency object is returned for use by the caller.
+ */
+static void *
+spl_cache_refill(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flags)
+{
+ spl_kmem_slab_t *sks;
+ int count = 0, rc, refill;
+ void *obj = NULL;
+
+ ASSERT(skc->skc_magic == SKC_MAGIC);
+ ASSERT(skm->skm_magic == SKM_MAGIC);
+
+ refill = MIN(skm->skm_refill, skm->skm_size - skm->skm_avail);
+ spin_lock(&skc->skc_lock);
+
+ while (refill > 0) {
+ /* No slabs available we may need to grow the cache */
+ if (list_empty(&skc->skc_partial_list)) {
+ spin_unlock(&skc->skc_lock);
+
+ local_irq_enable();
+ rc = spl_cache_grow(skc, flags, &obj);
+ local_irq_disable();
+
+ /* Emergency object for immediate use by caller */
+ if (rc == 0 && obj != NULL)
+ return (obj);
+
+ if (rc)
+ goto out;
+
+ /* Rescheduled to a different CPU, skm is not local */
+ if (skm != skc->skc_mag[smp_processor_id()])
+ goto out;
+
+ /*
+ * Potentially rescheduled to the same CPU but
+ * allocations may have occurred from this CPU while
+ * we were sleeping so recalculate max refill.
+ */
+ refill = MIN(refill, skm->skm_size - skm->skm_avail);
+
+ spin_lock(&skc->skc_lock);
+ continue;
+ }
+
+ /* Grab the next available slab */
+ sks = list_entry((&skc->skc_partial_list)->next,
+ spl_kmem_slab_t, sks_list);
+ ASSERT(sks->sks_magic == SKS_MAGIC);
+ ASSERT(sks->sks_ref < sks->sks_objs);
+ ASSERT(!list_empty(&sks->sks_free_list));
+
+ /*
+ * Consume as many objects as needed to refill the requested
+ * cache. We must also be careful not to overfill it.
+ */
+ while (sks->sks_ref < sks->sks_objs && refill-- > 0 &&
+ ++count) {
+ ASSERT(skm->skm_avail < skm->skm_size);
+ ASSERT(count < skm->skm_size);
+ skm->skm_objs[skm->skm_avail++] =
+ spl_cache_obj(skc, sks);
+ }
+
+ /* Move slab to skc_complete_list when full */
+ if (sks->sks_ref == sks->sks_objs) {
+ list_del(&sks->sks_list);
+ list_add(&sks->sks_list, &skc->skc_complete_list);
+ }
+ }
+
+ spin_unlock(&skc->skc_lock);
+out:
+ return (NULL);
+}
+
+/*
+ * Release an object back to the slab from which it came.
+ */
+static void
+spl_cache_shrink(spl_kmem_cache_t *skc, void *obj)
+{
+ spl_kmem_slab_t *sks = NULL;
+ spl_kmem_obj_t *sko = NULL;
+
+ ASSERT(skc->skc_magic == SKC_MAGIC);
+
+ sko = spl_sko_from_obj(skc, obj);
+ ASSERT(sko->sko_magic == SKO_MAGIC);
+ sks = sko->sko_slab;
+ ASSERT(sks->sks_magic == SKS_MAGIC);
+ ASSERT(sks->sks_cache == skc);
+ list_add(&sko->sko_list, &sks->sks_free_list);
+
+ sks->sks_age = jiffies;
+ sks->sks_ref--;
+ skc->skc_obj_alloc--;
+
+ /*
+ * Move slab to skc_partial_list when no longer full. Slabs
+ * are added to the head to keep the partial list in quasi-full
+ * sorted order. Fuller at the head, emptier at the tail.
+ */
+ if (sks->sks_ref == (sks->sks_objs - 1)) {
+ list_del(&sks->sks_list);
+ list_add(&sks->sks_list, &skc->skc_partial_list);
+ }
+
+ /*
+ * Move empty slabs to the end of the partial list so
+ * they can be easily found and freed during reclamation.
+ */
+ if (sks->sks_ref == 0) {
+ list_del(&sks->sks_list);
+ list_add_tail(&sks->sks_list, &skc->skc_partial_list);
+ skc->skc_slab_alloc--;
+ }
+}
+
+/*
+ * Allocate an object from the per-cpu magazine, or if the magazine
+ * is empty directly allocate from a slab and repopulate the magazine.
+ */
+void *
+spl_kmem_cache_alloc(spl_kmem_cache_t *skc, int flags)
+{
+ spl_kmem_magazine_t *skm;
+ void *obj = NULL;
+
+ ASSERT0(flags & ~KM_PUBLIC_MASK);
+ ASSERT(skc->skc_magic == SKC_MAGIC);
+ ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags));
+
+ /*
+ * Allocate directly from a Linux slab. All optimizations are left
+ * to the underlying cache; we only need to guarantee that KM_SLEEP
+ * callers will never fail.
+ */
+ if (skc->skc_flags & KMC_SLAB) {
+ struct kmem_cache *slc = skc->skc_linux_cache;
+ do {
+ obj = kmem_cache_alloc(slc, kmem_flags_convert(flags));
+ } while ((obj == NULL) && !(flags & KM_NOSLEEP));
+
+ goto ret;
+ }
+
+ local_irq_disable();
+
+restart:
+ /*
+ * Safe to update per-cpu structure without lock, but
+ * in the restart case we must be careful to reacquire
+ * the local magazine since this may have changed
+ * when we need to grow the cache.
+ */
+ skm = skc->skc_mag[smp_processor_id()];
+ ASSERT(skm->skm_magic == SKM_MAGIC);
+
+ if (likely(skm->skm_avail)) {
+ /* Object available in CPU cache, use it */
+ obj = skm->skm_objs[--skm->skm_avail];
+ skm->skm_age = jiffies;
+ } else {
+ obj = spl_cache_refill(skc, skm, flags);
+ if ((obj == NULL) && !(flags & KM_NOSLEEP))
+ goto restart;
+
+ local_irq_enable();
+ goto ret;
+ }
+
+ local_irq_enable();
+ ASSERT(obj);
+ ASSERT(IS_P2ALIGNED(obj, skc->skc_obj_align));
+
+ret:
+ /* Pre-emptively migrate object to CPU L1 cache */
+ if (obj) {
+ if (skc->skc_ctor)
+ skc->skc_ctor(obj, skc->skc_private, flags);
+ else
+ prefetchw(obj);
+ }
+
+ return (obj);
+}
+EXPORT_SYMBOL(spl_kmem_cache_alloc);
+
+/*
+ * Free an object back to the local per-cpu magazine; there is no
+ * guarantee that this is the same magazine the object was originally
+ * allocated from. We may need to flush entries from the magazine
+ * back to the slabs to make space.
+ */
+void
+spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj)
+{
+ spl_kmem_magazine_t *skm;
+ unsigned long flags;
+ int do_reclaim = 0;
+ int do_emergency = 0;
+
+ ASSERT(skc->skc_magic == SKC_MAGIC);
+ ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags));
+
+ /*
+ * Run the destructor
+ */
+ if (skc->skc_dtor)
+ skc->skc_dtor(obj, skc->skc_private);
+
+ /*
+ * Free the object back to the underlying Linux slab.
+ */
+ if (skc->skc_flags & KMC_SLAB) {
+ kmem_cache_free(skc->skc_linux_cache, obj);
+ return;
+ }
+
+ /*
+ * While a cache has outstanding emergency objects all freed objects
+ * must be checked. However, since emergency objects will never use
+ * a virtual address these objects can be safely excluded as an
+ * optimization.
+ */
+ if (!is_vmalloc_addr(obj)) {
+ spin_lock(&skc->skc_lock);
+ do_emergency = (skc->skc_obj_emergency > 0);
+ spin_unlock(&skc->skc_lock);
+
+ if (do_emergency && (spl_emergency_free(skc, obj) == 0))
+ return;
+ }
+
+ local_irq_save(flags);
+
+ /*
+ * Safe to update the per-cpu structure without a lock, but
+ * since no remote memory allocation tracking is performed
+ * it is entirely possible to allocate an object from one
+ * CPU cache and return it to another.
+ */
+ skm = skc->skc_mag[smp_processor_id()];
+ ASSERT(skm->skm_magic == SKM_MAGIC);
+
+ /*
+ * The per-CPU cache is full, flush it to make space for this object;
+ * this may result in an empty slab which can be reclaimed once
+ * interrupts are re-enabled.
+ */
+ if (unlikely(skm->skm_avail >= skm->skm_size)) {
+ spl_cache_flush(skc, skm, skm->skm_refill);
+ do_reclaim = 1;
+ }
+
+ /* Available space in cache, use it */
+ skm->skm_objs[skm->skm_avail++] = obj;
+
+ local_irq_restore(flags);
+
+ if (do_reclaim)
+ spl_slab_reclaim(skc);
+}
+EXPORT_SYMBOL(spl_kmem_cache_free);
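
Continuing the hypothetical cache from the creation sketch above, the allocate/free pairing looks like this; KM_SLEEP allocations from a cache retry until they succeed, so no NULL check is required:

	static void
	my_obj_example(void)
	{
		my_obj_t *obj;

		/* May block while a new slab is grown, but never returns NULL */
		obj = spl_kmem_cache_alloc(my_obj_cache, KM_SLEEP);
		obj->mo_id = 42;

		/* Returns the object to the local per-cpu magazine */
		spl_kmem_cache_free(my_obj_cache, obj);
	}
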
+
+/*
+ * The generic shrinker function for all caches. Under Linux a shrinker
+ * may not be tightly coupled with a slab cache. In fact Linux always
+ * systematically tries calling all registered shrinker callbacks which
+ * report that they contain unused objects. Because of this we only
+ * register one shrinker function in the shim layer for all slab caches.
+ * We always attempt to shrink all caches when this generic shrinker
+ * is called.
+ *
+ * If sc->nr_to_scan is zero, the caller is requesting a query of the
+ * number of objects which can potentially be freed. If it is nonzero,
+ * the request is to free that many objects.
+ *
+ * Linux kernels >= 3.12 have the count_objects and scan_objects callbacks
+ * in struct shrinker and also require the shrinker to return the number
+ * of objects freed.
+ *
+ * Older kernels require the shrinker to return the number of freeable
+ * objects following the freeing of nr_to_free.
+ *
+ * Linux semantics differ from those under Solaris, which are to
+ * free all available objects which may (and probably will) be more
+ * objects than the requested nr_to_scan.
+ */
+static spl_shrinker_t
+__spl_kmem_cache_generic_shrinker(struct shrinker *shrink,
+ struct shrink_control *sc)
+{
+ spl_kmem_cache_t *skc;
+ int alloc = 0;
+
+ /*
+ * No shrinking in a transaction context. Can cause deadlocks.
+ */
+ if (sc->nr_to_scan && spl_fstrans_check())
+ return (SHRINK_STOP);
+
+ down_read(&spl_kmem_cache_sem);
+ list_for_each_entry(skc, &spl_kmem_cache_list, skc_list) {
+ if (sc->nr_to_scan) {
+#ifdef HAVE_SPLIT_SHRINKER_CALLBACK
+ uint64_t oldalloc = skc->skc_obj_alloc;
+ spl_kmem_cache_reap_now(skc,
+ MAX(sc->nr_to_scan>>fls64(skc->skc_slab_objs), 1));
+ if (oldalloc > skc->skc_obj_alloc)
+ alloc += oldalloc - skc->skc_obj_alloc;
+#else
+ spl_kmem_cache_reap_now(skc,
+ MAX(sc->nr_to_scan>>fls64(skc->skc_slab_objs), 1));
+ alloc += skc->skc_obj_alloc;
+#endif /* HAVE_SPLIT_SHRINKER_CALLBACK */
+ } else {
+ /* Request to query number of freeable objects */
+ alloc += skc->skc_obj_alloc;
+ }
+ }
+ up_read(&spl_kmem_cache_sem);
+
+ /*
+ * When KMC_RECLAIM_ONCE is set allow only a single reclaim pass.
+ * This functionality only exists to work around a rare issue where
+ * shrink_slabs() is repeatedly invoked by many cores causing the
+ * system to thrash.
+ */
+ if ((spl_kmem_cache_reclaim & KMC_RECLAIM_ONCE) && sc->nr_to_scan)
+ return (SHRINK_STOP);
+
+ return (MAX(alloc, 0));
+}
+
+SPL_SHRINKER_CALLBACK_WRAPPER(spl_kmem_cache_generic_shrinker);
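
For reference, a sketch of the split (Linux >= 3.12) shrinker interface the wrapper above adapts to; the callbacks and the helpers they call are hypothetical, the real glue lives in the SPL shrinker compatibility header:

	static unsigned long
	example_count_objects(struct shrinker *shrink, struct shrink_control *sc)
	{
		/* Report how many objects could be freed if asked */
		return (example_freeable_objects());
	}

	static unsigned long
	example_scan_objects(struct shrinker *shrink, struct shrink_control *sc)
	{
		/* Free up to sc->nr_to_scan objects and return the number freed */
		return (example_reap(sc->nr_to_scan));
	}

	static struct shrinker example_shrinker = {
		.count_objects	= example_count_objects,
		.scan_objects	= example_scan_objects,
		.seeks		= DEFAULT_SEEKS,
	};
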
+
+/*
+ * Call the registered reclaim function for a cache. Depending on how
+ * many and which objects are released it may simply repopulate the
+ * local magazine which will then need to age-out. Objects which cannot
+ * fit in the magazine will be released back to their slabs, which will
+ * also need to age out before being released. This is all just best
+ * effort and we do not want to thrash creating and destroying slabs.
+ */
+void
+spl_kmem_cache_reap_now(spl_kmem_cache_t *skc, int count)
+{
+ ASSERT(skc->skc_magic == SKC_MAGIC);
+ ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags));
+
+ atomic_inc(&skc->skc_ref);
+
+ /*
+ * Execute the registered reclaim callback if it exists.
+ */
+ if (skc->skc_flags & KMC_SLAB) {
+ if (skc->skc_reclaim)
+ skc->skc_reclaim(skc->skc_private);
+ goto out;
+ }
+
+ /*
+ * Prevent concurrent cache reaping when contended.
+ */
+ if (test_and_set_bit(KMC_BIT_REAPING, &skc->skc_flags))
+ goto out;
+
+ /*
+ * When a reclaim function is available it may be invoked repeatedly
+ * until at least a single slab can be freed. This ensures that we
+ * do free memory back to the system. This helps minimize the chance
+ * of an OOM event when the bulk of memory is used by the slab.
+ *
+ * When free slabs are already available the reclaim callback will be
+ * skipped. Additionally, if no forward progress is detected despite
+ * a reclaim function the cache will be skipped to avoid deadlock.
+ *
+ * Longer term this would be the correct place to add the code which
+ * repacks the slabs in order to minimize fragmentation.
+ */
+ if (skc->skc_reclaim) {
+ uint64_t objects = UINT64_MAX;
+ int do_reclaim;
+
+ do {
+ spin_lock(&skc->skc_lock);
+ do_reclaim =
+ (skc->skc_slab_total > 0) &&
+ ((skc->skc_slab_total-skc->skc_slab_alloc) == 0) &&
+ (skc->skc_obj_alloc < objects);
+
+ objects = skc->skc_obj_alloc;
+ spin_unlock(&skc->skc_lock);
+
+ if (do_reclaim)
+ skc->skc_reclaim(skc->skc_private);
+
+ } while (do_reclaim);
+ }
+
+ /* Reclaim from the magazine and free all now empty slabs. */
+ if (spl_kmem_cache_expire & KMC_EXPIRE_MEM) {
+ spl_kmem_magazine_t *skm;
+ unsigned long irq_flags;
+
+ local_irq_save(irq_flags);
+ skm = skc->skc_mag[smp_processor_id()];
+ spl_cache_flush(skc, skm, skm->skm_avail);
+ local_irq_restore(irq_flags);
+ }
+
+ spl_slab_reclaim(skc);
+ clear_bit_unlock(KMC_BIT_REAPING, &skc->skc_flags);
+ smp_mb__after_atomic();
+ wake_up_bit(&skc->skc_flags, KMC_BIT_REAPING);
+out:
+ atomic_dec(&skc->skc_ref);
+}
+EXPORT_SYMBOL(spl_kmem_cache_reap_now);
+
+/*
+ * Reap all free slabs from all registered caches.
+ */
+void
+spl_kmem_reap(void)
+{
+ struct shrink_control sc;
+
+ sc.nr_to_scan = KMC_REAP_CHUNK;
+ sc.gfp_mask = GFP_KERNEL;
+
+ (void) __spl_kmem_cache_generic_shrinker(NULL, &sc);
+}
+EXPORT_SYMBOL(spl_kmem_reap);
+
+int
+spl_kmem_cache_init(void)
+{
+ init_rwsem(&spl_kmem_cache_sem);
+ INIT_LIST_HEAD(&spl_kmem_cache_list);
+ spl_kmem_cache_taskq = taskq_create("spl_kmem_cache",
+ spl_kmem_cache_kmem_threads, maxclsyspri,
+ spl_kmem_cache_kmem_threads * 8, INT_MAX,
+ TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
+ spl_register_shrinker(&spl_kmem_cache_shrinker);
+
+ return (0);
+}
+
+void
+spl_kmem_cache_fini(void)
+{
+ spl_unregister_shrinker(&spl_kmem_cache_shrinker);
+ taskq_destroy(spl_kmem_cache_taskq);
+}
diff --git a/module/spl/spl-kmem.c b/module/spl/spl-kmem.c
new file mode 100644
index 000000000..e0d551041
--- /dev/null
+++ b/module/spl/spl-kmem.c
@@ -0,0 +1,567 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <[email protected]>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <sys/debug.h>
+#include <sys/sysmacros.h>
+#include <sys/kmem.h>
+#include <sys/vmem.h>
+#include <linux/mm.h>
+#include <linux/ratelimit.h>
+
+/*
+ * As a general rule kmem_alloc() allocations should be small, preferably
+ * just a few pages since they must be physically contiguous. Therefore, a
+ * rate limited warning will be printed to the console for any kmem_alloc()
+ * which exceeds a reasonable threshold.
+ *
+ * The default warning threshold is set to sixteen pages but capped at 64K to
+ * accommodate systems using large pages. This value was selected to be small
+ * enough to ensure the largest allocations are quickly noticed and fixed,
+ * but large enough to avoid logging any warnings when an allocation size is
+ * larger than optimal but not a serious concern. Since this value is tunable,
+ * developers are encouraged to set it lower when testing so any new largish
+ * allocations are quickly caught. These warnings may be disabled by setting
+ * the threshold to zero.
+ */
+/* BEGIN CSTYLED */
+unsigned int spl_kmem_alloc_warn = MIN(16 * PAGE_SIZE, 64 * 1024);
+module_param(spl_kmem_alloc_warn, uint, 0644);
+MODULE_PARM_DESC(spl_kmem_alloc_warn,
+ "Warning threshold in bytes for a kmem_alloc()");
+EXPORT_SYMBOL(spl_kmem_alloc_warn);
+
+/*
+ * Large kmem_alloc() allocations will fail if they exceed KMALLOC_MAX_SIZE.
+ * Allocations which are marginally smaller than this limit may succeed but
+ * should still be avoided due to the expense of locating a contiguous range
+ * of free pages. Therefore, a maximum kmem size with a reasonable safety
+ * margin of 4x is set. Kmem_alloc() allocations larger than this maximum
+ * will quickly fail. Vmem_alloc() allocations less than or equal to this
+ * value will use kmalloc(), but shift to vmalloc() when exceeding this value.
+ */
+unsigned int spl_kmem_alloc_max = (KMALLOC_MAX_SIZE >> 2);
+module_param(spl_kmem_alloc_max, uint, 0644);
+MODULE_PARM_DESC(spl_kmem_alloc_max,
+ "Maximum size in bytes for a kmem_alloc()");
+EXPORT_SYMBOL(spl_kmem_alloc_max);
+/* END CSTYLED */
+
+int
+kmem_debugging(void)
+{
+ return (0);
+}
+EXPORT_SYMBOL(kmem_debugging);
+
+char *
+kmem_vasprintf(const char *fmt, va_list ap)
+{
+ va_list aq;
+ char *ptr;
+
+ do {
+ va_copy(aq, ap);
+ ptr = kvasprintf(kmem_flags_convert(KM_SLEEP), fmt, aq);
+ va_end(aq);
+ } while (ptr == NULL);
+
+ return (ptr);
+}
+EXPORT_SYMBOL(kmem_vasprintf);
+
+char *
+kmem_asprintf(const char *fmt, ...)
+{
+ va_list ap;
+ char *ptr;
+
+ do {
+ va_start(ap, fmt);
+ ptr = kvasprintf(kmem_flags_convert(KM_SLEEP), fmt, ap);
+ va_end(ap);
+ } while (ptr == NULL);
+
+ return (ptr);
+}
+EXPORT_SYMBOL(kmem_asprintf);
+
+static char *
+__strdup(const char *str, int flags)
+{
+ char *ptr;
+ int n;
+
+ n = strlen(str);
+ ptr = kmalloc(n + 1, kmem_flags_convert(flags));
+ if (ptr)
+ memcpy(ptr, str, n + 1);
+
+ return (ptr);
+}
+
+char *
+strdup(const char *str)
+{
+ return (__strdup(str, KM_SLEEP));
+}
+EXPORT_SYMBOL(strdup);
+
+void
+strfree(char *str)
+{
+ kfree(str);
+}
+EXPORT_SYMBOL(strfree);
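
A small usage sketch with a hypothetical name format: kmem_asprintf() retries until it succeeds, so the result needs no NULL check, and it is released with strfree() rather than kfree() in portable code:

	static char *
	example_make_name(int unit)
	{
		char *name;

		name = kmem_asprintf("example-%d", unit);	/* never NULL */
		return (name);	/* caller releases with strfree(name) */
	}
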
+
+/*
+ * Limit the number of large allocation stack traces dumped to not more than
+ * 5 every 60 seconds to prevent denial-of-service attacks from debug code.
+ */
+DEFINE_RATELIMIT_STATE(kmem_alloc_ratelimit_state, 60 * HZ, 5);
+
+/*
+ * General purpose unified implementation of kmem_alloc(). It is an
+ * amalgamation of Linux and Illumos allocator design. It should never be
+ * exported to ensure that code using kmem_alloc()/kmem_zalloc() remains
+ * relatively portable. Consumers may only access this function through
+ * wrappers that enforce the common flags to ensure portability.
+ */
+inline void *
+spl_kmem_alloc_impl(size_t size, int flags, int node)
+{
+ gfp_t lflags = kmem_flags_convert(flags);
+ int use_vmem = 0;
+ void *ptr;
+
+ /*
+ * Log abnormally large allocations and rate limit the console output.
+ * Allocations larger than spl_kmem_alloc_warn should be performed
+ * through the vmem_alloc()/vmem_zalloc() interfaces.
+ */
+ if ((spl_kmem_alloc_warn > 0) && (size > spl_kmem_alloc_warn) &&
+ !(flags & KM_VMEM) && __ratelimit(&kmem_alloc_ratelimit_state)) {
+ printk(KERN_WARNING
+ "Large kmem_alloc(%lu, 0x%x), please file an issue at:\n"
+ "https://github.com/zfsonlinux/zfs/issues/new\n",
+ (unsigned long)size, flags);
+ dump_stack();
+ }
+
+ /*
+ * Use a loop because kmalloc_node() can fail when GFP_KERNEL is used,
+ * unlike kmem_alloc() with KM_SLEEP on Illumos.
+ */
+ do {
+ /*
+ * Calling kmalloc_node() when the size >= spl_kmem_alloc_max
+ * is unsafe. This must fail for all kmem_alloc() and
+ * kmem_zalloc() callers.
+ *
+ * For vmem_alloc() and vmem_zalloc() callers it is permissible
+ * to use __vmalloc(). However, in general use of __vmalloc()
+ * is strongly discouraged because a global lock must be
+ * acquired. Contention on this lock can significantly
+ * impact performance so frequently manipulating the virtual
+ * address space is strongly discouraged.
+ */
+ if ((size > spl_kmem_alloc_max) || use_vmem) {
+ if (flags & KM_VMEM) {
+ ptr = __vmalloc(size, lflags, PAGE_KERNEL);
+ } else {
+ return (NULL);
+ }
+ } else {
+ ptr = kmalloc_node(size, lflags, node);
+ }
+
+ if (likely(ptr) || (flags & KM_NOSLEEP))
+ return (ptr);
+
+ /*
+ * For vmem_alloc() and vmem_zalloc() callers retry immediately
+ * using __vmalloc() which is unlikely to fail.
+ */
+ if ((flags & KM_VMEM) && (use_vmem == 0)) {
+ use_vmem = 1;
+ continue;
+ }
+
+ if (unlikely(__ratelimit(&kmem_alloc_ratelimit_state))) {
+ printk(KERN_WARNING
+ "Possible memory allocation deadlock: "
+ "size=%lu lflags=0x%x",
+ (unsigned long)size, lflags);
+ dump_stack();
+ }
+
+ /*
+ * Use cond_resched() instead of congestion_wait() to avoid
+ * deadlocking systems where there are no block devices.
+ */
+ cond_resched();
+ } while (1);
+
+ return (NULL);
+}
+
+inline void
+spl_kmem_free_impl(const void *buf, size_t size)
+{
+ if (is_vmalloc_addr(buf))
+ vfree(buf);
+ else
+ kfree(buf);
+}
+
+/*
+ * Memory allocation and accounting for kmem_* style allocations. When
+ * DEBUG_KMEM is enabled the total memory allocated will be tracked and
+ * any memory leaked will be reported during module unload.
+ *
+ * ./configure --enable-debug-kmem
+ */
+#ifdef DEBUG_KMEM
+
+/* Shim layer memory accounting */
+#ifdef HAVE_ATOMIC64_T
+atomic64_t kmem_alloc_used = ATOMIC64_INIT(0);
+unsigned long long kmem_alloc_max = 0;
+#else /* HAVE_ATOMIC64_T */
+atomic_t kmem_alloc_used = ATOMIC_INIT(0);
+unsigned long long kmem_alloc_max = 0;
+#endif /* HAVE_ATOMIC64_T */
+
+EXPORT_SYMBOL(kmem_alloc_used);
+EXPORT_SYMBOL(kmem_alloc_max);
+
+inline void *
+spl_kmem_alloc_debug(size_t size, int flags, int node)
+{
+ void *ptr;
+
+ ptr = spl_kmem_alloc_impl(size, flags, node);
+ if (ptr) {
+ kmem_alloc_used_add(size);
+ if (unlikely(kmem_alloc_used_read() > kmem_alloc_max))
+ kmem_alloc_max = kmem_alloc_used_read();
+ }
+
+ return (ptr);
+}
+
+inline void
+spl_kmem_free_debug(const void *ptr, size_t size)
+{
+ kmem_alloc_used_sub(size);
+ spl_kmem_free_impl(ptr, size);
+}
+
+/*
+ * When DEBUG_KMEM_TRACKING is enabled not only will total bytes be tracked
+ * but also the location of every alloc and free. When the SPL module is
+ * unloaded a list of all leaked addresses and where they were allocated
+ * will be dumped to the console. Enabling this feature has a significant
+ * impact on performance but it makes finding memory leaks straightforward.
+ *
+ * Not surprisingly with debugging enabled the xmem_locks are very highly
+ * contended particularly on xfree(). If we want to run with this detailed
+ * debugging enabled for anything other than debugging we need to minimize
+ * the contention by moving to a lock per xmem_table entry model.
+ *
+ * ./configure --enable-debug-kmem-tracking
+ */
+#ifdef DEBUG_KMEM_TRACKING
+
+#include <linux/hash.h>
+#include <linux/ctype.h>
+
+#define KMEM_HASH_BITS 10
+#define KMEM_TABLE_SIZE (1 << KMEM_HASH_BITS)
+
+typedef struct kmem_debug {
+ struct hlist_node kd_hlist; /* Hash node linkage */
+ struct list_head kd_list; /* List of all allocations */
+ void *kd_addr; /* Allocation pointer */
+ size_t kd_size; /* Allocation size */
+ const char *kd_func; /* Allocation function */
+ int kd_line; /* Allocation line */
+} kmem_debug_t;
+
+static spinlock_t kmem_lock;
+static struct hlist_head kmem_table[KMEM_TABLE_SIZE];
+static struct list_head kmem_list;
+
+static kmem_debug_t *
+kmem_del_init(spinlock_t *lock, struct hlist_head *table,
+ int bits, const void *addr)
+{
+ struct hlist_head *head;
+ struct hlist_node *node;
+ struct kmem_debug *p;
+ unsigned long flags;
+
+ spin_lock_irqsave(lock, flags);
+
+ head = &table[hash_ptr((void *)addr, bits)];
+ hlist_for_each(node, head) {
+ p = list_entry(node, struct kmem_debug, kd_hlist);
+ if (p->kd_addr == addr) {
+ hlist_del_init(&p->kd_hlist);
+ list_del_init(&p->kd_list);
+ spin_unlock_irqrestore(lock, flags);
+ return (p);
+ }
+ }
+
+ spin_unlock_irqrestore(lock, flags);
+
+ return (NULL);
+}
+
+inline void *
+spl_kmem_alloc_track(size_t size, int flags,
+ const char *func, int line, int node)
+{
+ void *ptr = NULL;
+ kmem_debug_t *dptr;
+ unsigned long irq_flags;
+
+ dptr = kmalloc(sizeof (kmem_debug_t), kmem_flags_convert(flags));
+ if (dptr == NULL)
+ return (NULL);
+
+ dptr->kd_func = __strdup(func, flags);
+ if (dptr->kd_func == NULL) {
+ kfree(dptr);
+ return (NULL);
+ }
+
+ ptr = spl_kmem_alloc_debug(size, flags, node);
+ if (ptr == NULL) {
+ kfree(dptr->kd_func);
+ kfree(dptr);
+ return (NULL);
+ }
+
+ INIT_HLIST_NODE(&dptr->kd_hlist);
+ INIT_LIST_HEAD(&dptr->kd_list);
+
+ dptr->kd_addr = ptr;
+ dptr->kd_size = size;
+ dptr->kd_line = line;
+
+ spin_lock_irqsave(&kmem_lock, irq_flags);
+ hlist_add_head(&dptr->kd_hlist,
+ &kmem_table[hash_ptr(ptr, KMEM_HASH_BITS)]);
+ list_add_tail(&dptr->kd_list, &kmem_list);
+ spin_unlock_irqrestore(&kmem_lock, irq_flags);
+
+ return (ptr);
+}
+
+inline void
+spl_kmem_free_track(const void *ptr, size_t size)
+{
+ kmem_debug_t *dptr;
+
+ /* Ignore NULL pointer since we haven't tracked it at all */
+ if (ptr == NULL)
+ return;
+
+ /* Must exist in hash due to kmem_alloc() */
+ dptr = kmem_del_init(&kmem_lock, kmem_table, KMEM_HASH_BITS, ptr);
+ ASSERT3P(dptr, !=, NULL);
+ ASSERT3S(dptr->kd_size, ==, size);
+
+ kfree(dptr->kd_func);
+ kfree(dptr);
+
+ spl_kmem_free_debug(ptr, size);
+}
+#endif /* DEBUG_KMEM_TRACKING */
+#endif /* DEBUG_KMEM */
+
+/*
+ * Public kmem_alloc(), kmem_zalloc() and kmem_free() interfaces.
+ */
+void *
+spl_kmem_alloc(size_t size, int flags, const char *func, int line)
+{
+ ASSERT0(flags & ~KM_PUBLIC_MASK);
+
+#if !defined(DEBUG_KMEM)
+ return (spl_kmem_alloc_impl(size, flags, NUMA_NO_NODE));
+#elif !defined(DEBUG_KMEM_TRACKING)
+ return (spl_kmem_alloc_debug(size, flags, NUMA_NO_NODE));
+#else
+ return (spl_kmem_alloc_track(size, flags, func, line, NUMA_NO_NODE));
+#endif
+}
+EXPORT_SYMBOL(spl_kmem_alloc);
+
+void *
+spl_kmem_zalloc(size_t size, int flags, const char *func, int line)
+{
+ ASSERT0(flags & ~KM_PUBLIC_MASK);
+
+ flags |= KM_ZERO;
+
+#if !defined(DEBUG_KMEM)
+ return (spl_kmem_alloc_impl(size, flags, NUMA_NO_NODE));
+#elif !defined(DEBUG_KMEM_TRACKING)
+ return (spl_kmem_alloc_debug(size, flags, NUMA_NO_NODE));
+#else
+ return (spl_kmem_alloc_track(size, flags, func, line, NUMA_NO_NODE));
+#endif
+}
+EXPORT_SYMBOL(spl_kmem_zalloc);
+
+void
+spl_kmem_free(const void *buf, size_t size)
+{
+#if !defined(DEBUG_KMEM)
+ return (spl_kmem_free_impl(buf, size));
+#elif !defined(DEBUG_KMEM_TRACKING)
+ return (spl_kmem_free_debug(buf, size));
+#else
+ return (spl_kmem_free_track(buf, size));
+#endif
+}
+EXPORT_SYMBOL(spl_kmem_free);
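
A minimal usage sketch, assuming the usual sys/kmem.h macros which expand to the functions above with __func__ and __LINE__ filled in; note that kmem_free() must be passed the size of the original allocation:

	static void
	example_kmem_usage(void)
	{
		size_t size = 128;
		char *buf;

		buf = kmem_zalloc(size, KM_SLEEP);	/* zeroed, retries until success */
		/* ... use buf ... */
		kmem_free(buf, size);			/* size must match the alloc */
	}
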
+
+#if defined(DEBUG_KMEM) && defined(DEBUG_KMEM_TRACKING)
+static char *
+spl_sprintf_addr(kmem_debug_t *kd, char *str, int len, int min)
+{
+ int size = ((len - 1) < kd->kd_size) ? (len - 1) : kd->kd_size;
+ int i, flag = 1;
+
+ ASSERT(str != NULL && len >= 17);
+ memset(str, 0, len);
+
+ /*
+ * Check for a fully printable string, and while we are at
+ * it place the printable characters in the passed buffer.
+ */
+ for (i = 0; i < size; i++) {
+ str[i] = ((char *)(kd->kd_addr))[i];
+ if (isprint(str[i])) {
+ continue;
+ } else {
+ /*
+ * Minimum number of printable characters found
+ * to make it worthwhile to print this as ascii.
+ */
+ if (i > min)
+ break;
+
+ flag = 0;
+ break;
+ }
+ }
+
+ if (!flag) {
+ sprintf(str, "%02x%02x%02x%02x%02x%02x%02x%02x",
+ *((uint8_t *)kd->kd_addr),
+ *((uint8_t *)kd->kd_addr + 2),
+ *((uint8_t *)kd->kd_addr + 4),
+ *((uint8_t *)kd->kd_addr + 6),
+ *((uint8_t *)kd->kd_addr + 8),
+ *((uint8_t *)kd->kd_addr + 10),
+ *((uint8_t *)kd->kd_addr + 12),
+ *((uint8_t *)kd->kd_addr + 14));
+ }
+
+ return (str);
+}
+
+static int
+spl_kmem_init_tracking(struct list_head *list, spinlock_t *lock, int size)
+{
+ int i;
+
+ spin_lock_init(lock);
+ INIT_LIST_HEAD(list);
+
+ for (i = 0; i < size; i++)
+ INIT_HLIST_HEAD(&kmem_table[i]);
+
+ return (0);
+}
+
+static void
+spl_kmem_fini_tracking(struct list_head *list, spinlock_t *lock)
+{
+ unsigned long flags;
+ kmem_debug_t *kd;
+ char str[17];
+
+ spin_lock_irqsave(lock, flags);
+ if (!list_empty(list))
+ printk(KERN_WARNING "%-16s %-5s %-16s %s:%s\n", "address",
+ "size", "data", "func", "line");
+
+ list_for_each_entry(kd, list, kd_list) {
+ printk(KERN_WARNING "%p %-5d %-16s %s:%d\n", kd->kd_addr,
+ (int)kd->kd_size, spl_sprintf_addr(kd, str, 17, 8),
+ kd->kd_func, kd->kd_line);
+ }
+
+ spin_unlock_irqrestore(lock, flags);
+}
+#endif /* DEBUG_KMEM && DEBUG_KMEM_TRACKING */
+
+int
+spl_kmem_init(void)
+{
+#ifdef DEBUG_KMEM
+ kmem_alloc_used_set(0);
+
+#ifdef DEBUG_KMEM_TRACKING
+ spl_kmem_init_tracking(&kmem_list, &kmem_lock, KMEM_TABLE_SIZE);
+#endif /* DEBUG_KMEM_TRACKING */
+#endif /* DEBUG_KMEM */
+
+ return (0);
+}
+
+void
+spl_kmem_fini(void)
+{
+#ifdef DEBUG_KMEM
+ /*
+ * Display all unreclaimed memory addresses, including the
+ * allocation size and the first few bytes of what's located
+ * at that address to aid in debugging. Performance is not
+ * a serious concern here since it is module unload time.
+ */
+ if (kmem_alloc_used_read() != 0)
+ printk(KERN_WARNING "kmem leaked %ld/%llu bytes\n",
+ (unsigned long)kmem_alloc_used_read(), kmem_alloc_max);
+
+#ifdef DEBUG_KMEM_TRACKING
+ spl_kmem_fini_tracking(&kmem_list, &kmem_lock);
+#endif /* DEBUG_KMEM_TRACKING */
+#endif /* DEBUG_KMEM */
+}
diff --git a/module/spl/spl-kobj.c b/module/spl/spl-kobj.c
new file mode 100644
index 000000000..7019369bd
--- /dev/null
+++ b/module/spl/spl-kobj.c
@@ -0,0 +1,86 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <[email protected]>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Solaris Porting Layer (SPL) Kobj Implementation.
+ */
+
+#include <sys/kobj.h>
+
+struct _buf *
+kobj_open_file(const char *name)
+{
+ struct _buf *file;
+ vnode_t *vp;
+ int rc;
+
+ file = kmalloc(sizeof (_buf_t), kmem_flags_convert(KM_SLEEP));
+ if (file == NULL)
+ return ((_buf_t *)-1UL);
+
+ if ((rc = vn_open(name, UIO_SYSSPACE, FREAD, 0644, &vp, 0, 0))) {
+ kfree(file);
+ return ((_buf_t *)-1UL);
+ }
+
+ file->vp = vp;
+
+ return (file);
+} /* kobj_open_file() */
+EXPORT_SYMBOL(kobj_open_file);
+
+void
+kobj_close_file(struct _buf *file)
+{
+ VOP_CLOSE(file->vp, 0, 0, 0, 0, 0);
+ kfree(file);
+} /* kobj_close_file() */
+EXPORT_SYMBOL(kobj_close_file);
+
+int
+kobj_read_file(struct _buf *file, char *buf, unsigned size, unsigned off)
+{
+ ssize_t resid;
+
+ if (vn_rdwr(UIO_READ, file->vp, buf, size, (offset_t)off,
+ UIO_SYSSPACE, 0, 0, 0, &resid) != 0)
+ return (-1);
+
+ return (size - resid);
+} /* kobj_read_file() */
+EXPORT_SYMBOL(kobj_read_file);
+
+int
+kobj_get_filesize(struct _buf *file, uint64_t *size)
+{
+ vattr_t vap;
+ int rc;
+
+ rc = VOP_GETATTR(file->vp, &vap, 0, 0, NULL);
+ if (rc)
+ return (rc);
+
+ *size = vap.va_size;
+
+ return (rc);
+} /* kobj_get_filesize() */
+EXPORT_SYMBOL(kobj_get_filesize);
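The four routines above give the usual Solaris kobj file interface: open, query size, read, close, with (_buf_t *)-1UL signalling an open failure. A hedged usage sketch that reads a whole (small) file into a vmem buffer; the helper name and error codes are illustrative:

/* Sketch: slurp a file through the kobj interface above (small files only). */
static int
sketch_kobj_slurp(const char *path, char **bufp, uint64_t *sizep)
{
	struct _buf *file;
	uint64_t size;

	file = kobj_open_file(path);
	if (file == (struct _buf *)-1UL)
		return (ENOENT);

	if (kobj_get_filesize(file, &size) != 0) {
		kobj_close_file(file);
		return (EIO);
	}

	*bufp = vmem_alloc(size, KM_SLEEP);
	if (kobj_read_file(file, *bufp, size, 0) != (int)size) {
		vmem_free(*bufp, size);
		kobj_close_file(file);
		return (EIO);
	}

	*sizep = size;
	kobj_close_file(file);
	return (0);
}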
diff --git a/module/spl/spl-kstat.c b/module/spl/spl-kstat.c
new file mode 100644
index 000000000..bcbff94a6
--- /dev/null
+++ b/module/spl/spl-kstat.c
@@ -0,0 +1,733 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <[email protected]>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Solaris Porting Layer (SPL) Kstat Implementation.
+ */
+
+#include <linux/seq_file.h>
+#include <sys/kstat.h>
+#include <sys/vmem.h>
+#include <sys/cmn_err.h>
+#include <sys/sysmacros.h>
+
+#ifndef HAVE_PDE_DATA
+#define PDE_DATA(x) (PDE(x)->data)
+#endif
+
+static kmutex_t kstat_module_lock;
+static struct list_head kstat_module_list;
+static kid_t kstat_id;
+
+static int
+kstat_resize_raw(kstat_t *ksp)
+{
+ if (ksp->ks_raw_bufsize == KSTAT_RAW_MAX)
+ return (ENOMEM);
+
+ vmem_free(ksp->ks_raw_buf, ksp->ks_raw_bufsize);
+ ksp->ks_raw_bufsize = MIN(ksp->ks_raw_bufsize * 2, KSTAT_RAW_MAX);
+ ksp->ks_raw_buf = vmem_alloc(ksp->ks_raw_bufsize, KM_SLEEP);
+
+ return (0);
+}
+
+void
+kstat_waitq_enter(kstat_io_t *kiop)
+{
+ hrtime_t new, delta;
+ ulong_t wcnt;
+
+ new = gethrtime();
+ delta = new - kiop->wlastupdate;
+ kiop->wlastupdate = new;
+ wcnt = kiop->wcnt++;
+ if (wcnt != 0) {
+ kiop->wlentime += delta * wcnt;
+ kiop->wtime += delta;
+ }
+}
+EXPORT_SYMBOL(kstat_waitq_enter);
+
+void
+kstat_waitq_exit(kstat_io_t *kiop)
+{
+ hrtime_t new, delta;
+ ulong_t wcnt;
+
+ new = gethrtime();
+ delta = new - kiop->wlastupdate;
+ kiop->wlastupdate = new;
+ wcnt = kiop->wcnt--;
+ ASSERT((int)wcnt > 0);
+ kiop->wlentime += delta * wcnt;
+ kiop->wtime += delta;
+}
+EXPORT_SYMBOL(kstat_waitq_exit);
+
+void
+kstat_runq_enter(kstat_io_t *kiop)
+{
+ hrtime_t new, delta;
+ ulong_t rcnt;
+
+ new = gethrtime();
+ delta = new - kiop->rlastupdate;
+ kiop->rlastupdate = new;
+ rcnt = kiop->rcnt++;
+ if (rcnt != 0) {
+ kiop->rlentime += delta * rcnt;
+ kiop->rtime += delta;
+ }
+}
+EXPORT_SYMBOL(kstat_runq_enter);
+
+void
+kstat_runq_exit(kstat_io_t *kiop)
+{
+ hrtime_t new, delta;
+ ulong_t rcnt;
+
+ new = gethrtime();
+ delta = new - kiop->rlastupdate;
+ kiop->rlastupdate = new;
+ rcnt = kiop->rcnt--;
+ ASSERT((int)rcnt > 0);
+ kiop->rlentime += delta * rcnt;
+ kiop->rtime += delta;
+}
+EXPORT_SYMBOL(kstat_runq_exit);
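These four hooks implement the standard two-stage Solaris I/O kstat model: a request first accumulates wait-queue time, then run-queue time while being serviced. A hedged sketch of how a consumer would bracket a request, assuming the kstat was created as KSTAT_TYPE_IO with ks_data pointing at a kstat_io_t; the helper names are illustrative:

/* Sketch: bracketing a request with the kstat_io_t hooks above. */
static void
sketch_io_queued(kstat_t *ksp)
{
	kstat_io_t *kiop = ksp->ks_data;

	mutex_enter(ksp->ks_lock);
	kstat_waitq_enter(kiop);	/* request queued, waiting for service */
	mutex_exit(ksp->ks_lock);
}

static void
sketch_io_issued(kstat_t *ksp)
{
	kstat_io_t *kiop = ksp->ks_data;

	mutex_enter(ksp->ks_lock);
	kstat_waitq_exit(kiop);		/* leaves the wait queue ... */
	kstat_runq_enter(kiop);		/* ... and starts being serviced */
	mutex_exit(ksp->ks_lock);
}

static void
sketch_io_done(kstat_t *ksp, uint64_t nbytes, boolean_t is_read)
{
	kstat_io_t *kiop = ksp->ks_data;

	mutex_enter(ksp->ks_lock);
	kstat_runq_exit(kiop);
	if (is_read) {
		kiop->reads++;
		kiop->nread += nbytes;
	} else {
		kiop->writes++;
		kiop->nwritten += nbytes;
	}
	mutex_exit(ksp->ks_lock);
}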
+
+static int
+kstat_seq_show_headers(struct seq_file *f)
+{
+ kstat_t *ksp = (kstat_t *)f->private;
+ int rc = 0;
+
+ ASSERT(ksp->ks_magic == KS_MAGIC);
+
+ seq_printf(f, "%d %d 0x%02x %d %d %lld %lld\n",
+ ksp->ks_kid, ksp->ks_type, ksp->ks_flags,
+ ksp->ks_ndata, (int)ksp->ks_data_size,
+ ksp->ks_crtime, ksp->ks_snaptime);
+
+ switch (ksp->ks_type) {
+ case KSTAT_TYPE_RAW:
+restart:
+ if (ksp->ks_raw_ops.headers) {
+ rc = ksp->ks_raw_ops.headers(
+ ksp->ks_raw_buf, ksp->ks_raw_bufsize);
+ if (rc == ENOMEM && !kstat_resize_raw(ksp))
+ goto restart;
+ if (!rc)
+ seq_puts(f, ksp->ks_raw_buf);
+ } else {
+ seq_printf(f, "raw data\n");
+ }
+ break;
+ case KSTAT_TYPE_NAMED:
+ seq_printf(f, "%-31s %-4s %s\n",
+ "name", "type", "data");
+ break;
+ case KSTAT_TYPE_INTR:
+ seq_printf(f, "%-8s %-8s %-8s %-8s %-8s\n",
+ "hard", "soft", "watchdog",
+ "spurious", "multsvc");
+ break;
+ case KSTAT_TYPE_IO:
+ seq_printf(f,
+ "%-8s %-8s %-8s %-8s %-8s %-8s "
+ "%-8s %-8s %-8s %-8s %-8s %-8s\n",
+ "nread", "nwritten", "reads", "writes",
+ "wtime", "wlentime", "wupdate",
+ "rtime", "rlentime", "rupdate",
+ "wcnt", "rcnt");
+ break;
+ case KSTAT_TYPE_TIMER:
+ seq_printf(f,
+ "%-31s %-8s "
+ "%-8s %-8s %-8s %-8s %-8s\n",
+ "name", "events", "elapsed",
+ "min", "max", "start", "stop");
+ break;
+ default:
+ PANIC("Undefined kstat type %d\n", ksp->ks_type);
+ }
+
+ return (-rc);
+}
+
+static int
+kstat_seq_show_raw(struct seq_file *f, unsigned char *p, int l)
+{
+ int i, j;
+
+ for (i = 0; ; i++) {
+ seq_printf(f, "%03x:", i);
+
+ for (j = 0; j < 16; j++) {
+ if (i * 16 + j >= l) {
+ seq_printf(f, "\n");
+ goto out;
+ }
+
+ seq_printf(f, " %02x", (unsigned char)p[i * 16 + j]);
+ }
+ seq_printf(f, "\n");
+ }
+out:
+ return (0);
+}
+
+static int
+kstat_seq_show_named(struct seq_file *f, kstat_named_t *knp)
+{
+ seq_printf(f, "%-31s %-4d ", knp->name, knp->data_type);
+
+ switch (knp->data_type) {
+ case KSTAT_DATA_CHAR:
+ knp->value.c[15] = '\0'; /* NULL terminate */
+ seq_printf(f, "%-16s", knp->value.c);
+ break;
+ /*
+	 * NOTE - We need to be more careful about what tokens are
+	 * used for each arch; for now this is correct for x86_64.
+ */
+ case KSTAT_DATA_INT32:
+ seq_printf(f, "%d", knp->value.i32);
+ break;
+ case KSTAT_DATA_UINT32:
+ seq_printf(f, "%u", knp->value.ui32);
+ break;
+ case KSTAT_DATA_INT64:
+ seq_printf(f, "%lld", (signed long long)knp->value.i64);
+ break;
+ case KSTAT_DATA_UINT64:
+ seq_printf(f, "%llu",
+ (unsigned long long)knp->value.ui64);
+ break;
+ case KSTAT_DATA_LONG:
+ seq_printf(f, "%ld", knp->value.l);
+ break;
+ case KSTAT_DATA_ULONG:
+ seq_printf(f, "%lu", knp->value.ul);
+ break;
+ case KSTAT_DATA_STRING:
+ KSTAT_NAMED_STR_PTR(knp)
+ [KSTAT_NAMED_STR_BUFLEN(knp)-1] = '\0';
+ seq_printf(f, "%s", KSTAT_NAMED_STR_PTR(knp));
+ break;
+ default:
+ PANIC("Undefined kstat data type %d\n", knp->data_type);
+ }
+
+ seq_printf(f, "\n");
+
+ return (0);
+}
+
+static int
+kstat_seq_show_intr(struct seq_file *f, kstat_intr_t *kip)
+{
+ seq_printf(f, "%-8u %-8u %-8u %-8u %-8u\n",
+ kip->intrs[KSTAT_INTR_HARD],
+ kip->intrs[KSTAT_INTR_SOFT],
+ kip->intrs[KSTAT_INTR_WATCHDOG],
+ kip->intrs[KSTAT_INTR_SPURIOUS],
+ kip->intrs[KSTAT_INTR_MULTSVC]);
+
+ return (0);
+}
+
+static int
+kstat_seq_show_io(struct seq_file *f, kstat_io_t *kip)
+{
+ seq_printf(f,
+ "%-8llu %-8llu %-8u %-8u %-8lld %-8lld "
+ "%-8lld %-8lld %-8lld %-8lld %-8u %-8u\n",
+ kip->nread, kip->nwritten,
+ kip->reads, kip->writes,
+ kip->wtime, kip->wlentime, kip->wlastupdate,
+ kip->rtime, kip->rlentime, kip->rlastupdate,
+ kip->wcnt, kip->rcnt);
+
+ return (0);
+}
+
+static int
+kstat_seq_show_timer(struct seq_file *f, kstat_timer_t *ktp)
+{
+ seq_printf(f,
+ "%-31s %-8llu %-8lld %-8lld %-8lld %-8lld %-8lld\n",
+ ktp->name, ktp->num_events, ktp->elapsed_time,
+ ktp->min_time, ktp->max_time,
+ ktp->start_time, ktp->stop_time);
+
+ return (0);
+}
+
+static int
+kstat_seq_show(struct seq_file *f, void *p)
+{
+ kstat_t *ksp = (kstat_t *)f->private;
+ int rc = 0;
+
+ ASSERT(ksp->ks_magic == KS_MAGIC);
+
+ switch (ksp->ks_type) {
+ case KSTAT_TYPE_RAW:
+restart:
+ if (ksp->ks_raw_ops.data) {
+ rc = ksp->ks_raw_ops.data(
+ ksp->ks_raw_buf, ksp->ks_raw_bufsize, p);
+ if (rc == ENOMEM && !kstat_resize_raw(ksp))
+ goto restart;
+ if (!rc)
+ seq_puts(f, ksp->ks_raw_buf);
+ } else {
+ ASSERT(ksp->ks_ndata == 1);
+ rc = kstat_seq_show_raw(f, ksp->ks_data,
+ ksp->ks_data_size);
+ }
+ break;
+ case KSTAT_TYPE_NAMED:
+ rc = kstat_seq_show_named(f, (kstat_named_t *)p);
+ break;
+ case KSTAT_TYPE_INTR:
+ rc = kstat_seq_show_intr(f, (kstat_intr_t *)p);
+ break;
+ case KSTAT_TYPE_IO:
+ rc = kstat_seq_show_io(f, (kstat_io_t *)p);
+ break;
+ case KSTAT_TYPE_TIMER:
+ rc = kstat_seq_show_timer(f, (kstat_timer_t *)p);
+ break;
+ default:
+ PANIC("Undefined kstat type %d\n", ksp->ks_type);
+ }
+
+ return (-rc);
+}
+
+static int
+kstat_default_update(kstat_t *ksp, int rw)
+{
+ ASSERT(ksp != NULL);
+
+ if (rw == KSTAT_WRITE)
+ return (EACCES);
+
+ return (0);
+}
+
+static void *
+kstat_seq_data_addr(kstat_t *ksp, loff_t n)
+{
+ void *rc = NULL;
+
+ switch (ksp->ks_type) {
+ case KSTAT_TYPE_RAW:
+ if (ksp->ks_raw_ops.addr)
+ rc = ksp->ks_raw_ops.addr(ksp, n);
+ else
+ rc = ksp->ks_data;
+ break;
+ case KSTAT_TYPE_NAMED:
+ rc = ksp->ks_data + n * sizeof (kstat_named_t);
+ break;
+ case KSTAT_TYPE_INTR:
+ rc = ksp->ks_data + n * sizeof (kstat_intr_t);
+ break;
+ case KSTAT_TYPE_IO:
+ rc = ksp->ks_data + n * sizeof (kstat_io_t);
+ break;
+ case KSTAT_TYPE_TIMER:
+ rc = ksp->ks_data + n * sizeof (kstat_timer_t);
+ break;
+ default:
+ PANIC("Undefined kstat type %d\n", ksp->ks_type);
+ }
+
+ return (rc);
+}
+
+static void *
+kstat_seq_start(struct seq_file *f, loff_t *pos)
+{
+ loff_t n = *pos;
+ kstat_t *ksp = (kstat_t *)f->private;
+ ASSERT(ksp->ks_magic == KS_MAGIC);
+
+ mutex_enter(ksp->ks_lock);
+
+ if (ksp->ks_type == KSTAT_TYPE_RAW) {
+ ksp->ks_raw_bufsize = PAGE_SIZE;
+ ksp->ks_raw_buf = vmem_alloc(ksp->ks_raw_bufsize, KM_SLEEP);
+ }
+
+	/* Dynamically update the kstat; on error the existing data is used */
+ (void) ksp->ks_update(ksp, KSTAT_READ);
+
+ ksp->ks_snaptime = gethrtime();
+
+ if (!n && kstat_seq_show_headers(f))
+ return (NULL);
+
+ if (n >= ksp->ks_ndata)
+ return (NULL);
+
+ return (kstat_seq_data_addr(ksp, n));
+}
+
+static void *
+kstat_seq_next(struct seq_file *f, void *p, loff_t *pos)
+{
+ kstat_t *ksp = (kstat_t *)f->private;
+ ASSERT(ksp->ks_magic == KS_MAGIC);
+
+ ++*pos;
+ if (*pos >= ksp->ks_ndata)
+ return (NULL);
+
+ return (kstat_seq_data_addr(ksp, *pos));
+}
+
+static void
+kstat_seq_stop(struct seq_file *f, void *v)
+{
+ kstat_t *ksp = (kstat_t *)f->private;
+ ASSERT(ksp->ks_magic == KS_MAGIC);
+
+ if (ksp->ks_type == KSTAT_TYPE_RAW)
+ vmem_free(ksp->ks_raw_buf, ksp->ks_raw_bufsize);
+
+ mutex_exit(ksp->ks_lock);
+}
+
+static struct seq_operations kstat_seq_ops = {
+ .show = kstat_seq_show,
+ .start = kstat_seq_start,
+ .next = kstat_seq_next,
+ .stop = kstat_seq_stop,
+};
+
+static kstat_module_t *
+kstat_find_module(char *name)
+{
+ kstat_module_t *module;
+
+ list_for_each_entry(module, &kstat_module_list, ksm_module_list) {
+ if (strncmp(name, module->ksm_name, KSTAT_STRLEN) == 0)
+ return (module);
+ }
+
+ return (NULL);
+}
+
+static kstat_module_t *
+kstat_create_module(char *name)
+{
+ kstat_module_t *module;
+ struct proc_dir_entry *pde;
+
+ pde = proc_mkdir(name, proc_spl_kstat);
+ if (pde == NULL)
+ return (NULL);
+
+ module = kmem_alloc(sizeof (kstat_module_t), KM_SLEEP);
+ module->ksm_proc = pde;
+ strlcpy(module->ksm_name, name, KSTAT_STRLEN+1);
+ INIT_LIST_HEAD(&module->ksm_kstat_list);
+ list_add_tail(&module->ksm_module_list, &kstat_module_list);
+
+ return (module);
+}
+
+static void
+kstat_delete_module(kstat_module_t *module)
+{
+ ASSERT(list_empty(&module->ksm_kstat_list));
+ remove_proc_entry(module->ksm_name, proc_spl_kstat);
+ list_del(&module->ksm_module_list);
+ kmem_free(module, sizeof (kstat_module_t));
+}
+
+static int
+proc_kstat_open(struct inode *inode, struct file *filp)
+{
+ struct seq_file *f;
+ int rc;
+
+ rc = seq_open(filp, &kstat_seq_ops);
+ if (rc)
+ return (rc);
+
+ f = filp->private_data;
+ f->private = PDE_DATA(inode);
+
+ return (rc);
+}
+
+static ssize_t
+proc_kstat_write(struct file *filp, const char __user *buf, size_t len,
+ loff_t *ppos)
+{
+ struct seq_file *f = filp->private_data;
+ kstat_t *ksp = f->private;
+ int rc;
+
+ ASSERT(ksp->ks_magic == KS_MAGIC);
+
+ mutex_enter(ksp->ks_lock);
+ rc = ksp->ks_update(ksp, KSTAT_WRITE);
+ mutex_exit(ksp->ks_lock);
+
+ if (rc)
+ return (-rc);
+
+ *ppos += len;
+ return (len);
+}
+
+static struct file_operations proc_kstat_operations = {
+ .open = proc_kstat_open,
+ .write = proc_kstat_write,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+void
+__kstat_set_raw_ops(kstat_t *ksp,
+ int (*headers)(char *buf, size_t size),
+ int (*data)(char *buf, size_t size, void *data),
+ void *(*addr)(kstat_t *ksp, loff_t index))
+{
+ ksp->ks_raw_ops.headers = headers;
+ ksp->ks_raw_ops.data = data;
+ ksp->ks_raw_ops.addr = addr;
+}
+EXPORT_SYMBOL(__kstat_set_raw_ops);
+
+kstat_t *
+__kstat_create(const char *ks_module, int ks_instance, const char *ks_name,
+ const char *ks_class, uchar_t ks_type, uint_t ks_ndata,
+ uchar_t ks_flags)
+{
+ kstat_t *ksp;
+
+ ASSERT(ks_module);
+ ASSERT(ks_instance == 0);
+ ASSERT(ks_name);
+ ASSERT(!(ks_flags & KSTAT_FLAG_UNSUPPORTED));
+
+ if ((ks_type == KSTAT_TYPE_INTR) || (ks_type == KSTAT_TYPE_IO))
+ ASSERT(ks_ndata == 1);
+
+ ksp = kmem_zalloc(sizeof (*ksp), KM_SLEEP);
+ if (ksp == NULL)
+ return (ksp);
+
+ mutex_enter(&kstat_module_lock);
+ ksp->ks_kid = kstat_id;
+ kstat_id++;
+ mutex_exit(&kstat_module_lock);
+
+ ksp->ks_magic = KS_MAGIC;
+ mutex_init(&ksp->ks_private_lock, NULL, MUTEX_DEFAULT, NULL);
+ ksp->ks_lock = &ksp->ks_private_lock;
+ INIT_LIST_HEAD(&ksp->ks_list);
+
+ ksp->ks_crtime = gethrtime();
+ ksp->ks_snaptime = ksp->ks_crtime;
+ strncpy(ksp->ks_module, ks_module, KSTAT_STRLEN);
+ ksp->ks_instance = ks_instance;
+ strncpy(ksp->ks_name, ks_name, KSTAT_STRLEN);
+ strncpy(ksp->ks_class, ks_class, KSTAT_STRLEN);
+ ksp->ks_type = ks_type;
+ ksp->ks_flags = ks_flags;
+ ksp->ks_update = kstat_default_update;
+ ksp->ks_private = NULL;
+ ksp->ks_raw_ops.headers = NULL;
+ ksp->ks_raw_ops.data = NULL;
+ ksp->ks_raw_ops.addr = NULL;
+ ksp->ks_raw_buf = NULL;
+ ksp->ks_raw_bufsize = 0;
+
+ switch (ksp->ks_type) {
+ case KSTAT_TYPE_RAW:
+ ksp->ks_ndata = 1;
+ ksp->ks_data_size = ks_ndata;
+ break;
+ case KSTAT_TYPE_NAMED:
+ ksp->ks_ndata = ks_ndata;
+ ksp->ks_data_size = ks_ndata * sizeof (kstat_named_t);
+ break;
+ case KSTAT_TYPE_INTR:
+ ksp->ks_ndata = ks_ndata;
+ ksp->ks_data_size = ks_ndata * sizeof (kstat_intr_t);
+ break;
+ case KSTAT_TYPE_IO:
+ ksp->ks_ndata = ks_ndata;
+ ksp->ks_data_size = ks_ndata * sizeof (kstat_io_t);
+ break;
+ case KSTAT_TYPE_TIMER:
+ ksp->ks_ndata = ks_ndata;
+ ksp->ks_data_size = ks_ndata * sizeof (kstat_timer_t);
+ break;
+ default:
+ PANIC("Undefined kstat type %d\n", ksp->ks_type);
+ }
+
+ if (ksp->ks_flags & KSTAT_FLAG_VIRTUAL) {
+ ksp->ks_data = NULL;
+ } else {
+ ksp->ks_data = kmem_zalloc(ksp->ks_data_size, KM_SLEEP);
+ if (ksp->ks_data == NULL) {
+ kmem_free(ksp, sizeof (*ksp));
+ ksp = NULL;
+ }
+ }
+
+ return (ksp);
+}
+EXPORT_SYMBOL(__kstat_create);
+
+static int
+kstat_detect_collision(kstat_t *ksp)
+{
+ kstat_module_t *module;
+ kstat_t *tmp;
+ char *parent;
+ char *cp;
+
+ parent = kmem_asprintf("%s", ksp->ks_module);
+
+ if ((cp = strrchr(parent, '/')) == NULL) {
+ strfree(parent);
+ return (0);
+ }
+
+ cp[0] = '\0';
+ if ((module = kstat_find_module(parent)) != NULL) {
+ list_for_each_entry(tmp, &module->ksm_kstat_list, ks_list) {
+ if (strncmp(tmp->ks_name, cp+1, KSTAT_STRLEN) == 0) {
+ strfree(parent);
+ return (EEXIST);
+ }
+ }
+ }
+
+ strfree(parent);
+ return (0);
+}
+
+void
+__kstat_install(kstat_t *ksp)
+{
+ kstat_module_t *module;
+ kstat_t *tmp;
+
+ ASSERT(ksp);
+
+ mutex_enter(&kstat_module_lock);
+
+ module = kstat_find_module(ksp->ks_module);
+ if (module == NULL) {
+ if (kstat_detect_collision(ksp) != 0) {
+ cmn_err(CE_WARN, "kstat_create('%s', '%s'): namespace" \
+ " collision", ksp->ks_module, ksp->ks_name);
+ goto out;
+ }
+ module = kstat_create_module(ksp->ks_module);
+ if (module == NULL)
+ goto out;
+ }
+
+ /*
+	 * Only one entry by this name is allowed per module; on failure the
+	 * module shouldn't be deleted because we know it has at least one entry.
+ * shouldn't be deleted because we know it has at least one entry.
+ */
+ list_for_each_entry(tmp, &module->ksm_kstat_list, ks_list) {
+ if (strncmp(tmp->ks_name, ksp->ks_name, KSTAT_STRLEN) == 0)
+ goto out;
+ }
+
+ list_add_tail(&ksp->ks_list, &module->ksm_kstat_list);
+
+ mutex_enter(ksp->ks_lock);
+ ksp->ks_owner = module;
+ ksp->ks_proc = proc_create_data(ksp->ks_name, 0644,
+ module->ksm_proc, &proc_kstat_operations, (void *)ksp);
+ if (ksp->ks_proc == NULL) {
+ list_del_init(&ksp->ks_list);
+ if (list_empty(&module->ksm_kstat_list))
+ kstat_delete_module(module);
+ }
+ mutex_exit(ksp->ks_lock);
+out:
+ mutex_exit(&kstat_module_lock);
+}
+EXPORT_SYMBOL(__kstat_install);
+
+void
+__kstat_delete(kstat_t *ksp)
+{
+ kstat_module_t *module = ksp->ks_owner;
+
+ mutex_enter(&kstat_module_lock);
+ list_del_init(&ksp->ks_list);
+ mutex_exit(&kstat_module_lock);
+
+ if (ksp->ks_proc) {
+ remove_proc_entry(ksp->ks_name, module->ksm_proc);
+
+ /* Remove top level module directory if it's empty */
+ if (list_empty(&module->ksm_kstat_list))
+ kstat_delete_module(module);
+ }
+
+ if (!(ksp->ks_flags & KSTAT_FLAG_VIRTUAL))
+ kmem_free(ksp->ks_data, ksp->ks_data_size);
+
+ ksp->ks_lock = NULL;
+ mutex_destroy(&ksp->ks_private_lock);
+ kmem_free(ksp, sizeof (*ksp));
+}
+EXPORT_SYMBOL(__kstat_delete);
+
+int
+spl_kstat_init(void)
+{
+ mutex_init(&kstat_module_lock, NULL, MUTEX_DEFAULT, NULL);
+ INIT_LIST_HEAD(&kstat_module_list);
+ kstat_id = 0;
+ return (0);
+}
+
+void
+spl_kstat_fini(void)
+{
+ ASSERT(list_empty(&kstat_module_list));
+ mutex_destroy(&kstat_module_lock);
+}
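Together, __kstat_create(), __kstat_install(), and __kstat_delete() provide the whole kstat life cycle and back each entry with a file under /proc/spl/kstat/<module>/. A minimal sketch of a named kstat using the double-underscore entry points whose signatures appear above (consumers normally reach them through kstat_create()/kstat_install() wrapper macros, assumed to live in the kstat header; the names below are illustrative):

/* Sketch: publish a single named counter under /proc/spl/kstat/sketch/. */
static kstat_t *sketch_ksp;

static int
sketch_kstat_init(void)
{
	kstat_named_t *kn;

	sketch_ksp = __kstat_create("sketch", 0, "stats", "misc",
	    KSTAT_TYPE_NAMED, 1, 0);
	if (sketch_ksp == NULL)
		return (ENOMEM);

	kn = sketch_ksp->ks_data;
	strncpy(kn->name, "ops", KSTAT_STRLEN);
	kn->data_type = KSTAT_DATA_UINT64;
	kn->value.ui64 = 0;

	__kstat_install(sketch_ksp);
	return (0);
}

static void
sketch_kstat_fini(void)
{
	if (sketch_ksp != NULL)
		__kstat_delete(sketch_ksp);
}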
diff --git a/module/spl/spl-mutex.c b/module/spl/spl-mutex.c
new file mode 100644
index 000000000..ba818862b
--- /dev/null
+++ b/module/spl/spl-mutex.c
@@ -0,0 +1,30 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <[email protected]>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Solaris Porting Layer (SPL) Mutex Implementation.
+ */
+
+#include <sys/mutex.h>
+
+int spl_mutex_init(void) { return 0; }
+void spl_mutex_fini(void) { }
diff --git a/module/spl/spl-proc.c b/module/spl/spl-proc.c
new file mode 100644
index 000000000..9c52924a4
--- /dev/null
+++ b/module/spl/spl-proc.c
@@ -0,0 +1,782 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <[email protected]>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Solaris Porting Layer (SPL) Proc Implementation.
+ */
+
+#include <sys/systeminfo.h>
+#include <sys/kstat.h>
+#include <sys/kmem.h>
+#include <sys/kmem_cache.h>
+#include <sys/vmem.h>
+#include <sys/taskq.h>
+#include <sys/proc.h>
+#include <linux/ctype.h>
+#include <linux/kmod.h>
+#include <linux/seq_file.h>
+#include <linux/uaccess.h>
+#include <linux/version.h>
+
+#if defined(CONSTIFY_PLUGIN) && LINUX_VERSION_CODE >= KERNEL_VERSION(3, 8, 0)
+typedef struct ctl_table __no_const spl_ctl_table;
+#else
+typedef struct ctl_table spl_ctl_table;
+#endif
+
+static unsigned long table_min = 0;
+static unsigned long table_max = ~0;
+
+static struct ctl_table_header *spl_header = NULL;
+static struct proc_dir_entry *proc_spl = NULL;
+static struct proc_dir_entry *proc_spl_kmem = NULL;
+static struct proc_dir_entry *proc_spl_kmem_slab = NULL;
+static struct proc_dir_entry *proc_spl_taskq_all = NULL;
+static struct proc_dir_entry *proc_spl_taskq = NULL;
+struct proc_dir_entry *proc_spl_kstat = NULL;
+
+static int
+proc_copyin_string(char *kbuffer, int kbuffer_size, const char *ubuffer,
+ int ubuffer_size)
+{
+ int size;
+
+ if (ubuffer_size > kbuffer_size)
+ return (-EOVERFLOW);
+
+ if (copy_from_user((void *)kbuffer, (void *)ubuffer, ubuffer_size))
+ return (-EFAULT);
+
+ /* strip trailing whitespace */
+ size = strnlen(kbuffer, ubuffer_size);
+ while (size-- >= 0)
+ if (!isspace(kbuffer[size]))
+ break;
+
+ /* empty string */
+ if (size < 0)
+ return (-EINVAL);
+
+ /* no space to terminate */
+ if (size == kbuffer_size)
+ return (-EOVERFLOW);
+
+ kbuffer[size + 1] = 0;
+ return (0);
+}
+
+static int
+proc_copyout_string(char *ubuffer, int ubuffer_size, const char *kbuffer,
+ char *append)
+{
+ /*
+	 * NB if 'append' != NULL, it's a single character to append to the
+	 * copied-out string - usually "\n" for /proc entries and "" (i.e. a
+	 * terminating zero byte) for sysctl entries.
+ */
+ int size = MIN(strlen(kbuffer), ubuffer_size);
+
+ if (copy_to_user(ubuffer, kbuffer, size))
+ return (-EFAULT);
+
+ if (append != NULL && size < ubuffer_size) {
+ if (copy_to_user(ubuffer + size, append, 1))
+ return (-EFAULT);
+
+ size++;
+ }
+
+ return (size);
+}
+
+#ifdef DEBUG_KMEM
+static int
+proc_domemused(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ int rc = 0;
+ unsigned long min = 0, max = ~0, val;
+ spl_ctl_table dummy = *table;
+
+ dummy.data = &val;
+ dummy.proc_handler = &proc_dointvec;
+ dummy.extra1 = &min;
+ dummy.extra2 = &max;
+
+ if (write) {
+ *ppos += *lenp;
+ } else {
+#ifdef HAVE_ATOMIC64_T
+ val = atomic64_read((atomic64_t *)table->data);
+#else
+ val = atomic_read((atomic_t *)table->data);
+#endif /* HAVE_ATOMIC64_T */
+ rc = proc_doulongvec_minmax(&dummy, write, buffer, lenp, ppos);
+ }
+
+ return (rc);
+}
+#endif /* DEBUG_KMEM */
+
+static int
+proc_doslab(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ int rc = 0;
+ unsigned long min = 0, max = ~0, val = 0, mask;
+ spl_ctl_table dummy = *table;
+ spl_kmem_cache_t *skc;
+
+ dummy.data = &val;
+ dummy.proc_handler = &proc_dointvec;
+ dummy.extra1 = &min;
+ dummy.extra2 = &max;
+
+ if (write) {
+ *ppos += *lenp;
+ } else {
+ down_read(&spl_kmem_cache_sem);
+ mask = (unsigned long)table->data;
+
+ list_for_each_entry(skc, &spl_kmem_cache_list, skc_list) {
+
+ /* Only use slabs of the correct kmem/vmem type */
+ if (!(skc->skc_flags & mask))
+ continue;
+
+ /* Sum the specified field for selected slabs */
+ switch (mask & (KMC_TOTAL | KMC_ALLOC | KMC_MAX)) {
+ case KMC_TOTAL:
+ val += skc->skc_slab_size * skc->skc_slab_total;
+ break;
+ case KMC_ALLOC:
+ val += skc->skc_obj_size * skc->skc_obj_alloc;
+ break;
+ case KMC_MAX:
+ val += skc->skc_obj_size * skc->skc_obj_max;
+ break;
+ }
+ }
+
+ up_read(&spl_kmem_cache_sem);
+ rc = proc_doulongvec_minmax(&dummy, write, buffer, lenp, ppos);
+ }
+
+ return (rc);
+}
+
+static int
+proc_dohostid(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ int len, rc = 0;
+ char *end, str[32];
+
+ if (write) {
+ /*
+		 * We can't use proc_doulongvec_minmax() in the write
+		 * case here because hostid, while a hex value, has no
+		 * leading 0x, which confuses the helper function.
+ */
+ rc = proc_copyin_string(str, sizeof (str), buffer, *lenp);
+ if (rc < 0)
+ return (rc);
+
+ spl_hostid = simple_strtoul(str, &end, 16);
+ if (str == end)
+ return (-EINVAL);
+
+ } else {
+ len = snprintf(str, sizeof (str), "%lx",
+ (unsigned long) zone_get_hostid(NULL));
+ if (*ppos >= len)
+ rc = 0;
+ else
+ rc = proc_copyout_string(buffer,
+ *lenp, str + *ppos, "\n");
+
+ if (rc >= 0) {
+ *lenp = rc;
+ *ppos += rc;
+ }
+ }
+
+ return (rc);
+}
+
+static void
+taskq_seq_show_headers(struct seq_file *f)
+{
+ seq_printf(f, "%-25s %5s %5s %5s %5s %5s %5s %12s %5s %10s\n",
+ "taskq", "act", "nthr", "spwn", "maxt", "pri",
+ "mina", "maxa", "cura", "flags");
+}
+
+/* indices into the lheads array below */
+#define LHEAD_PEND 0
+#define LHEAD_PRIO 1
+#define LHEAD_DELAY 2
+#define LHEAD_WAIT 3
+#define LHEAD_ACTIVE 4
+#define LHEAD_SIZE 5
+
+/* BEGIN CSTYLED */
+static unsigned int spl_max_show_tasks = 512;
+module_param(spl_max_show_tasks, uint, 0644);
+MODULE_PARM_DESC(spl_max_show_tasks, "Max number of tasks shown in taskq proc");
+/* END CSTYLED */
+
+static int
+taskq_seq_show_impl(struct seq_file *f, void *p, boolean_t allflag)
+{
+ taskq_t *tq = p;
+ taskq_thread_t *tqt;
+ spl_wait_queue_entry_t *wq;
+ struct task_struct *tsk;
+ taskq_ent_t *tqe;
+ char name[100];
+ struct list_head *lheads[LHEAD_SIZE], *lh;
+ static char *list_names[LHEAD_SIZE] =
+ {"pend", "prio", "delay", "wait", "active" };
+ int i, j, have_lheads = 0;
+ unsigned long wflags, flags;
+
+ spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class);
+ spin_lock_irqsave(&tq->tq_wait_waitq.lock, wflags);
+
+ /* get the various lists and check whether they're empty */
+ lheads[LHEAD_PEND] = &tq->tq_pend_list;
+ lheads[LHEAD_PRIO] = &tq->tq_prio_list;
+ lheads[LHEAD_DELAY] = &tq->tq_delay_list;
+#ifdef HAVE_WAIT_QUEUE_HEAD_ENTRY
+ lheads[LHEAD_WAIT] = &tq->tq_wait_waitq.head;
+#else
+ lheads[LHEAD_WAIT] = &tq->tq_wait_waitq.task_list;
+#endif
+ lheads[LHEAD_ACTIVE] = &tq->tq_active_list;
+
+ for (i = 0; i < LHEAD_SIZE; ++i) {
+ if (list_empty(lheads[i]))
+ lheads[i] = NULL;
+ else
+ ++have_lheads;
+ }
+
+ /* early return in non-"all" mode if lists are all empty */
+ if (!allflag && !have_lheads) {
+ spin_unlock_irqrestore(&tq->tq_wait_waitq.lock, wflags);
+ spin_unlock_irqrestore(&tq->tq_lock, flags);
+ return (0);
+ }
+
+ /* unlock the waitq quickly */
+ if (!lheads[LHEAD_WAIT])
+ spin_unlock_irqrestore(&tq->tq_wait_waitq.lock, wflags);
+
+ /* show the base taskq contents */
+ snprintf(name, sizeof (name), "%s/%d", tq->tq_name, tq->tq_instance);
+ seq_printf(f, "%-25s ", name);
+ seq_printf(f, "%5d %5d %5d %5d %5d %5d %12d %5d %10x\n",
+ tq->tq_nactive, tq->tq_nthreads, tq->tq_nspawn,
+ tq->tq_maxthreads, tq->tq_pri, tq->tq_minalloc, tq->tq_maxalloc,
+ tq->tq_nalloc, tq->tq_flags);
+
+ /* show the active list */
+ if (lheads[LHEAD_ACTIVE]) {
+ j = 0;
+ list_for_each_entry(tqt, &tq->tq_active_list, tqt_active_list) {
+ if (j == 0)
+ seq_printf(f, "\t%s:",
+ list_names[LHEAD_ACTIVE]);
+ else if (j == 2) {
+ seq_printf(f, "\n\t ");
+ j = 0;
+ }
+ seq_printf(f, " [%d]%pf(%ps)",
+ tqt->tqt_thread->pid,
+ tqt->tqt_task->tqent_func,
+ tqt->tqt_task->tqent_arg);
+ ++j;
+ }
+ seq_printf(f, "\n");
+ }
+
+ for (i = LHEAD_PEND; i <= LHEAD_WAIT; ++i)
+ if (lheads[i]) {
+ j = 0;
+ list_for_each(lh, lheads[i]) {
+ if (spl_max_show_tasks != 0 &&
+ j >= spl_max_show_tasks) {
+ seq_printf(f, "\n\t(truncated)");
+ break;
+ }
+ /* show the wait waitq list */
+ if (i == LHEAD_WAIT) {
+#ifdef HAVE_WAIT_QUEUE_HEAD_ENTRY
+ wq = list_entry(lh,
+ spl_wait_queue_entry_t, entry);
+#else
+ wq = list_entry(lh,
+ spl_wait_queue_entry_t, task_list);
+#endif
+ if (j == 0)
+ seq_printf(f, "\t%s:",
+ list_names[i]);
+ else if (j % 8 == 0)
+ seq_printf(f, "\n\t ");
+
+ tsk = wq->private;
+ seq_printf(f, " %d", tsk->pid);
+ /* pend, prio and delay lists */
+ } else {
+ tqe = list_entry(lh, taskq_ent_t,
+ tqent_list);
+ if (j == 0)
+ seq_printf(f, "\t%s:",
+ list_names[i]);
+ else if (j % 2 == 0)
+ seq_printf(f, "\n\t ");
+
+ seq_printf(f, " %pf(%ps)",
+ tqe->tqent_func,
+ tqe->tqent_arg);
+ }
+ ++j;
+ }
+ seq_printf(f, "\n");
+ }
+ if (lheads[LHEAD_WAIT])
+ spin_unlock_irqrestore(&tq->tq_wait_waitq.lock, wflags);
+ spin_unlock_irqrestore(&tq->tq_lock, flags);
+
+ return (0);
+}
+
+static int
+taskq_all_seq_show(struct seq_file *f, void *p)
+{
+ return (taskq_seq_show_impl(f, p, B_TRUE));
+}
+
+static int
+taskq_seq_show(struct seq_file *f, void *p)
+{
+ return (taskq_seq_show_impl(f, p, B_FALSE));
+}
+
+static void *
+taskq_seq_start(struct seq_file *f, loff_t *pos)
+{
+ struct list_head *p;
+ loff_t n = *pos;
+
+ down_read(&tq_list_sem);
+ if (!n)
+ taskq_seq_show_headers(f);
+
+ p = tq_list.next;
+ while (n--) {
+ p = p->next;
+ if (p == &tq_list)
+ return (NULL);
+ }
+
+ return (list_entry(p, taskq_t, tq_taskqs));
+}
+
+static void *
+taskq_seq_next(struct seq_file *f, void *p, loff_t *pos)
+{
+ taskq_t *tq = p;
+
+ ++*pos;
+ return ((tq->tq_taskqs.next == &tq_list) ?
+ NULL : list_entry(tq->tq_taskqs.next, taskq_t, tq_taskqs));
+}
+
+static void
+slab_seq_show_headers(struct seq_file *f)
+{
+ seq_printf(f,
+ "--------------------- cache ----------"
+ "--------------------------------------------- "
+ "----- slab ------ "
+ "---- object ----- "
+ "--- emergency ---\n");
+ seq_printf(f,
+ "name "
+ " flags size alloc slabsize objsize "
+ "total alloc max "
+ "total alloc max "
+ "dlock alloc max\n");
+}
+
+static int
+slab_seq_show(struct seq_file *f, void *p)
+{
+ spl_kmem_cache_t *skc = p;
+
+ ASSERT(skc->skc_magic == SKC_MAGIC);
+
+ /*
+	 * Backed by the Linux slab allocator; see /proc/slabinfo.
+ */
+ if (skc->skc_flags & KMC_SLAB)
+ return (0);
+
+ spin_lock(&skc->skc_lock);
+ seq_printf(f, "%-36s ", skc->skc_name);
+ seq_printf(f, "0x%05lx %9lu %9lu %8u %8u "
+ "%5lu %5lu %5lu %5lu %5lu %5lu %5lu %5lu %5lu\n",
+ (long unsigned)skc->skc_flags,
+ (long unsigned)(skc->skc_slab_size * skc->skc_slab_total),
+ (long unsigned)(skc->skc_obj_size * skc->skc_obj_alloc),
+ (unsigned)skc->skc_slab_size,
+ (unsigned)skc->skc_obj_size,
+ (long unsigned)skc->skc_slab_total,
+ (long unsigned)skc->skc_slab_alloc,
+ (long unsigned)skc->skc_slab_max,
+ (long unsigned)skc->skc_obj_total,
+ (long unsigned)skc->skc_obj_alloc,
+ (long unsigned)skc->skc_obj_max,
+ (long unsigned)skc->skc_obj_deadlock,
+ (long unsigned)skc->skc_obj_emergency,
+ (long unsigned)skc->skc_obj_emergency_max);
+
+ spin_unlock(&skc->skc_lock);
+
+ return (0);
+}
+
+static void *
+slab_seq_start(struct seq_file *f, loff_t *pos)
+{
+ struct list_head *p;
+ loff_t n = *pos;
+
+ down_read(&spl_kmem_cache_sem);
+ if (!n)
+ slab_seq_show_headers(f);
+
+ p = spl_kmem_cache_list.next;
+ while (n--) {
+ p = p->next;
+ if (p == &spl_kmem_cache_list)
+ return (NULL);
+ }
+
+ return (list_entry(p, spl_kmem_cache_t, skc_list));
+}
+
+static void *
+slab_seq_next(struct seq_file *f, void *p, loff_t *pos)
+{
+ spl_kmem_cache_t *skc = p;
+
+ ++*pos;
+ return ((skc->skc_list.next == &spl_kmem_cache_list) ?
+ NULL : list_entry(skc->skc_list.next, spl_kmem_cache_t, skc_list));
+}
+
+static void
+slab_seq_stop(struct seq_file *f, void *v)
+{
+ up_read(&spl_kmem_cache_sem);
+}
+
+static struct seq_operations slab_seq_ops = {
+ .show = slab_seq_show,
+ .start = slab_seq_start,
+ .next = slab_seq_next,
+ .stop = slab_seq_stop,
+};
+
+static int
+proc_slab_open(struct inode *inode, struct file *filp)
+{
+ return (seq_open(filp, &slab_seq_ops));
+}
+
+static struct file_operations proc_slab_operations = {
+ .open = proc_slab_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static void
+taskq_seq_stop(struct seq_file *f, void *v)
+{
+ up_read(&tq_list_sem);
+}
+
+static struct seq_operations taskq_all_seq_ops = {
+ .show = taskq_all_seq_show,
+ .start = taskq_seq_start,
+ .next = taskq_seq_next,
+ .stop = taskq_seq_stop,
+};
+
+static struct seq_operations taskq_seq_ops = {
+ .show = taskq_seq_show,
+ .start = taskq_seq_start,
+ .next = taskq_seq_next,
+ .stop = taskq_seq_stop,
+};
+
+static int
+proc_taskq_all_open(struct inode *inode, struct file *filp)
+{
+ return (seq_open(filp, &taskq_all_seq_ops));
+}
+
+static int
+proc_taskq_open(struct inode *inode, struct file *filp)
+{
+ return (seq_open(filp, &taskq_seq_ops));
+}
+
+static struct file_operations proc_taskq_all_operations = {
+ .open = proc_taskq_all_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static struct file_operations proc_taskq_operations = {
+ .open = proc_taskq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static struct ctl_table spl_kmem_table[] = {
+#ifdef DEBUG_KMEM
+ {
+ .procname = "kmem_used",
+ .data = &kmem_alloc_used,
+#ifdef HAVE_ATOMIC64_T
+ .maxlen = sizeof (atomic64_t),
+#else
+ .maxlen = sizeof (atomic_t),
+#endif /* HAVE_ATOMIC64_T */
+ .mode = 0444,
+ .proc_handler = &proc_domemused,
+ },
+ {
+ .procname = "kmem_max",
+ .data = &kmem_alloc_max,
+ .maxlen = sizeof (unsigned long),
+ .extra1 = &table_min,
+ .extra2 = &table_max,
+ .mode = 0444,
+ .proc_handler = &proc_doulongvec_minmax,
+ },
+#endif /* DEBUG_KMEM */
+ {
+ .procname = "slab_kmem_total",
+ .data = (void *)(KMC_KMEM | KMC_TOTAL),
+ .maxlen = sizeof (unsigned long),
+ .extra1 = &table_min,
+ .extra2 = &table_max,
+ .mode = 0444,
+ .proc_handler = &proc_doslab,
+ },
+ {
+ .procname = "slab_kmem_alloc",
+ .data = (void *)(KMC_KMEM | KMC_ALLOC),
+ .maxlen = sizeof (unsigned long),
+ .extra1 = &table_min,
+ .extra2 = &table_max,
+ .mode = 0444,
+ .proc_handler = &proc_doslab,
+ },
+ {
+ .procname = "slab_kmem_max",
+ .data = (void *)(KMC_KMEM | KMC_MAX),
+ .maxlen = sizeof (unsigned long),
+ .extra1 = &table_min,
+ .extra2 = &table_max,
+ .mode = 0444,
+ .proc_handler = &proc_doslab,
+ },
+ {
+ .procname = "slab_vmem_total",
+ .data = (void *)(KMC_VMEM | KMC_TOTAL),
+ .maxlen = sizeof (unsigned long),
+ .extra1 = &table_min,
+ .extra2 = &table_max,
+ .mode = 0444,
+ .proc_handler = &proc_doslab,
+ },
+ {
+ .procname = "slab_vmem_alloc",
+ .data = (void *)(KMC_VMEM | KMC_ALLOC),
+ .maxlen = sizeof (unsigned long),
+ .extra1 = &table_min,
+ .extra2 = &table_max,
+ .mode = 0444,
+ .proc_handler = &proc_doslab,
+ },
+ {
+ .procname = "slab_vmem_max",
+ .data = (void *)(KMC_VMEM | KMC_MAX),
+ .maxlen = sizeof (unsigned long),
+ .extra1 = &table_min,
+ .extra2 = &table_max,
+ .mode = 0444,
+ .proc_handler = &proc_doslab,
+ },
+ {},
+};
+
+static struct ctl_table spl_kstat_table[] = {
+ {},
+};
+
+static struct ctl_table spl_table[] = {
+ /*
+ * NB No .strategy entries have been provided since
+ * sysctl(8) prefers to go via /proc for portability.
+ */
+ {
+ .procname = "version",
+ .data = spl_version,
+ .maxlen = sizeof (spl_version),
+ .mode = 0444,
+ .proc_handler = &proc_dostring,
+ },
+ {
+ .procname = "hostid",
+ .data = &spl_hostid,
+ .maxlen = sizeof (unsigned long),
+ .mode = 0644,
+ .proc_handler = &proc_dohostid,
+ },
+ {
+ .procname = "kmem",
+ .mode = 0555,
+ .child = spl_kmem_table,
+ },
+ {
+ .procname = "kstat",
+ .mode = 0555,
+ .child = spl_kstat_table,
+ },
+ {},
+};
+
+static struct ctl_table spl_dir[] = {
+ {
+ .procname = "spl",
+ .mode = 0555,
+ .child = spl_table,
+ },
+ {}
+};
+
+static struct ctl_table spl_root[] = {
+ {
+#ifdef HAVE_CTL_NAME
+ .ctl_name = CTL_KERN,
+#endif
+ .procname = "kernel",
+ .mode = 0555,
+ .child = spl_dir,
+ },
+ {}
+};
+
+int
+spl_proc_init(void)
+{
+ int rc = 0;
+
+ spl_header = register_sysctl_table(spl_root);
+ if (spl_header == NULL)
+ return (-EUNATCH);
+
+ proc_spl = proc_mkdir("spl", NULL);
+ if (proc_spl == NULL) {
+ rc = -EUNATCH;
+ goto out;
+ }
+
+ proc_spl_taskq_all = proc_create_data("taskq-all", 0444, proc_spl,
+ &proc_taskq_all_operations, NULL);
+ if (proc_spl_taskq_all == NULL) {
+ rc = -EUNATCH;
+ goto out;
+ }
+
+ proc_spl_taskq = proc_create_data("taskq", 0444, proc_spl,
+ &proc_taskq_operations, NULL);
+ if (proc_spl_taskq == NULL) {
+ rc = -EUNATCH;
+ goto out;
+ }
+
+ proc_spl_kmem = proc_mkdir("kmem", proc_spl);
+ if (proc_spl_kmem == NULL) {
+ rc = -EUNATCH;
+ goto out;
+ }
+
+ proc_spl_kmem_slab = proc_create_data("slab", 0444, proc_spl_kmem,
+ &proc_slab_operations, NULL);
+ if (proc_spl_kmem_slab == NULL) {
+ rc = -EUNATCH;
+ goto out;
+ }
+
+ proc_spl_kstat = proc_mkdir("kstat", proc_spl);
+ if (proc_spl_kstat == NULL) {
+ rc = -EUNATCH;
+ goto out;
+ }
+out:
+ if (rc) {
+ remove_proc_entry("kstat", proc_spl);
+ remove_proc_entry("slab", proc_spl_kmem);
+ remove_proc_entry("kmem", proc_spl);
+ remove_proc_entry("taskq-all", proc_spl);
+ remove_proc_entry("taskq", proc_spl);
+ remove_proc_entry("spl", NULL);
+ unregister_sysctl_table(spl_header);
+ }
+
+ return (rc);
+}
+
+void
+spl_proc_fini(void)
+{
+ remove_proc_entry("kstat", proc_spl);
+ remove_proc_entry("slab", proc_spl_kmem);
+ remove_proc_entry("kmem", proc_spl);
+ remove_proc_entry("taskq-all", proc_spl);
+ remove_proc_entry("taskq", proc_spl);
+ remove_proc_entry("spl", NULL);
+
+ ASSERT(spl_header != NULL);
+ unregister_sysctl_table(spl_header);
+}
diff --git a/module/spl/spl-rwlock.c b/module/spl/spl-rwlock.c
new file mode 100644
index 000000000..9a992cc3a
--- /dev/null
+++ b/module/spl/spl-rwlock.c
@@ -0,0 +1,114 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <[email protected]>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Solaris Porting Layer (SPL) Reader/Writer Lock Implementation.
+ */
+
+#include <sys/rwlock.h>
+
+#if defined(CONFIG_PREEMPT_RT_FULL)
+
+#include <linux/rtmutex.h>
+#define RT_MUTEX_OWNER_MASKALL 1UL
+
+static int
+__rwsem_tryupgrade(struct rw_semaphore *rwsem)
+{
+
+ ASSERT((struct task_struct *)
+ ((unsigned long)rwsem->lock.owner & ~RT_MUTEX_OWNER_MASKALL) ==
+ current);
+
+ /*
+ * Under the realtime patch series, rwsem is implemented as a
+ * single mutex held by readers and writers alike. However,
+ * this implementation would prevent a thread from taking a
+ * read lock twice, as the mutex would already be locked on
+	 * the second attempt. Therefore the implementation allows a
+	 * single thread to take an rwsem as a read lock multiple times,
+	 * tracking that nesting with the read_depth counter.
+ */
+ if (rwsem->read_depth <= 1) {
+ /*
+		 * If the current thread has not taken the lock
+		 * more than once as a read lock, we can allow an
+ * upgrade to a write lock. rwsem_rt.h implements
+ * write locks as read_depth == 0.
+ */
+ rwsem->read_depth = 0;
+ return (1);
+ }
+ return (0);
+}
+#elif defined(CONFIG_RWSEM_GENERIC_SPINLOCK)
+static int
+__rwsem_tryupgrade(struct rw_semaphore *rwsem)
+{
+ int ret = 0;
+ unsigned long flags;
+ spl_rwsem_lock_irqsave(&rwsem->wait_lock, flags);
+ if (RWSEM_COUNT(rwsem) == SPL_RWSEM_SINGLE_READER_VALUE &&
+ list_empty(&rwsem->wait_list)) {
+ ret = 1;
+ RWSEM_COUNT(rwsem) = SPL_RWSEM_SINGLE_WRITER_VALUE;
+ }
+ spl_rwsem_unlock_irqrestore(&rwsem->wait_lock, flags);
+ return (ret);
+}
+#elif defined(HAVE_RWSEM_ATOMIC_LONG_COUNT)
+static int
+__rwsem_tryupgrade(struct rw_semaphore *rwsem)
+{
+ long val;
+ val = atomic_long_cmpxchg(&rwsem->count, SPL_RWSEM_SINGLE_READER_VALUE,
+ SPL_RWSEM_SINGLE_WRITER_VALUE);
+ return (val == SPL_RWSEM_SINGLE_READER_VALUE);
+}
+#else
+static int
+__rwsem_tryupgrade(struct rw_semaphore *rwsem)
+{
+ typeof(rwsem->count) val;
+ val = cmpxchg(&rwsem->count, SPL_RWSEM_SINGLE_READER_VALUE,
+ SPL_RWSEM_SINGLE_WRITER_VALUE);
+ return (val == SPL_RWSEM_SINGLE_READER_VALUE);
+}
+#endif
+
+int
+rwsem_tryupgrade(struct rw_semaphore *rwsem)
+{
+ if (__rwsem_tryupgrade(rwsem)) {
+ rwsem_release(&rwsem->dep_map, 1, _RET_IP_);
+ rwsem_acquire(&rwsem->dep_map, 0, 1, _RET_IP_);
+#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
+ rwsem->owner = current;
+#endif
+ return (1);
+ }
+ return (0);
+}
+EXPORT_SYMBOL(rwsem_tryupgrade);
+
+int spl_rw_init(void) { return 0; }
+void spl_rw_fini(void) { }
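rwsem_tryupgrade() is the only exported entry point here; the __rwsem_tryupgrade() variants merely cover the different kernel rwsem flavours. A hedged sketch of the intended calling pattern, assuming the Solaris-style rw_enter()/rw_tryupgrade()/rw_exit() wrappers from the SPL rwlock header map onto this helper; the surrounding logic is illustrative only:

/* Sketch: upgrade a read lock in place, falling back to drop-and-retake. */
static void
sketch_update(krwlock_t *lock, int *shared_state)
{
	rw_enter(lock, RW_READER);

	if (*shared_state != 0) {		/* decided we need to write */
		if (!rw_tryupgrade(lock)) {
			/* Upgrade raced with another holder: retake as writer. */
			rw_exit(lock);
			rw_enter(lock, RW_WRITER);
		}
		*shared_state = 0;
	}

	rw_exit(lock);
}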
diff --git a/module/spl/spl-taskq.c b/module/spl/spl-taskq.c
new file mode 100644
index 000000000..2919a942a
--- /dev/null
+++ b/module/spl/spl-taskq.c
@@ -0,0 +1,1305 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <[email protected]>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Solaris Porting Layer (SPL) Task Queue Implementation.
+ */
+
+#include <sys/taskq.h>
+#include <sys/kmem.h>
+#include <sys/tsd.h>
+
+int spl_taskq_thread_bind = 0;
+module_param(spl_taskq_thread_bind, int, 0644);
+MODULE_PARM_DESC(spl_taskq_thread_bind, "Bind taskq thread to CPU by default");
+
+int spl_taskq_thread_dynamic = 1;
+module_param(spl_taskq_thread_dynamic, int, 0644);
+MODULE_PARM_DESC(spl_taskq_thread_dynamic, "Allow dynamic taskq threads");
+
+int spl_taskq_thread_priority = 1;
+module_param(spl_taskq_thread_priority, int, 0644);
+MODULE_PARM_DESC(spl_taskq_thread_priority,
+ "Allow non-default priority for taskq threads");
+
+int spl_taskq_thread_sequential = 4;
+module_param(spl_taskq_thread_sequential, int, 0644);
+MODULE_PARM_DESC(spl_taskq_thread_sequential,
+ "Create new taskq threads after N sequential tasks");
+
+/* Global system-wide dynamic task queue available for all consumers */
+taskq_t *system_taskq;
+EXPORT_SYMBOL(system_taskq);
+/* Global dynamic task queue for long delay */
+taskq_t *system_delay_taskq;
+EXPORT_SYMBOL(system_delay_taskq);
+
+/* Private dedicated taskq for creating new taskq threads on demand. */
+static taskq_t *dynamic_taskq;
+static taskq_thread_t *taskq_thread_create(taskq_t *);
+
+/* List of all taskqs */
+LIST_HEAD(tq_list);
+DECLARE_RWSEM(tq_list_sem);
+static uint_t taskq_tsd;
+
+static int
+task_km_flags(uint_t flags)
+{
+ if (flags & TQ_NOSLEEP)
+ return (KM_NOSLEEP);
+
+ if (flags & TQ_PUSHPAGE)
+ return (KM_PUSHPAGE);
+
+ return (KM_SLEEP);
+}
+
+/*
+ * taskq_find_by_name - Find the largest instance number of a named taskq.
+ */
+static int
+taskq_find_by_name(const char *name)
+{
+ struct list_head *tql;
+ taskq_t *tq;
+
+ list_for_each_prev(tql, &tq_list) {
+ tq = list_entry(tql, taskq_t, tq_taskqs);
+ if (strcmp(name, tq->tq_name) == 0)
+ return (tq->tq_instance);
+ }
+ return (-1);
+}
+
+/*
+ * NOTE: Must be called with tq->tq_lock held; returns a taskq_ent_t which
+ * is not attached to the free, work, or pending taskq lists.
+ */
+static taskq_ent_t *
+task_alloc(taskq_t *tq, uint_t flags, unsigned long *irqflags)
+{
+ taskq_ent_t *t;
+ int count = 0;
+
+ ASSERT(tq);
+retry:
+ /* Acquire taskq_ent_t's from free list if available */
+ if (!list_empty(&tq->tq_free_list) && !(flags & TQ_NEW)) {
+ t = list_entry(tq->tq_free_list.next, taskq_ent_t, tqent_list);
+
+ ASSERT(!(t->tqent_flags & TQENT_FLAG_PREALLOC));
+ ASSERT(!(t->tqent_flags & TQENT_FLAG_CANCEL));
+ ASSERT(!timer_pending(&t->tqent_timer));
+
+ list_del_init(&t->tqent_list);
+ return (t);
+ }
+
+ /* Free list is empty and memory allocations are prohibited */
+ if (flags & TQ_NOALLOC)
+ return (NULL);
+
+ /* Hit maximum taskq_ent_t pool size */
+ if (tq->tq_nalloc >= tq->tq_maxalloc) {
+ if (flags & TQ_NOSLEEP)
+ return (NULL);
+
+ /*
+ * Sleep periodically polling the free list for an available
+ * taskq_ent_t. Dispatching with TQ_SLEEP should always succeed
+		 * but we cannot block forever waiting for a taskq_ent_t to
+ * show up in the free list, otherwise a deadlock can happen.
+ *
+ * Therefore, we need to allocate a new task even if the number
+ * of allocated tasks is above tq->tq_maxalloc, but we still
+ * end up delaying the task allocation by one second, thereby
+ * throttling the task dispatch rate.
+ */
+ spin_unlock_irqrestore(&tq->tq_lock, *irqflags);
+ schedule_timeout(HZ / 100);
+ spin_lock_irqsave_nested(&tq->tq_lock, *irqflags,
+ tq->tq_lock_class);
+ if (count < 100) {
+ count++;
+ goto retry;
+ }
+ }
+
+ spin_unlock_irqrestore(&tq->tq_lock, *irqflags);
+ t = kmem_alloc(sizeof (taskq_ent_t), task_km_flags(flags));
+ spin_lock_irqsave_nested(&tq->tq_lock, *irqflags, tq->tq_lock_class);
+
+ if (t) {
+ taskq_init_ent(t);
+ tq->tq_nalloc++;
+ }
+
+ return (t);
+}
+
+/*
+ * NOTE: Must be called with tq->tq_lock held, expects the taskq_ent_t
+ * to already be removed from the free, work, or pending taskq lists.
+ */
+static void
+task_free(taskq_t *tq, taskq_ent_t *t)
+{
+ ASSERT(tq);
+ ASSERT(t);
+ ASSERT(list_empty(&t->tqent_list));
+ ASSERT(!timer_pending(&t->tqent_timer));
+
+ kmem_free(t, sizeof (taskq_ent_t));
+ tq->tq_nalloc--;
+}
+
+/*
+ * NOTE: Must be called with tq->tq_lock held, either destroys the
+ * taskq_ent_t if too many exist or moves it to the free list for later use.
+ */
+static void
+task_done(taskq_t *tq, taskq_ent_t *t)
+{
+ ASSERT(tq);
+ ASSERT(t);
+
+ /* Wake tasks blocked in taskq_wait_id() */
+ wake_up_all(&t->tqent_waitq);
+
+ list_del_init(&t->tqent_list);
+
+ if (tq->tq_nalloc <= tq->tq_minalloc) {
+ t->tqent_id = TASKQID_INVALID;
+ t->tqent_func = NULL;
+ t->tqent_arg = NULL;
+ t->tqent_flags = 0;
+
+ list_add_tail(&t->tqent_list, &tq->tq_free_list);
+ } else {
+ task_free(tq, t);
+ }
+}
+
+/*
+ * When a delayed task timer expires remove it from the delay list and
+ * add it to the priority list in order for immediate processing.
+ */
+static void
+task_expire_impl(taskq_ent_t *t)
+{
+ taskq_ent_t *w;
+ taskq_t *tq = t->tqent_taskq;
+ struct list_head *l;
+ unsigned long flags;
+
+ spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class);
+
+ if (t->tqent_flags & TQENT_FLAG_CANCEL) {
+ ASSERT(list_empty(&t->tqent_list));
+ spin_unlock_irqrestore(&tq->tq_lock, flags);
+ return;
+ }
+
+ t->tqent_birth = jiffies;
+ /*
+ * The priority list must be maintained in strict task id order
+ * from lowest to highest for lowest_id to be easily calculable.
+ */
+ list_del(&t->tqent_list);
+ list_for_each_prev(l, &tq->tq_prio_list) {
+ w = list_entry(l, taskq_ent_t, tqent_list);
+ if (w->tqent_id < t->tqent_id) {
+ list_add(&t->tqent_list, l);
+ break;
+ }
+ }
+ if (l == &tq->tq_prio_list)
+ list_add(&t->tqent_list, &tq->tq_prio_list);
+
+ spin_unlock_irqrestore(&tq->tq_lock, flags);
+
+ wake_up(&tq->tq_work_waitq);
+}
+
+#ifdef HAVE_KERNEL_TIMER_FUNCTION_TIMER_LIST
+static void
+task_expire(struct timer_list *tl)
+{
+ taskq_ent_t *t = from_timer(t, tl, tqent_timer);
+ task_expire_impl(t);
+}
+#else
+static void
+task_expire(unsigned long data)
+{
+ task_expire_impl((taskq_ent_t *)data);
+}
+#endif
+
+/*
+ * Returns the lowest incomplete taskqid_t. The taskqid_t may
+ * be queued on the pending list, on the priority list, on the
+ * delay list, or on the work list currently being handled, but
+ * it is not 100% complete yet.
+ */
+static taskqid_t
+taskq_lowest_id(taskq_t *tq)
+{
+ taskqid_t lowest_id = tq->tq_next_id;
+ taskq_ent_t *t;
+ taskq_thread_t *tqt;
+
+ ASSERT(tq);
+
+ if (!list_empty(&tq->tq_pend_list)) {
+ t = list_entry(tq->tq_pend_list.next, taskq_ent_t, tqent_list);
+ lowest_id = MIN(lowest_id, t->tqent_id);
+ }
+
+ if (!list_empty(&tq->tq_prio_list)) {
+ t = list_entry(tq->tq_prio_list.next, taskq_ent_t, tqent_list);
+ lowest_id = MIN(lowest_id, t->tqent_id);
+ }
+
+ if (!list_empty(&tq->tq_delay_list)) {
+ t = list_entry(tq->tq_delay_list.next, taskq_ent_t, tqent_list);
+ lowest_id = MIN(lowest_id, t->tqent_id);
+ }
+
+ if (!list_empty(&tq->tq_active_list)) {
+ tqt = list_entry(tq->tq_active_list.next, taskq_thread_t,
+ tqt_active_list);
+ ASSERT(tqt->tqt_id != TASKQID_INVALID);
+ lowest_id = MIN(lowest_id, tqt->tqt_id);
+ }
+
+ return (lowest_id);
+}
+
+/*
+ * Insert a task into a list keeping the list sorted by increasing taskqid.
+ */
+static void
+taskq_insert_in_order(taskq_t *tq, taskq_thread_t *tqt)
+{
+ taskq_thread_t *w;
+ struct list_head *l;
+
+ ASSERT(tq);
+ ASSERT(tqt);
+
+ list_for_each_prev(l, &tq->tq_active_list) {
+ w = list_entry(l, taskq_thread_t, tqt_active_list);
+ if (w->tqt_id < tqt->tqt_id) {
+ list_add(&tqt->tqt_active_list, l);
+ break;
+ }
+ }
+ if (l == &tq->tq_active_list)
+ list_add(&tqt->tqt_active_list, &tq->tq_active_list);
+}
+
+/*
+ * Find and return a task from the given list if it exists. The list
+ * must be in lowest to highest task id order.
+ */
+static taskq_ent_t *
+taskq_find_list(taskq_t *tq, struct list_head *lh, taskqid_t id)
+{
+ struct list_head *l;
+ taskq_ent_t *t;
+
+ list_for_each(l, lh) {
+ t = list_entry(l, taskq_ent_t, tqent_list);
+
+ if (t->tqent_id == id)
+ return (t);
+
+ if (t->tqent_id > id)
+ break;
+ }
+
+ return (NULL);
+}
+
+/*
+ * Find an already dispatched task given the task id regardless of what
+ * state it is in. If a task is still pending it will be returned.
+ * If a task is executing, then -EBUSY will be returned instead.
+ * If the task has already been run then NULL is returned.
+ */
+static taskq_ent_t *
+taskq_find(taskq_t *tq, taskqid_t id)
+{
+ taskq_thread_t *tqt;
+ struct list_head *l;
+ taskq_ent_t *t;
+
+ t = taskq_find_list(tq, &tq->tq_delay_list, id);
+ if (t)
+ return (t);
+
+ t = taskq_find_list(tq, &tq->tq_prio_list, id);
+ if (t)
+ return (t);
+
+ t = taskq_find_list(tq, &tq->tq_pend_list, id);
+ if (t)
+ return (t);
+
+ list_for_each(l, &tq->tq_active_list) {
+ tqt = list_entry(l, taskq_thread_t, tqt_active_list);
+ if (tqt->tqt_id == id) {
+ /*
+			 * Instead of returning tqt_task, we just return a
+			 * non-NULL value to prevent misuse, since tqt_task only
+ * has two valid fields.
+ */
+ return (ERR_PTR(-EBUSY));
+ }
+ }
+
+ return (NULL);
+}
+
+/*
+ * Theory for the taskq_wait_id(), taskq_wait_outstanding(), and
+ * taskq_wait() functions below.
+ *
+ * Taskq waiting is accomplished by tracking the lowest outstanding task
+ * id and the next available task id. As tasks are dispatched they are
+ * added to the tail of the pending, priority, or delay lists. As worker
+ * threads become available the tasks are removed from the heads of these
+ * lists and linked to the worker threads. This ensures the lists are
+ * kept sorted by lowest to highest task id.
+ *
+ * Therefore the lowest outstanding task id can be quickly determined by
+ * checking the head item from all of these lists. This value is stored
+ * with the taskq as the lowest id. It only needs to be recalculated when
+ * either the task with the current lowest id completes or is canceled.
+ *
+ * By blocking until the lowest task id exceeds the passed task id the
+ * taskq_wait_outstanding() function can be easily implemented. Similarly,
+ * by blocking until the lowest task id matches the next task id taskq_wait()
+ * can be implemented.
+ *
+ * Callers should be aware that when there are multiple worker threads it
+ * is possible for larger task ids to complete before smaller ones. Also
+ * when the taskq contains delay tasks with small task ids callers may
+ * block for a considerable length of time waiting for them to expire and
+ * execute.
+ */
+static int
+taskq_wait_id_check(taskq_t *tq, taskqid_t id)
+{
+ int rc;
+ unsigned long flags;
+
+ spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class);
+ rc = (taskq_find(tq, id) == NULL);
+ spin_unlock_irqrestore(&tq->tq_lock, flags);
+
+ return (rc);
+}
+
+/*
+ * The taskq_wait_id() function blocks until the passed task id completes.
+ * This does not guarantee that all lower task ids have completed.
+ */
+void
+taskq_wait_id(taskq_t *tq, taskqid_t id)
+{
+ wait_event(tq->tq_wait_waitq, taskq_wait_id_check(tq, id));
+}
+EXPORT_SYMBOL(taskq_wait_id);
+
+static int
+taskq_wait_outstanding_check(taskq_t *tq, taskqid_t id)
+{
+ int rc;
+ unsigned long flags;
+
+ spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class);
+ rc = (id < tq->tq_lowest_id);
+ spin_unlock_irqrestore(&tq->tq_lock, flags);
+
+ return (rc);
+}
+
+/*
+ * The taskq_wait_outstanding() function will block until all tasks with a
+ * lower taskqid than the passed 'id' have been completed. Note that all
+ * task ids are assigned monotonically at dispatch time. Zero may be
+ * passed for the id to indicate that all tasks dispatched up to this point,
+ * but not after, should be waited for.
+ */
+void
+taskq_wait_outstanding(taskq_t *tq, taskqid_t id)
+{
+ id = id ? id : tq->tq_next_id - 1;
+ wait_event(tq->tq_wait_waitq, taskq_wait_outstanding_check(tq, id));
+}
+EXPORT_SYMBOL(taskq_wait_outstanding);
+
+static int
+taskq_wait_check(taskq_t *tq)
+{
+ int rc;
+ unsigned long flags;
+
+ spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class);
+ rc = (tq->tq_lowest_id == tq->tq_next_id);
+ spin_unlock_irqrestore(&tq->tq_lock, flags);
+
+ return (rc);
+}
+
+/*
+ * The taskq_wait() function will block until the taskq is empty.
+ * This means that if a taskq re-dispatches work to itself taskq_wait()
+ * callers will block indefinitely.
+ */
+void
+taskq_wait(taskq_t *tq)
+{
+ wait_event(tq->tq_wait_waitq, taskq_wait_check(tq));
+}
+EXPORT_SYMBOL(taskq_wait);
+
+int
+taskq_member(taskq_t *tq, kthread_t *t)
+{
+ return (tq == (taskq_t *)tsd_get_by_thread(taskq_tsd, t));
+}
+EXPORT_SYMBOL(taskq_member);
+
+/*
+ * Cancel an already dispatched task given the task id. Still pending tasks
+ * will be immediately canceled, and if the task is active the function will
+ * block until it completes. Preallocated tasks which are canceled must be
+ * freed by the caller.
+ */
+int
+taskq_cancel_id(taskq_t *tq, taskqid_t id)
+{
+ taskq_ent_t *t;
+ int rc = ENOENT;
+ unsigned long flags;
+
+ ASSERT(tq);
+
+ spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class);
+ t = taskq_find(tq, id);
+ if (t && t != ERR_PTR(-EBUSY)) {
+ list_del_init(&t->tqent_list);
+ t->tqent_flags |= TQENT_FLAG_CANCEL;
+
+ /*
+ * When canceling the lowest outstanding task id we
+ * must recalculate the new lowest outstanding id.
+ */
+ if (tq->tq_lowest_id == t->tqent_id) {
+ tq->tq_lowest_id = taskq_lowest_id(tq);
+ ASSERT3S(tq->tq_lowest_id, >, t->tqent_id);
+ }
+
+ /*
+		 * The task_expire() function takes the tq->tq_lock so drop
+		 * the lock before synchronously cancelling the timer.
+ */
+ if (timer_pending(&t->tqent_timer)) {
+ spin_unlock_irqrestore(&tq->tq_lock, flags);
+ del_timer_sync(&t->tqent_timer);
+ spin_lock_irqsave_nested(&tq->tq_lock, flags,
+ tq->tq_lock_class);
+ }
+
+ if (!(t->tqent_flags & TQENT_FLAG_PREALLOC))
+ task_done(tq, t);
+
+ rc = 0;
+ }
+ spin_unlock_irqrestore(&tq->tq_lock, flags);
+
+ if (t == ERR_PTR(-EBUSY)) {
+ taskq_wait_id(tq, id);
+ rc = EBUSY;
+ }
+
+ return (rc);
+}
+EXPORT_SYMBOL(taskq_cancel_id);
+
+static int taskq_thread_spawn(taskq_t *tq);
+
+taskqid_t
+taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t flags)
+{
+ taskq_ent_t *t;
+ taskqid_t rc = TASKQID_INVALID;
+ unsigned long irqflags;
+
+ ASSERT(tq);
+ ASSERT(func);
+
+ spin_lock_irqsave_nested(&tq->tq_lock, irqflags, tq->tq_lock_class);
+
+ /* Taskq being destroyed and all tasks drained */
+ if (!(tq->tq_flags & TASKQ_ACTIVE))
+ goto out;
+
+ /* Do not queue the task unless there is an idle thread for it */
+ ASSERT(tq->tq_nactive <= tq->tq_nthreads);
+ if ((flags & TQ_NOQUEUE) && (tq->tq_nactive == tq->tq_nthreads)) {
+ /* Dynamic taskq may be able to spawn another thread */
+ if (!(tq->tq_flags & TASKQ_DYNAMIC) ||
+ taskq_thread_spawn(tq) == 0)
+ goto out;
+ }
+
+ if ((t = task_alloc(tq, flags, &irqflags)) == NULL)
+ goto out;
+
+ spin_lock(&t->tqent_lock);
+
+ /* Queue to the front of the list to enforce TQ_NOQUEUE semantics */
+ if (flags & TQ_NOQUEUE)
+ list_add(&t->tqent_list, &tq->tq_prio_list);
+ /* Queue to the priority list instead of the pending list */
+ else if (flags & TQ_FRONT)
+ list_add_tail(&t->tqent_list, &tq->tq_prio_list);
+ else
+ list_add_tail(&t->tqent_list, &tq->tq_pend_list);
+
+ t->tqent_id = rc = tq->tq_next_id;
+ tq->tq_next_id++;
+ t->tqent_func = func;
+ t->tqent_arg = arg;
+ t->tqent_taskq = tq;
+#ifndef HAVE_KERNEL_TIMER_FUNCTION_TIMER_LIST
+ t->tqent_timer.data = 0;
+#endif
+ t->tqent_timer.function = NULL;
+ t->tqent_timer.expires = 0;
+ t->tqent_birth = jiffies;
+
+ ASSERT(!(t->tqent_flags & TQENT_FLAG_PREALLOC));
+
+ spin_unlock(&t->tqent_lock);
+
+ wake_up(&tq->tq_work_waitq);
+out:
+ /* Spawn additional taskq threads if required. */
+ if (!(flags & TQ_NOQUEUE) && tq->tq_nactive == tq->tq_nthreads)
+ (void) taskq_thread_spawn(tq);
+
+ spin_unlock_irqrestore(&tq->tq_lock, irqflags);
+ return (rc);
+}
+EXPORT_SYMBOL(taskq_dispatch);
+
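/*
 * Illustrative sketch (editorial, not part of the patch): dispatching a
 * task without sleeping and checking for failure. TASKQID_INVALID is
 * returned when the taskq is shutting down or, with TQ_NOSLEEP, when an
 * entry cannot be allocated. The worker function is an assumption.
 */
static void
example_dispatch_cb(void *arg)
{
        /* ... perform work ... */
}

static int
example_dispatch(taskq_t *tq, void *arg)
{
        taskqid_t id;

        id = taskq_dispatch(tq, example_dispatch_cb, arg, TQ_NOSLEEP);
        if (id == TASKQID_INVALID)
                return (ENOMEM);        /* caller may retry with TQ_SLEEP */

        return (0);
}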
+taskqid_t
+taskq_dispatch_delay(taskq_t *tq, task_func_t func, void *arg,
+ uint_t flags, clock_t expire_time)
+{
+ taskqid_t rc = TASKQID_INVALID;
+ taskq_ent_t *t;
+ unsigned long irqflags;
+
+ ASSERT(tq);
+ ASSERT(func);
+
+ spin_lock_irqsave_nested(&tq->tq_lock, irqflags, tq->tq_lock_class);
+
+ /* Taskq being destroyed and all tasks drained */
+ if (!(tq->tq_flags & TASKQ_ACTIVE))
+ goto out;
+
+ if ((t = task_alloc(tq, flags, &irqflags)) == NULL)
+ goto out;
+
+ spin_lock(&t->tqent_lock);
+
+ /* Queue to the delay list for subsequent execution */
+ list_add_tail(&t->tqent_list, &tq->tq_delay_list);
+
+ t->tqent_id = rc = tq->tq_next_id;
+ tq->tq_next_id++;
+ t->tqent_func = func;
+ t->tqent_arg = arg;
+ t->tqent_taskq = tq;
+#ifndef HAVE_KERNEL_TIMER_FUNCTION_TIMER_LIST
+ t->tqent_timer.data = (unsigned long)t;
+#endif
+ t->tqent_timer.function = task_expire;
+ t->tqent_timer.expires = (unsigned long)expire_time;
+ add_timer(&t->tqent_timer);
+
+ ASSERT(!(t->tqent_flags & TQENT_FLAG_PREALLOC));
+
+ spin_unlock(&t->tqent_lock);
+out:
+ /* Spawn additional taskq threads if required. */
+ if (tq->tq_nactive == tq->tq_nthreads)
+ (void) taskq_thread_spawn(tq);
+ spin_unlock_irqrestore(&tq->tq_lock, irqflags);
+ return (rc);
+}
+EXPORT_SYMBOL(taskq_dispatch_delay);
+
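/*
 * Illustrative sketch (editorial, not part of the patch): scheduling a
 * task to run roughly five seconds from now. The expire_time argument is
 * an absolute time in jiffies since it is stored directly into the
 * entry's timer above.
 */
static taskqid_t
example_dispatch_in_5s(taskq_t *tq, task_func_t func, void *arg)
{
        return (taskq_dispatch_delay(tq, func, arg, TQ_SLEEP,
            jiffies + 5 * HZ));
}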
+void
+taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint_t flags,
+ taskq_ent_t *t)
+{
+ unsigned long irqflags;
+ ASSERT(tq);
+ ASSERT(func);
+
+ spin_lock_irqsave_nested(&tq->tq_lock, irqflags,
+ tq->tq_lock_class);
+
+ /* Taskq being destroyed and all tasks drained */
+ if (!(tq->tq_flags & TASKQ_ACTIVE)) {
+ t->tqent_id = TASKQID_INVALID;
+ goto out;
+ }
+
+ if ((flags & TQ_NOQUEUE) && (tq->tq_nactive == tq->tq_nthreads)) {
+ /* Dynamic taskq may be able to spawn another thread */
+ if (!(tq->tq_flags & TASKQ_DYNAMIC) ||
+ taskq_thread_spawn(tq) == 0)
+ goto out2;
+ flags |= TQ_FRONT;
+ }
+
+ spin_lock(&t->tqent_lock);
+
+ /*
+ * Make sure the entry is not on some other taskq; it is important to
+ * ASSERT() under lock
+ */
+ ASSERT(taskq_empty_ent(t));
+
+ /*
+ * Mark it as a prealloc'd task. This is important
+ * to ensure that we don't free it later.
+ */
+ t->tqent_flags |= TQENT_FLAG_PREALLOC;
+
+ /* Queue to the priority list instead of the pending list */
+ if (flags & TQ_FRONT)
+ list_add_tail(&t->tqent_list, &tq->tq_prio_list);
+ else
+ list_add_tail(&t->tqent_list, &tq->tq_pend_list);
+
+ t->tqent_id = tq->tq_next_id;
+ tq->tq_next_id++;
+ t->tqent_func = func;
+ t->tqent_arg = arg;
+ t->tqent_taskq = tq;
+ t->tqent_birth = jiffies;
+
+ spin_unlock(&t->tqent_lock);
+
+ wake_up(&tq->tq_work_waitq);
+out:
+ /* Spawn additional taskq threads if required. */
+ if (tq->tq_nactive == tq->tq_nthreads)
+ (void) taskq_thread_spawn(tq);
+out2:
+ spin_unlock_irqrestore(&tq->tq_lock, irqflags);
+}
+EXPORT_SYMBOL(taskq_dispatch_ent);
+
+int
+taskq_empty_ent(taskq_ent_t *t)
+{
+ return (list_empty(&t->tqent_list));
+}
+EXPORT_SYMBOL(taskq_empty_ent);
+
+void
+taskq_init_ent(taskq_ent_t *t)
+{
+ spin_lock_init(&t->tqent_lock);
+ init_waitqueue_head(&t->tqent_waitq);
+#ifdef HAVE_KERNEL_TIMER_FUNCTION_TIMER_LIST
+ timer_setup(&t->tqent_timer, NULL, 0);
+#else
+ init_timer(&t->tqent_timer);
+#endif
+ INIT_LIST_HEAD(&t->tqent_list);
+ t->tqent_id = 0;
+ t->tqent_func = NULL;
+ t->tqent_arg = NULL;
+ t->tqent_flags = 0;
+ t->tqent_taskq = NULL;
+}
+EXPORT_SYMBOL(taskq_init_ent);
+
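/*
 * Illustrative sketch (editorial, not part of the patch): dispatching with
 * a caller-provided, preallocated entry. The taskq_ent_t is typically
 * embedded in a longer lived structure and initialized with
 * taskq_init_ent(); the surrounding structure and worker are assumptions.
 */
struct example_job {
        taskq_ent_t     ej_ent;
        int             ej_data;
};

static void
example_job_cb(void *arg)
{
        struct example_job *job = arg;

        job->ej_data++;         /* ... process the job ... */
}

static void
example_dispatch_ent(taskq_t *tq, struct example_job *job)
{
        taskq_init_ent(&job->ej_ent);
        taskq_dispatch_ent(tq, example_job_cb, job, TQ_SLEEP, &job->ej_ent);
}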
+/*
+ * Return the next pending task, preference is given to tasks on the
+ * priority list which were dispatched with TQ_FRONT.
+ */
+static taskq_ent_t *
+taskq_next_ent(taskq_t *tq)
+{
+ struct list_head *list;
+
+ if (!list_empty(&tq->tq_prio_list))
+ list = &tq->tq_prio_list;
+ else if (!list_empty(&tq->tq_pend_list))
+ list = &tq->tq_pend_list;
+ else
+ return (NULL);
+
+ return (list_entry(list->next, taskq_ent_t, tqent_list));
+}
+
+/*
+ * Spawns a new thread for the specified taskq.
+ */
+static void
+taskq_thread_spawn_task(void *arg)
+{
+ taskq_t *tq = (taskq_t *)arg;
+ unsigned long flags;
+
+ if (taskq_thread_create(tq) == NULL) {
+ /* restore spawning count if failed */
+ spin_lock_irqsave_nested(&tq->tq_lock, flags,
+ tq->tq_lock_class);
+ tq->tq_nspawn--;
+ spin_unlock_irqrestore(&tq->tq_lock, flags);
+ }
+}
+
+/*
+ * Spawn additional threads for dynamic taskqs (TASKQ_DYNAMIC) when the
+ * current number of threads is insufficient to handle the pending tasks.
+ * These new threads must be created by the dedicated dynamic_taskq to avoid
+ * deadlocks between thread creation and memory reclaim. The system_taskq
+ * which is also a dynamic taskq cannot be safely used for this.
+ */
+static int
+taskq_thread_spawn(taskq_t *tq)
+{
+ int spawning = 0;
+
+ if (!(tq->tq_flags & TASKQ_DYNAMIC))
+ return (0);
+
+ if ((tq->tq_nthreads + tq->tq_nspawn < tq->tq_maxthreads) &&
+ (tq->tq_flags & TASKQ_ACTIVE)) {
+ spawning = (++tq->tq_nspawn);
+ taskq_dispatch(dynamic_taskq, taskq_thread_spawn_task,
+ tq, TQ_NOSLEEP);
+ }
+
+ return (spawning);
+}
+
+/*
+ * Threads in a dynamic taskq should only exit once it has been completely
+ * drained and no other threads are actively servicing tasks. This prevents
+ * threads from being created and destroyed more than is required.
+ *
+ * The first thread in the thread list is treated as the primary thread.
+ * There is nothing special about the primary thread, but in order to keep
+ * all the taskq pids from changing we opt to make it long running.
+ */
+static int
+taskq_thread_should_stop(taskq_t *tq, taskq_thread_t *tqt)
+{
+ if (!(tq->tq_flags & TASKQ_DYNAMIC))
+ return (0);
+
+ if (list_first_entry(&(tq->tq_thread_list), taskq_thread_t,
+ tqt_thread_list) == tqt)
+ return (0);
+
+ return
+ ((tq->tq_nspawn == 0) && /* No threads are being spawned */
+ (tq->tq_nactive == 0) && /* No threads are handling tasks */
+ (tq->tq_nthreads > 1) && /* More than 1 thread is running */
+ (!taskq_next_ent(tq)) && /* There are no pending tasks */
+ (spl_taskq_thread_dynamic)); /* Dynamic taskqs are allowed */
+}
+
+static int
+taskq_thread(void *args)
+{
+ DECLARE_WAITQUEUE(wait, current);
+ sigset_t blocked;
+ taskq_thread_t *tqt = args;
+ taskq_t *tq;
+ taskq_ent_t *t;
+ int seq_tasks = 0;
+ unsigned long flags;
+ taskq_ent_t dup_task = {};
+
+ ASSERT(tqt);
+ ASSERT(tqt->tqt_tq);
+ tq = tqt->tqt_tq;
+ current->flags |= PF_NOFREEZE;
+
+ (void) spl_fstrans_mark();
+
+ sigfillset(&blocked);
+ sigprocmask(SIG_BLOCK, &blocked, NULL);
+ flush_signals(current);
+
+ tsd_set(taskq_tsd, tq);
+ spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class);
+ /*
+ * If we are dynamically spawned, decrease spawning count. Note that
+ * we could be created during taskq_create, in which case we shouldn't
+ * do the decrement. But it's fine because taskq_create will reset
+ * tq_nspawn later.
+ */
+ if (tq->tq_flags & TASKQ_DYNAMIC)
+ tq->tq_nspawn--;
+
+ /* Immediately exit if more threads than allowed were created. */
+ if (tq->tq_nthreads >= tq->tq_maxthreads)
+ goto error;
+
+ tq->tq_nthreads++;
+ list_add_tail(&tqt->tqt_thread_list, &tq->tq_thread_list);
+ wake_up(&tq->tq_wait_waitq);
+ set_current_state(TASK_INTERRUPTIBLE);
+
+ while (!kthread_should_stop()) {
+
+ if (list_empty(&tq->tq_pend_list) &&
+ list_empty(&tq->tq_prio_list)) {
+
+ if (taskq_thread_should_stop(tq, tqt)) {
+ wake_up_all(&tq->tq_wait_waitq);
+ break;
+ }
+
+ add_wait_queue_exclusive(&tq->tq_work_waitq, &wait);
+ spin_unlock_irqrestore(&tq->tq_lock, flags);
+
+ schedule();
+ seq_tasks = 0;
+
+ spin_lock_irqsave_nested(&tq->tq_lock, flags,
+ tq->tq_lock_class);
+ remove_wait_queue(&tq->tq_work_waitq, &wait);
+ } else {
+ __set_current_state(TASK_RUNNING);
+ }
+
+ if ((t = taskq_next_ent(tq)) != NULL) {
+ list_del_init(&t->tqent_list);
+
+ /*
+ * A TQENT_FLAG_PREALLOC task may be reused or freed
+ * during the task function call. Store tqent_id and
+ * tqent_flags here.
+ *
+ * Also use an on stack taskq_ent_t for tqt_task
+ * assignment in this case. We only populate the two
+ * fields used by its only consumer, the taskq proc file.
+ */
+ tqt->tqt_id = t->tqent_id;
+ tqt->tqt_flags = t->tqent_flags;
+
+ if (t->tqent_flags & TQENT_FLAG_PREALLOC) {
+ dup_task.tqent_func = t->tqent_func;
+ dup_task.tqent_arg = t->tqent_arg;
+ t = &dup_task;
+ }
+ tqt->tqt_task = t;
+
+ taskq_insert_in_order(tq, tqt);
+ tq->tq_nactive++;
+ spin_unlock_irqrestore(&tq->tq_lock, flags);
+
+ /* Perform the requested task */
+ t->tqent_func(t->tqent_arg);
+
+ spin_lock_irqsave_nested(&tq->tq_lock, flags,
+ tq->tq_lock_class);
+ tq->tq_nactive--;
+ list_del_init(&tqt->tqt_active_list);
+ tqt->tqt_task = NULL;
+
+ /* For prealloc'd tasks, we don't free anything. */
+ if (!(tqt->tqt_flags & TQENT_FLAG_PREALLOC))
+ task_done(tq, t);
+
+ /*
+ * When the current lowest outstanding taskqid is
+ * done, calculate the new lowest outstanding id.
+ */
+ if (tq->tq_lowest_id == tqt->tqt_id) {
+ tq->tq_lowest_id = taskq_lowest_id(tq);
+ ASSERT3S(tq->tq_lowest_id, >, tqt->tqt_id);
+ }
+
+ /* Spawn additional taskq threads if required. */
+ if ((++seq_tasks) > spl_taskq_thread_sequential &&
+ taskq_thread_spawn(tq))
+ seq_tasks = 0;
+
+ tqt->tqt_id = TASKQID_INVALID;
+ tqt->tqt_flags = 0;
+ wake_up_all(&tq->tq_wait_waitq);
+ } else {
+ if (taskq_thread_should_stop(tq, tqt))
+ break;
+ }
+
+ set_current_state(TASK_INTERRUPTIBLE);
+
+ }
+
+ __set_current_state(TASK_RUNNING);
+ tq->tq_nthreads--;
+ list_del_init(&tqt->tqt_thread_list);
+error:
+ kmem_free(tqt, sizeof (taskq_thread_t));
+ spin_unlock_irqrestore(&tq->tq_lock, flags);
+
+ tsd_set(taskq_tsd, NULL);
+
+ return (0);
+}
+
+static taskq_thread_t *
+taskq_thread_create(taskq_t *tq)
+{
+ static int last_used_cpu = 0;
+ taskq_thread_t *tqt;
+
+ tqt = kmem_alloc(sizeof (*tqt), KM_PUSHPAGE);
+ INIT_LIST_HEAD(&tqt->tqt_thread_list);
+ INIT_LIST_HEAD(&tqt->tqt_active_list);
+ tqt->tqt_tq = tq;
+ tqt->tqt_id = TASKQID_INVALID;
+
+ tqt->tqt_thread = spl_kthread_create(taskq_thread, tqt,
+ "%s", tq->tq_name);
+ if (tqt->tqt_thread == NULL) {
+ kmem_free(tqt, sizeof (taskq_thread_t));
+ return (NULL);
+ }
+
+ if (spl_taskq_thread_bind) {
+ last_used_cpu = (last_used_cpu + 1) % num_online_cpus();
+ kthread_bind(tqt->tqt_thread, last_used_cpu);
+ }
+
+ if (spl_taskq_thread_priority)
+ set_user_nice(tqt->tqt_thread, PRIO_TO_NICE(tq->tq_pri));
+
+ wake_up_process(tqt->tqt_thread);
+
+ return (tqt);
+}
+
+taskq_t *
+taskq_create(const char *name, int nthreads, pri_t pri,
+ int minalloc, int maxalloc, uint_t flags)
+{
+ taskq_t *tq;
+ taskq_thread_t *tqt;
+ int count = 0, rc = 0, i;
+ unsigned long irqflags;
+
+ ASSERT(name != NULL);
+ ASSERT(minalloc >= 0);
+ ASSERT(maxalloc <= INT_MAX);
+ ASSERT(!(flags & (TASKQ_CPR_SAFE))); /* Unsupported */
+
+ /* Scale the number of threads using nthreads as a percentage */
+ if (flags & TASKQ_THREADS_CPU_PCT) {
+ ASSERT(nthreads <= 100);
+ ASSERT(nthreads >= 0);
+ nthreads = MIN(nthreads, 100);
+ nthreads = MAX(nthreads, 0);
+ nthreads = MAX((num_online_cpus() * nthreads) / 100, 1);
+ }
+
+ tq = kmem_alloc(sizeof (*tq), KM_PUSHPAGE);
+ if (tq == NULL)
+ return (NULL);
+
+ spin_lock_init(&tq->tq_lock);
+ INIT_LIST_HEAD(&tq->tq_thread_list);
+ INIT_LIST_HEAD(&tq->tq_active_list);
+ tq->tq_name = strdup(name);
+ tq->tq_nactive = 0;
+ tq->tq_nthreads = 0;
+ tq->tq_nspawn = 0;
+ tq->tq_maxthreads = nthreads;
+ tq->tq_pri = pri;
+ tq->tq_minalloc = minalloc;
+ tq->tq_maxalloc = maxalloc;
+ tq->tq_nalloc = 0;
+ tq->tq_flags = (flags | TASKQ_ACTIVE);
+ tq->tq_next_id = TASKQID_INITIAL;
+ tq->tq_lowest_id = TASKQID_INITIAL;
+ INIT_LIST_HEAD(&tq->tq_free_list);
+ INIT_LIST_HEAD(&tq->tq_pend_list);
+ INIT_LIST_HEAD(&tq->tq_prio_list);
+ INIT_LIST_HEAD(&tq->tq_delay_list);
+ init_waitqueue_head(&tq->tq_work_waitq);
+ init_waitqueue_head(&tq->tq_wait_waitq);
+ tq->tq_lock_class = TQ_LOCK_GENERAL;
+ INIT_LIST_HEAD(&tq->tq_taskqs);
+
+ if (flags & TASKQ_PREPOPULATE) {
+ spin_lock_irqsave_nested(&tq->tq_lock, irqflags,
+ tq->tq_lock_class);
+
+ for (i = 0; i < minalloc; i++)
+ task_done(tq, task_alloc(tq, TQ_PUSHPAGE | TQ_NEW,
+ &irqflags));
+
+ spin_unlock_irqrestore(&tq->tq_lock, irqflags);
+ }
+
+ if ((flags & TASKQ_DYNAMIC) && spl_taskq_thread_dynamic)
+ nthreads = 1;
+
+ for (i = 0; i < nthreads; i++) {
+ tqt = taskq_thread_create(tq);
+ if (tqt == NULL)
+ rc = 1;
+ else
+ count++;
+ }
+
+ /* Wait for all threads to be started before potential destroy */
+ wait_event(tq->tq_wait_waitq, tq->tq_nthreads == count);
+ /*
+ * The taskq_thread threads might have touched tq_nspawn, but they should
+ * not have because they are not dynamically spawned. So we reset it to 0.
+ */
+ tq->tq_nspawn = 0;
+
+ if (rc) {
+ taskq_destroy(tq);
+ tq = NULL;
+ } else {
+ down_write(&tq_list_sem);
+ tq->tq_instance = taskq_find_by_name(name) + 1;
+ list_add_tail(&tq->tq_taskqs, &tq_list);
+ up_write(&tq_list_sem);
+ }
+
+ return (tq);
+}
+EXPORT_SYMBOL(taskq_create);
+
+void
+taskq_destroy(taskq_t *tq)
+{
+ struct task_struct *thread;
+ taskq_thread_t *tqt;
+ taskq_ent_t *t;
+ unsigned long flags;
+
+ ASSERT(tq);
+ spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class);
+ tq->tq_flags &= ~TASKQ_ACTIVE;
+ spin_unlock_irqrestore(&tq->tq_lock, flags);
+
+ /*
+ * When TASKQ_ACTIVE is clear new tasks may not be added nor may
+ * new worker threads be spawned for a dynamic taskq.
+ */
+ if (dynamic_taskq != NULL)
+ taskq_wait_outstanding(dynamic_taskq, 0);
+
+ taskq_wait(tq);
+
+ /* remove taskq from global list used by the kstats */
+ down_write(&tq_list_sem);
+ list_del(&tq->tq_taskqs);
+ up_write(&tq_list_sem);
+
+ spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class);
+ /* wait for spawning threads to insert themselves into the list */
+ while (tq->tq_nspawn) {
+ spin_unlock_irqrestore(&tq->tq_lock, flags);
+ schedule_timeout_interruptible(1);
+ spin_lock_irqsave_nested(&tq->tq_lock, flags,
+ tq->tq_lock_class);
+ }
+
+ /*
+ * Signal each thread to exit and block until it does. Each thread
+ * is responsible for removing itself from the list and freeing its
+ * taskq_thread_t. This allows for idle threads to opt to remove
+ * themselves from the taskq. They can be recreated as needed.
+ */
+ while (!list_empty(&tq->tq_thread_list)) {
+ tqt = list_entry(tq->tq_thread_list.next,
+ taskq_thread_t, tqt_thread_list);
+ thread = tqt->tqt_thread;
+ spin_unlock_irqrestore(&tq->tq_lock, flags);
+
+ kthread_stop(thread);
+
+ spin_lock_irqsave_nested(&tq->tq_lock, flags,
+ tq->tq_lock_class);
+ }
+
+ while (!list_empty(&tq->tq_free_list)) {
+ t = list_entry(tq->tq_free_list.next, taskq_ent_t, tqent_list);
+
+ ASSERT(!(t->tqent_flags & TQENT_FLAG_PREALLOC));
+
+ list_del_init(&t->tqent_list);
+ task_free(tq, t);
+ }
+
+ ASSERT0(tq->tq_nthreads);
+ ASSERT0(tq->tq_nalloc);
+ ASSERT0(tq->tq_nspawn);
+ ASSERT(list_empty(&tq->tq_thread_list));
+ ASSERT(list_empty(&tq->tq_active_list));
+ ASSERT(list_empty(&tq->tq_free_list));
+ ASSERT(list_empty(&tq->tq_pend_list));
+ ASSERT(list_empty(&tq->tq_prio_list));
+ ASSERT(list_empty(&tq->tq_delay_list));
+
+ spin_unlock_irqrestore(&tq->tq_lock, flags);
+
+ strfree(tq->tq_name);
+ kmem_free(tq, sizeof (taskq_t));
+}
+EXPORT_SYMBOL(taskq_destroy);
+
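/*
 * Illustrative sketch (editorial, not part of the patch): creating a
 * dynamic taskq sized at 75% of the online CPUs and tearing it down.
 * The taskq name and the variable holding it are placeholders.
 */
static taskq_t *example_tq;

static int
example_setup(void)
{
        example_tq = taskq_create("example_taskq", 75, maxclsyspri,
            0, INT_MAX, TASKQ_THREADS_CPU_PCT | TASKQ_DYNAMIC);

        return (example_tq == NULL ? ENOMEM : 0);
}

static void
example_teardown(void)
{
        taskq_destroy(example_tq);      /* waits for all dispatched tasks */
}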
+
+static unsigned int spl_taskq_kick = 0;
+
+/*
+ * 2.6.36 API Change
+ * module_param_cb is introduced to take kernel_param_ops and
+ * module_param_call is marked as obsolete. Also set and get operations
+ * were changed to take a 'const struct kernel_param *'.
+ */
+static int
+#ifdef module_param_cb
+param_set_taskq_kick(const char *val, const struct kernel_param *kp)
+#else
+param_set_taskq_kick(const char *val, struct kernel_param *kp)
+#endif
+{
+ int ret;
+ taskq_t *tq;
+ taskq_ent_t *t;
+ unsigned long flags;
+
+ ret = param_set_uint(val, kp);
+ if (ret < 0 || !spl_taskq_kick)
+ return (ret);
+ /* reset value */
+ spl_taskq_kick = 0;
+
+ down_read(&tq_list_sem);
+ list_for_each_entry(tq, &tq_list, tq_taskqs) {
+ spin_lock_irqsave_nested(&tq->tq_lock, flags,
+ tq->tq_lock_class);
+ /* Check if the first pending is older than 5 seconds */
+ t = taskq_next_ent(tq);
+ if (t && time_after(jiffies, t->tqent_birth + 5*HZ)) {
+ (void) taskq_thread_spawn(tq);
+ printk(KERN_INFO "spl: Kicked taskq %s/%d\n",
+ tq->tq_name, tq->tq_instance);
+ }
+ spin_unlock_irqrestore(&tq->tq_lock, flags);
+ }
+ up_read(&tq_list_sem);
+ return (ret);
+}
+
+#ifdef module_param_cb
+static const struct kernel_param_ops param_ops_taskq_kick = {
+ .set = param_set_taskq_kick,
+ .get = param_get_uint,
+};
+module_param_cb(spl_taskq_kick, &param_ops_taskq_kick, &spl_taskq_kick, 0644);
+#else
+module_param_call(spl_taskq_kick, param_set_taskq_kick, param_get_uint,
+ &spl_taskq_kick, 0644);
+#endif
+MODULE_PARM_DESC(spl_taskq_kick,
+ "Write nonzero to kick stuck taskqs to spawn more threads");
+
+int
+spl_taskq_init(void)
+{
+ tsd_create(&taskq_tsd, NULL);
+
+ system_taskq = taskq_create("spl_system_taskq", MAX(boot_ncpus, 64),
+ maxclsyspri, boot_ncpus, INT_MAX, TASKQ_PREPOPULATE|TASKQ_DYNAMIC);
+ if (system_taskq == NULL)
+ return (1);
+
+ system_delay_taskq = taskq_create("spl_delay_taskq", MAX(boot_ncpus, 4),
+ maxclsyspri, boot_ncpus, INT_MAX, TASKQ_PREPOPULATE|TASKQ_DYNAMIC);
+ if (system_delay_taskq == NULL) {
+ taskq_destroy(system_taskq);
+ return (1);
+ }
+
+ dynamic_taskq = taskq_create("spl_dynamic_taskq", 1,
+ maxclsyspri, boot_ncpus, INT_MAX, TASKQ_PREPOPULATE);
+ if (dynamic_taskq == NULL) {
+ taskq_destroy(system_taskq);
+ taskq_destroy(system_delay_taskq);
+ return (1);
+ }
+
+ /*
+ * This is used to annotate tq_lock, so
+ * taskq_dispatch -> taskq_thread_spawn -> taskq_dispatch
+ * does not trigger a lockdep warning re: possible recursive locking
+ */
+ dynamic_taskq->tq_lock_class = TQ_LOCK_DYNAMIC;
+
+ return (0);
+}
+
+void
+spl_taskq_fini(void)
+{
+ taskq_destroy(dynamic_taskq);
+ dynamic_taskq = NULL;
+
+ taskq_destroy(system_delay_taskq);
+ system_delay_taskq = NULL;
+
+ taskq_destroy(system_taskq);
+ system_taskq = NULL;
+
+ tsd_destroy(&taskq_tsd);
+}
diff --git a/module/spl/spl-thread.c b/module/spl/spl-thread.c
new file mode 100644
index 000000000..d441ad65f
--- /dev/null
+++ b/module/spl/spl-thread.c
@@ -0,0 +1,160 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <[email protected]>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Solaris Porting Layer (SPL) Thread Implementation.
+ */
+
+#include <sys/thread.h>
+#include <sys/kmem.h>
+#include <sys/tsd.h>
+
+/*
+ * Thread interfaces
+ */
+typedef struct thread_priv_s {
+ unsigned long tp_magic; /* Magic */
+ int tp_name_size; /* Name size */
+ char *tp_name; /* Name (without _thread suffix) */
+ void (*tp_func)(void *); /* Registered function */
+ void *tp_args; /* Args to be passed to function */
+ size_t tp_len; /* Len to be passed to function */
+ int tp_state; /* State to start thread at */
+ pri_t tp_pri; /* Priority to start thread at */
+} thread_priv_t;
+
+static int
+thread_generic_wrapper(void *arg)
+{
+ thread_priv_t *tp = (thread_priv_t *)arg;
+ void (*func)(void *);
+ void *args;
+
+ ASSERT(tp->tp_magic == TP_MAGIC);
+ func = tp->tp_func;
+ args = tp->tp_args;
+ set_current_state(tp->tp_state);
+ set_user_nice((kthread_t *)current, PRIO_TO_NICE(tp->tp_pri));
+ kmem_free(tp->tp_name, tp->tp_name_size);
+ kmem_free(tp, sizeof (thread_priv_t));
+
+ if (func)
+ func(args);
+
+ return (0);
+}
+
+void
+__thread_exit(void)
+{
+ tsd_exit();
+ complete_and_exit(NULL, 0);
+ /* Unreachable */
+}
+EXPORT_SYMBOL(__thread_exit);
+
+/*
+ * thread_create() may block forever if it cannot create a thread or
+ * allocate memory. This is preferable to returning a NULL which Solaris
+ * style callers likely never check for... since it can't fail.
+ */
+kthread_t *
+__thread_create(caddr_t stk, size_t stksize, thread_func_t func,
+ const char *name, void *args, size_t len, proc_t *pp, int state, pri_t pri)
+{
+ thread_priv_t *tp;
+ struct task_struct *tsk;
+ char *p;
+
+ /* Option pp is simply ignored */
+ /* Variable stack size unsupported */
+ ASSERT(stk == NULL);
+
+ tp = kmem_alloc(sizeof (thread_priv_t), KM_PUSHPAGE);
+ if (tp == NULL)
+ return (NULL);
+
+ tp->tp_magic = TP_MAGIC;
+ tp->tp_name_size = strlen(name) + 1;
+
+ tp->tp_name = kmem_alloc(tp->tp_name_size, KM_PUSHPAGE);
+ if (tp->tp_name == NULL) {
+ kmem_free(tp, sizeof (thread_priv_t));
+ return (NULL);
+ }
+
+ strncpy(tp->tp_name, name, tp->tp_name_size);
+
+ /*
+ * Strip the trailing "_thread" from the passed name, which will be the
+ * function name since the exposed API has no parameter for passing a name.
+ */
+ p = strstr(tp->tp_name, "_thread");
+ if (p)
+ p[0] = '\0';
+
+ tp->tp_func = func;
+ tp->tp_args = args;
+ tp->tp_len = len;
+ tp->tp_state = state;
+ tp->tp_pri = pri;
+
+ tsk = spl_kthread_create(thread_generic_wrapper, (void *)tp,
+ "%s", tp->tp_name);
+ if (IS_ERR(tsk))
+ return (NULL);
+
+ wake_up_process(tsk);
+ return ((kthread_t *)tsk);
+}
+EXPORT_SYMBOL(__thread_create);
+
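/*
 * Illustrative sketch (editorial, not part of the patch): creating a
 * Solaris-style thread through __thread_create(). sys/thread.h is
 * expected to provide thread_create()/thread_exit() wrappers; TS_RUN
 * and maxclsyspri are assumed to come from the SPL headers.
 */
static void
example_thread_func(void *arg)
{
        /* ... perform work ... */
        __thread_exit();
}

static kthread_t *
example_spawn_thread(void *arg)
{
        /* The trailing "_thread" suffix is stripped from the kthread name. */
        return (__thread_create(NULL, 0, example_thread_func,
            "example_thread", arg, 0, NULL, TS_RUN, maxclsyspri));
}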
+/*
+ * spl_kthread_create - Wrapper providing pre-3.13 semantics for
+ * kthread_create() in which it is not killable and less likely
+ * to return -ENOMEM.
+ */
+struct task_struct *
+spl_kthread_create(int (*func)(void *), void *data, const char namefmt[], ...)
+{
+ struct task_struct *tsk;
+ va_list args;
+ char name[TASK_COMM_LEN];
+
+ va_start(args, namefmt);
+ vsnprintf(name, sizeof (name), namefmt, args);
+ va_end(args);
+ do {
+ tsk = kthread_create(func, data, "%s", name);
+ if (IS_ERR(tsk)) {
+ if (signal_pending(current)) {
+ clear_thread_flag(TIF_SIGPENDING);
+ continue;
+ }
+ if (PTR_ERR(tsk) == -ENOMEM)
+ continue;
+ return (NULL);
+ } else
+ return (tsk);
+ } while (1);
+}
+EXPORT_SYMBOL(spl_kthread_create);
diff --git a/module/spl/spl-tsd.c b/module/spl/spl-tsd.c
new file mode 100644
index 000000000..4c800292a
--- /dev/null
+++ b/module/spl/spl-tsd.c
@@ -0,0 +1,720 @@
+/*
+ * Copyright (C) 2010 Lawrence Livermore National Security, LLC.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <[email protected]>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ *
+ *
+ * Solaris Porting Layer (SPL) Thread Specific Data Implementation.
+ *
+ * Thread specific data has been implemented using a hash table, which avoids
+ * the need to add a member to the task structure and allows maximum
+ * portability between kernels. This implementation has been optimized
+ * to keep the tsd_set() and tsd_get() times as small as possible.
+ *
+ * The majority of the entries in the hash table are for specific tsd
+ * entries. These entries are hashed by the product of their key and
+ * pid because by design the key and pid are guaranteed to be unique.
+ * Their product also has the desirable property that it will be uniformly
+ * distributed over the hash bins provided neither the pid nor key is zero.
+ * Under Linux a pid of zero is never used by a normal process and thus won't
+ * appear here, and this implementation is careful never to assign a zero key.
+ * By default the hash table is sized to 512 bins which is expected to
+ * be sufficient for light to moderate usage of thread specific data.
+ *
+ * The hash table contains two additional types of entries. The first
+ * type of entry is called a 'key' entry and it is added to the hash during
+ * tsd_create(). It is used to store the address of the destructor function
+ * and it is used as an anchor point. All tsd entries which use the same
+ * key will be linked to this entry. This is used during tsd_destroy() to
+ * quickly call the destructor function for all tsd associated with the key.
+ * The 'key' entry may be looked up with tsd_hash_search() by passing the
+ * key you wish to look up and the DTOR_PID constant as the pid.
+ *
+ * The second type of entry is called a 'pid' entry and it is added to the
+ * hash the first time a process set a key. The 'pid' entry is also used
+ * as an anchor and all tsd for the process will be linked to it. This
+ * list is using during tsd_exit() to ensure all registered destructors
+ * are run for the process. The 'pid' entry may be looked up with
+ * tsd_hash_search() by passing the PID_KEY constant as the key, and
+ * the process pid. Note that tsd_exit() is called by thread_exit()
+ * so if your using the Solaris thread API you should not need to call
+ * tsd_exit() directly.
+ *
+ */
+
+#include <sys/kmem.h>
+#include <sys/thread.h>
+#include <sys/tsd.h>
+#include <linux/hash.h>
+
+typedef struct tsd_hash_bin {
+ spinlock_t hb_lock;
+ struct hlist_head hb_head;
+} tsd_hash_bin_t;
+
+typedef struct tsd_hash_table {
+ spinlock_t ht_lock;
+ uint_t ht_bits;
+ uint_t ht_key;
+ tsd_hash_bin_t *ht_bins;
+} tsd_hash_table_t;
+
+typedef struct tsd_hash_entry {
+ uint_t he_key;
+ pid_t he_pid;
+ dtor_func_t he_dtor;
+ void *he_value;
+ struct hlist_node he_list;
+ struct list_head he_key_list;
+ struct list_head he_pid_list;
+} tsd_hash_entry_t;
+
+static tsd_hash_table_t *tsd_hash_table = NULL;
+
+
+/*
+ * tsd_hash_search - searches hash table for tsd_hash_entry
+ * @table: hash table
+ * @key: search key
+ * @pid: search pid
+ */
+static tsd_hash_entry_t *
+tsd_hash_search(tsd_hash_table_t *table, uint_t key, pid_t pid)
+{
+ struct hlist_node *node;
+ tsd_hash_entry_t *entry;
+ tsd_hash_bin_t *bin;
+ ulong_t hash;
+
+ hash = hash_long((ulong_t)key * (ulong_t)pid, table->ht_bits);
+ bin = &table->ht_bins[hash];
+ spin_lock(&bin->hb_lock);
+ hlist_for_each(node, &bin->hb_head) {
+ entry = list_entry(node, tsd_hash_entry_t, he_list);
+ if ((entry->he_key == key) && (entry->he_pid == pid)) {
+ spin_unlock(&bin->hb_lock);
+ return (entry);
+ }
+ }
+
+ spin_unlock(&bin->hb_lock);
+ return (NULL);
+}
+
+/*
+ * tsd_hash_dtor - call the destructor and free all entries on the list
+ * @work: list of hash entries
+ *
+ * For a list of entries which have all already been removed from the
+ * hash, call their registered destructor and then free the associated memory.
+ */
+static void
+tsd_hash_dtor(struct hlist_head *work)
+{
+ tsd_hash_entry_t *entry;
+
+ while (!hlist_empty(work)) {
+ entry = hlist_entry(work->first, tsd_hash_entry_t, he_list);
+ hlist_del(&entry->he_list);
+
+ if (entry->he_dtor && entry->he_pid != DTOR_PID)
+ entry->he_dtor(entry->he_value);
+
+ kmem_free(entry, sizeof (tsd_hash_entry_t));
+ }
+}
+
+/*
+ * tsd_hash_add - adds an entry to hash table
+ * @table: hash table
+ * @key: search key
+ * @pid: search pid
+ *
+ * The caller is responsible for ensuring the unique key/pid do not
+ * already exist in the hash table. This is possible because all entries
+ * are thread specific, thus a concurrent thread will never attempt to
+ * add this key/pid. Because multiple bins must be checked to add
+ * links to the dtor and pid entries the entire table is locked.
+ */
+static int
+tsd_hash_add(tsd_hash_table_t *table, uint_t key, pid_t pid, void *value)
+{
+ tsd_hash_entry_t *entry, *dtor_entry, *pid_entry;
+ tsd_hash_bin_t *bin;
+ ulong_t hash;
+ int rc = 0;
+
+ ASSERT3P(tsd_hash_search(table, key, pid), ==, NULL);
+
+ /* New entry allocate structure, set value, and add to hash */
+ entry = kmem_alloc(sizeof (tsd_hash_entry_t), KM_PUSHPAGE);
+ if (entry == NULL)
+ return (ENOMEM);
+
+ entry->he_key = key;
+ entry->he_pid = pid;
+ entry->he_value = value;
+ INIT_HLIST_NODE(&entry->he_list);
+ INIT_LIST_HEAD(&entry->he_key_list);
+ INIT_LIST_HEAD(&entry->he_pid_list);
+
+ spin_lock(&table->ht_lock);
+
+ /* Destructor entry must exist for all valid keys */
+ dtor_entry = tsd_hash_search(table, entry->he_key, DTOR_PID);
+ ASSERT3P(dtor_entry, !=, NULL);
+ entry->he_dtor = dtor_entry->he_dtor;
+
+ /* Process entry must exist for all valid processes */
+ pid_entry = tsd_hash_search(table, PID_KEY, entry->he_pid);
+ ASSERT3P(pid_entry, !=, NULL);
+
+ hash = hash_long((ulong_t)key * (ulong_t)pid, table->ht_bits);
+ bin = &table->ht_bins[hash];
+ spin_lock(&bin->hb_lock);
+
+ /* Add to the hash, key, and pid lists */
+ hlist_add_head(&entry->he_list, &bin->hb_head);
+ list_add(&entry->he_key_list, &dtor_entry->he_key_list);
+ list_add(&entry->he_pid_list, &pid_entry->he_pid_list);
+
+ spin_unlock(&bin->hb_lock);
+ spin_unlock(&table->ht_lock);
+
+ return (rc);
+}
+
+/*
+ * tsd_hash_add_key - adds a destructor entry to the hash table
+ * @table: hash table
+ * @keyp: search key
+ * @dtor: key destructor
+ *
+ * For every unique key there is a single entry in the hash which is used
+ * as an anchor. All other thread specific entries for this key are linked
+ * to this anchor via the 'he_key_list' list head. On return the keyp
+ * will be set to the next available key for the hash table.
+ */
+static int
+tsd_hash_add_key(tsd_hash_table_t *table, uint_t *keyp, dtor_func_t dtor)
+{
+ tsd_hash_entry_t *tmp_entry, *entry;
+ tsd_hash_bin_t *bin;
+ ulong_t hash;
+ int keys_checked = 0;
+
+ ASSERT3P(table, !=, NULL);
+
+ /* Allocate entry to be used as a destructor for this key */
+ entry = kmem_alloc(sizeof (tsd_hash_entry_t), KM_PUSHPAGE);
+ if (entry == NULL)
+ return (ENOMEM);
+
+ /* Determine next available key value */
+ spin_lock(&table->ht_lock);
+ do {
+ /* Limited to TSD_KEYS_MAX concurrent unique keys */
+ if (table->ht_key++ > TSD_KEYS_MAX)
+ table->ht_key = 1;
+
+ /* Ensure failure when all TSD_KEYS_MAX keys are in use */
+ if (keys_checked++ >= TSD_KEYS_MAX) {
+ spin_unlock(&table->ht_lock);
+ return (ENOENT);
+ }
+
+ tmp_entry = tsd_hash_search(table, table->ht_key, DTOR_PID);
+ } while (tmp_entry);
+
+ /* Add destructor entry in to hash table */
+ entry->he_key = *keyp = table->ht_key;
+ entry->he_pid = DTOR_PID;
+ entry->he_dtor = dtor;
+ entry->he_value = NULL;
+ INIT_HLIST_NODE(&entry->he_list);
+ INIT_LIST_HEAD(&entry->he_key_list);
+ INIT_LIST_HEAD(&entry->he_pid_list);
+
+ hash = hash_long((ulong_t)*keyp * (ulong_t)DTOR_PID, table->ht_bits);
+ bin = &table->ht_bins[hash];
+ spin_lock(&bin->hb_lock);
+
+ hlist_add_head(&entry->he_list, &bin->hb_head);
+
+ spin_unlock(&bin->hb_lock);
+ spin_unlock(&table->ht_lock);
+
+ return (0);
+}
+
+/*
+ * tsd_hash_add_pid - adds a process entry to the hash table
+ * @table: hash table
+ * @pid: search pid
+ *
+ * For every process there is a single entry in the hash which is used
+ * as an anchor. All other thread specific entries for this process are
+ * linked to this anchor via the 'he_pid_list' list head.
+ */
+static int
+tsd_hash_add_pid(tsd_hash_table_t *table, pid_t pid)
+{
+ tsd_hash_entry_t *entry;
+ tsd_hash_bin_t *bin;
+ ulong_t hash;
+
+ /* Allocate entry to be used as the process reference */
+ entry = kmem_alloc(sizeof (tsd_hash_entry_t), KM_PUSHPAGE);
+ if (entry == NULL)
+ return (ENOMEM);
+
+ spin_lock(&table->ht_lock);
+ entry->he_key = PID_KEY;
+ entry->he_pid = pid;
+ entry->he_dtor = NULL;
+ entry->he_value = NULL;
+ INIT_HLIST_NODE(&entry->he_list);
+ INIT_LIST_HEAD(&entry->he_key_list);
+ INIT_LIST_HEAD(&entry->he_pid_list);
+
+ hash = hash_long((ulong_t)PID_KEY * (ulong_t)pid, table->ht_bits);
+ bin = &table->ht_bins[hash];
+ spin_lock(&bin->hb_lock);
+
+ hlist_add_head(&entry->he_list, &bin->hb_head);
+
+ spin_unlock(&bin->hb_lock);
+ spin_unlock(&table->ht_lock);
+
+ return (0);
+}
+
+/*
+ * tsd_hash_del - delete an entry from hash table, key, and pid lists
+ * @table: hash table
+ * @key: search key
+ * @pid: search pid
+ */
+static void
+tsd_hash_del(tsd_hash_table_t *table, tsd_hash_entry_t *entry)
+{
+ hlist_del(&entry->he_list);
+ list_del_init(&entry->he_key_list);
+ list_del_init(&entry->he_pid_list);
+}
+
+/*
+ * tsd_hash_table_init - allocate a hash table
+ * @bits: hash table size
+ *
+ * A hash table with 2^bits bins will be created; it may not be resized
+ * after the fact and must be freed with tsd_hash_table_fini().
+ */
+static tsd_hash_table_t *
+tsd_hash_table_init(uint_t bits)
+{
+ tsd_hash_table_t *table;
+ int hash, size = (1 << bits);
+
+ table = kmem_zalloc(sizeof (tsd_hash_table_t), KM_SLEEP);
+ if (table == NULL)
+ return (NULL);
+
+ table->ht_bins = kmem_zalloc(sizeof (tsd_hash_bin_t) * size, KM_SLEEP);
+ if (table->ht_bins == NULL) {
+ kmem_free(table, sizeof (tsd_hash_table_t));
+ return (NULL);
+ }
+
+ for (hash = 0; hash < size; hash++) {
+ spin_lock_init(&table->ht_bins[hash].hb_lock);
+ INIT_HLIST_HEAD(&table->ht_bins[hash].hb_head);
+ }
+
+ spin_lock_init(&table->ht_lock);
+ table->ht_bits = bits;
+ table->ht_key = 1;
+
+ return (table);
+}
+
+/*
+ * tsd_hash_table_fini - free a hash table
+ * @table: hash table
+ *
+ * Free a hash table allocated by tsd_hash_table_init(). If the hash
+ * table is not empty this function will call the proper destructor for
+ * all remaining entries before freeing the memory used by those entries.
+ */
+static void
+tsd_hash_table_fini(tsd_hash_table_t *table)
+{
+ HLIST_HEAD(work);
+ tsd_hash_bin_t *bin;
+ tsd_hash_entry_t *entry;
+ int size, i;
+
+ ASSERT3P(table, !=, NULL);
+ spin_lock(&table->ht_lock);
+ for (i = 0, size = (1 << table->ht_bits); i < size; i++) {
+ bin = &table->ht_bins[i];
+ spin_lock(&bin->hb_lock);
+ while (!hlist_empty(&bin->hb_head)) {
+ entry = hlist_entry(bin->hb_head.first,
+ tsd_hash_entry_t, he_list);
+ tsd_hash_del(table, entry);
+ hlist_add_head(&entry->he_list, &work);
+ }
+ spin_unlock(&bin->hb_lock);
+ }
+ spin_unlock(&table->ht_lock);
+
+ tsd_hash_dtor(&work);
+ kmem_free(table->ht_bins, sizeof (tsd_hash_bin_t)*(1<<table->ht_bits));
+ kmem_free(table, sizeof (tsd_hash_table_t));
+}
+
+/*
+ * tsd_remove_entry - remove a tsd entry for this thread
+ * @entry: entry to remove
+ *
+ * Remove the thread specific data @entry for this thread.
+ * If this is the last entry for this thread, also remove the PID entry.
+ */
+static void
+tsd_remove_entry(tsd_hash_entry_t *entry)
+{
+ HLIST_HEAD(work);
+ tsd_hash_table_t *table;
+ tsd_hash_entry_t *pid_entry;
+ tsd_hash_bin_t *pid_entry_bin, *entry_bin;
+ ulong_t hash;
+
+ table = tsd_hash_table;
+ ASSERT3P(table, !=, NULL);
+ ASSERT3P(entry, !=, NULL);
+
+ spin_lock(&table->ht_lock);
+
+ hash = hash_long((ulong_t)entry->he_key *
+ (ulong_t)entry->he_pid, table->ht_bits);
+ entry_bin = &table->ht_bins[hash];
+
+ /* save the possible pid_entry */
+ pid_entry = list_entry(entry->he_pid_list.next, tsd_hash_entry_t,
+ he_pid_list);
+
+ /* remove entry */
+ spin_lock(&entry_bin->hb_lock);
+ tsd_hash_del(table, entry);
+ hlist_add_head(&entry->he_list, &work);
+ spin_unlock(&entry_bin->hb_lock);
+
+ /* if pid_entry is indeed pid_entry, then remove it if it's empty */
+ if (pid_entry->he_key == PID_KEY &&
+ list_empty(&pid_entry->he_pid_list)) {
+ hash = hash_long((ulong_t)pid_entry->he_key *
+ (ulong_t)pid_entry->he_pid, table->ht_bits);
+ pid_entry_bin = &table->ht_bins[hash];
+
+ spin_lock(&pid_entry_bin->hb_lock);
+ tsd_hash_del(table, pid_entry);
+ hlist_add_head(&pid_entry->he_list, &work);
+ spin_unlock(&pid_entry_bin->hb_lock);
+ }
+
+ spin_unlock(&table->ht_lock);
+
+ tsd_hash_dtor(&work);
+}
+
+/*
+ * tsd_set - set thread specific data
+ * @key: lookup key
+ * @value: value to set
+ *
+ * Caller must prevent racing tsd_create() or tsd_destroy(); it is
+ * protected from racing tsd_get() or tsd_set() because the data is
+ * thread specific.
+ * This function has been optimized to be fast for the update case.
+ * When setting the tsd initially it will be slower due to additional
+ * required locking and potential memory allocations.
+ */
+int
+tsd_set(uint_t key, void *value)
+{
+ tsd_hash_table_t *table;
+ tsd_hash_entry_t *entry;
+ pid_t pid;
+ int rc;
+ /* mark remove if value is NULL */
+ boolean_t remove = (value == NULL);
+
+ table = tsd_hash_table;
+ pid = curthread->pid;
+ ASSERT3P(table, !=, NULL);
+
+ if ((key == 0) || (key > TSD_KEYS_MAX))
+ return (EINVAL);
+
+ /* Entry already exists in the hash table, update the value */
+ entry = tsd_hash_search(table, key, pid);
+ if (entry) {
+ entry->he_value = value;
+ /* remove the entry */
+ if (remove)
+ tsd_remove_entry(entry);
+ return (0);
+ }
+
+ /* don't create entry if value is NULL */
+ if (remove)
+ return (0);
+
+ /* Add a process entry to the hash if one does not yet exist */
+ entry = tsd_hash_search(table, PID_KEY, pid);
+ if (entry == NULL) {
+ rc = tsd_hash_add_pid(table, pid);
+ if (rc)
+ return (rc);
+ }
+
+ rc = tsd_hash_add(table, key, pid, value);
+ return (rc);
+}
+EXPORT_SYMBOL(tsd_set);
+
+/*
+ * tsd_get - get thread specific data
+ * @key: lookup key
+ *
+ * Caller must prevent racing tsd_create() or tsd_destroy(). This
+ * implementation is designed to be fast and scalable; it does not
+ * lock the entire table, only a single hash bin.
+ */
+void *
+tsd_get(uint_t key)
+{
+ tsd_hash_entry_t *entry;
+
+ ASSERT3P(tsd_hash_table, !=, NULL);
+
+ if ((key == 0) || (key > TSD_KEYS_MAX))
+ return (NULL);
+
+ entry = tsd_hash_search(tsd_hash_table, key, curthread->pid);
+ if (entry == NULL)
+ return (NULL);
+
+ return (entry->he_value);
+}
+EXPORT_SYMBOL(tsd_get);
+
+/*
+ * tsd_get_by_thread - get thread specific data for specified thread
+ * @key: lookup key
+ * @thread: thread to lookup
+ *
+ * Caller must prevent racing tsd_create() or tsd_destroy(). This
+ * implementation is designed to be fast and scalable; it does not
+ * lock the entire table, only a single hash bin.
+ */
+void *
+tsd_get_by_thread(uint_t key, kthread_t *thread)
+{
+ tsd_hash_entry_t *entry;
+
+ ASSERT3P(tsd_hash_table, !=, NULL);
+
+ if ((key == 0) || (key > TSD_KEYS_MAX))
+ return (NULL);
+
+ entry = tsd_hash_search(tsd_hash_table, key, thread->pid);
+ if (entry == NULL)
+ return (NULL);
+
+ return (entry->he_value);
+}
+EXPORT_SYMBOL(tsd_get_by_thread);
+
+/*
+ * tsd_create - create thread specific data key
+ * @keyp: lookup key address
+ * @dtor: destructor called during tsd_destroy() or tsd_exit()
+ *
+ * The provided key must be set to 0 or it is assumed to be already in use.
+ * The dtor is allowed to be NULL in which case no additional cleanup
+ * for the data is performed during tsd_destroy() or tsd_exit().
+ *
+ * Caller must prevent racing tsd_set() or tsd_get(), this function is
+ * safe from racing tsd_create(), tsd_destroy(), and tsd_exit().
+ */
+void
+tsd_create(uint_t *keyp, dtor_func_t dtor)
+{
+ ASSERT3P(keyp, !=, NULL);
+ if (*keyp)
+ return;
+
+ (void) tsd_hash_add_key(tsd_hash_table, keyp, dtor);
+}
+EXPORT_SYMBOL(tsd_create);
+
+/*
+ * tsd_destroy - destroy thread specific data
+ * @keyp: lookup key address
+ *
+ * Destroys the thread specific data on all threads which use this key.
+ *
+ * Caller must prevent racing tsd_set() or tsd_get(), this function is
+ * safe from racing tsd_create(), tsd_destroy(), and tsd_exit().
+ */
+void
+tsd_destroy(uint_t *keyp)
+{
+ HLIST_HEAD(work);
+ tsd_hash_table_t *table;
+ tsd_hash_entry_t *dtor_entry, *entry;
+ tsd_hash_bin_t *dtor_entry_bin, *entry_bin;
+ ulong_t hash;
+
+ table = tsd_hash_table;
+ ASSERT3P(table, !=, NULL);
+
+ spin_lock(&table->ht_lock);
+ dtor_entry = tsd_hash_search(table, *keyp, DTOR_PID);
+ if (dtor_entry == NULL) {
+ spin_unlock(&table->ht_lock);
+ return;
+ }
+
+ /*
+ * All threads which use this key must be linked off of the
+ * DTOR_PID entry. They are removed from the hash table and
+ * linked in to a private working list to be destroyed.
+ */
+ while (!list_empty(&dtor_entry->he_key_list)) {
+ entry = list_entry(dtor_entry->he_key_list.next,
+ tsd_hash_entry_t, he_key_list);
+ ASSERT3U(dtor_entry->he_key, ==, entry->he_key);
+ ASSERT3P(dtor_entry->he_dtor, ==, entry->he_dtor);
+
+ hash = hash_long((ulong_t)entry->he_key *
+ (ulong_t)entry->he_pid, table->ht_bits);
+ entry_bin = &table->ht_bins[hash];
+
+ spin_lock(&entry_bin->hb_lock);
+ tsd_hash_del(table, entry);
+ hlist_add_head(&entry->he_list, &work);
+ spin_unlock(&entry_bin->hb_lock);
+ }
+
+ hash = hash_long((ulong_t)dtor_entry->he_key *
+ (ulong_t)dtor_entry->he_pid, table->ht_bits);
+ dtor_entry_bin = &table->ht_bins[hash];
+
+ spin_lock(&dtor_entry_bin->hb_lock);
+ tsd_hash_del(table, dtor_entry);
+ hlist_add_head(&dtor_entry->he_list, &work);
+ spin_unlock(&dtor_entry_bin->hb_lock);
+ spin_unlock(&table->ht_lock);
+
+ tsd_hash_dtor(&work);
+ *keyp = 0;
+}
+EXPORT_SYMBOL(tsd_destroy);
+
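/*
 * Illustrative sketch (editorial, not part of the patch): typical TSD key
 * life cycle. The key is created once, each thread stores its own value,
 * and the registered destructor runs when the value is cleared, the
 * thread exits, or the key is destroyed.
 */
static uint_t example_tsd_key = 0;

static void
example_tsd_dtor(void *value)
{
        kmem_free(value, sizeof (int));
}

static void
example_tsd_usage(void)
{
        int *state;

        tsd_create(&example_tsd_key, example_tsd_dtor);

        state = kmem_alloc(sizeof (int), KM_SLEEP);
        *state = 42;
        if (tsd_set(example_tsd_key, state) != 0)
                kmem_free(state, sizeof (int));

        ASSERT3P(tsd_get(example_tsd_key), ==, state);

        tsd_destroy(&example_tsd_key);  /* runs the destructor for all threads */
}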
+/*
+ * tsd_exit - destroys all thread specific data for this thread
+ *
+ * Destroys all the thread specific data for this thread.
+ *
+ * Caller must prevent racing tsd_set() or tsd_get(), this function is
+ * safe from racing tsd_create(), tsd_destroy(), and tsd_exit().
+ */
+void
+tsd_exit(void)
+{
+ HLIST_HEAD(work);
+ tsd_hash_table_t *table;
+ tsd_hash_entry_t *pid_entry, *entry;
+ tsd_hash_bin_t *pid_entry_bin, *entry_bin;
+ ulong_t hash;
+
+ table = tsd_hash_table;
+ ASSERT3P(table, !=, NULL);
+
+ spin_lock(&table->ht_lock);
+ pid_entry = tsd_hash_search(table, PID_KEY, curthread->pid);
+ if (pid_entry == NULL) {
+ spin_unlock(&table->ht_lock);
+ return;
+ }
+
+ /*
+ * All keys associated with this pid must be linked off of the
+ * PID_KEY entry. They are removed from the hash table and
+ * linked in to a private working list to be destroyed.
+ */
+
+ while (!list_empty(&pid_entry->he_pid_list)) {
+ entry = list_entry(pid_entry->he_pid_list.next,
+ tsd_hash_entry_t, he_pid_list);
+ ASSERT3U(pid_entry->he_pid, ==, entry->he_pid);
+
+ hash = hash_long((ulong_t)entry->he_key *
+ (ulong_t)entry->he_pid, table->ht_bits);
+ entry_bin = &table->ht_bins[hash];
+
+ spin_lock(&entry_bin->hb_lock);
+ tsd_hash_del(table, entry);
+ hlist_add_head(&entry->he_list, &work);
+ spin_unlock(&entry_bin->hb_lock);
+ }
+
+ hash = hash_long((ulong_t)pid_entry->he_key *
+ (ulong_t)pid_entry->he_pid, table->ht_bits);
+ pid_entry_bin = &table->ht_bins[hash];
+
+ spin_lock(&pid_entry_bin->hb_lock);
+ tsd_hash_del(table, pid_entry);
+ hlist_add_head(&pid_entry->he_list, &work);
+ spin_unlock(&pid_entry_bin->hb_lock);
+ spin_unlock(&table->ht_lock);
+
+ tsd_hash_dtor(&work);
+}
+EXPORT_SYMBOL(tsd_exit);
+
+int
+spl_tsd_init(void)
+{
+ tsd_hash_table = tsd_hash_table_init(TSD_HASH_TABLE_BITS_DEFAULT);
+ if (tsd_hash_table == NULL)
+ return (1);
+
+ return (0);
+}
+
+void
+spl_tsd_fini(void)
+{
+ tsd_hash_table_fini(tsd_hash_table);
+ tsd_hash_table = NULL;
+}
diff --git a/module/spl/spl-vmem.c b/module/spl/spl-vmem.c
new file mode 100644
index 000000000..e1a84a911
--- /dev/null
+++ b/module/spl/spl-vmem.c
@@ -0,0 +1,135 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <[email protected]>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <sys/debug.h>
+#include <sys/vmem.h>
+#include <sys/kmem_cache.h>
+#include <sys/shrinker.h>
+#include <linux/module.h>
+
+vmem_t *heap_arena = NULL;
+EXPORT_SYMBOL(heap_arena);
+
+vmem_t *zio_alloc_arena = NULL;
+EXPORT_SYMBOL(zio_alloc_arena);
+
+vmem_t *zio_arena = NULL;
+EXPORT_SYMBOL(zio_arena);
+
+#define VMEM_FLOOR_SIZE (4 * 1024 * 1024) /* 4MB floor */
+
+/*
+ * Return approximate virtual memory usage based on these assumptions:
+ *
+ * 1) The major SPL consumer of virtual memory is the kmem cache.
+ * 2) Memory allocated with vmem_alloc() is short lived and can be ignored.
+ * 3) Allow a 4MB floor as a generous pad given normal consumption.
+ * 4) The spl_kmem_cache_sem only contends with cache create/destroy.
+ */
+size_t
+vmem_size(vmem_t *vmp, int typemask)
+{
+ spl_kmem_cache_t *skc;
+ size_t alloc = VMEM_FLOOR_SIZE;
+
+ if ((typemask & VMEM_ALLOC) && (typemask & VMEM_FREE))
+ return (VMALLOC_TOTAL);
+
+
+ down_read(&spl_kmem_cache_sem);
+ list_for_each_entry(skc, &spl_kmem_cache_list, skc_list) {
+ if (skc->skc_flags & KMC_VMEM)
+ alloc += skc->skc_slab_size * skc->skc_slab_total;
+ }
+ up_read(&spl_kmem_cache_sem);
+
+ if (typemask & VMEM_ALLOC)
+ return (MIN(alloc, VMALLOC_TOTAL));
+ else if (typemask & VMEM_FREE)
+ return (MAX(VMALLOC_TOTAL - alloc, 0));
+ else
+ return (0);
+}
+EXPORT_SYMBOL(vmem_size);
+
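/*
 * Illustrative sketch (editorial, not part of the patch): querying the
 * approximate virtual memory statistics computed above. The vmem_t
 * argument is ignored by this implementation, so NULL is acceptable.
 */
static void
example_vmem_report(void)
{
        size_t total = vmem_size(NULL, VMEM_ALLOC | VMEM_FREE);
        size_t used = vmem_size(NULL, VMEM_ALLOC);
        size_t avail = vmem_size(NULL, VMEM_FREE);

        printk(KERN_INFO "vmem: %zu of %zu bytes used, %zu free\n",
            used, total, avail);
}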
+/*
+ * Public vmem_alloc(), vmem_zalloc() and vmem_free() interfaces.
+ */
+void *
+spl_vmem_alloc(size_t size, int flags, const char *func, int line)
+{
+ ASSERT0(flags & ~KM_PUBLIC_MASK);
+
+ flags |= KM_VMEM;
+
+#if !defined(DEBUG_KMEM)
+ return (spl_kmem_alloc_impl(size, flags, NUMA_NO_NODE));
+#elif !defined(DEBUG_KMEM_TRACKING)
+ return (spl_kmem_alloc_debug(size, flags, NUMA_NO_NODE));
+#else
+ return (spl_kmem_alloc_track(size, flags, func, line, NUMA_NO_NODE));
+#endif
+}
+EXPORT_SYMBOL(spl_vmem_alloc);
+
+void *
+spl_vmem_zalloc(size_t size, int flags, const char *func, int line)
+{
+ ASSERT0(flags & ~KM_PUBLIC_MASK);
+
+ flags |= (KM_VMEM | KM_ZERO);
+
+#if !defined(DEBUG_KMEM)
+ return (spl_kmem_alloc_impl(size, flags, NUMA_NO_NODE));
+#elif !defined(DEBUG_KMEM_TRACKING)
+ return (spl_kmem_alloc_debug(size, flags, NUMA_NO_NODE));
+#else
+ return (spl_kmem_alloc_track(size, flags, func, line, NUMA_NO_NODE));
+#endif
+}
+EXPORT_SYMBOL(spl_vmem_zalloc);
+
+void
+spl_vmem_free(const void *buf, size_t size)
+{
+#if !defined(DEBUG_KMEM)
+ return (spl_kmem_free_impl(buf, size));
+#elif !defined(DEBUG_KMEM_TRACKING)
+ return (spl_kmem_free_debug(buf, size));
+#else
+ return (spl_kmem_free_track(buf, size));
+#endif
+}
+EXPORT_SYMBOL(spl_vmem_free);
+
+int
+spl_vmem_init(void)
+{
+ return (0);
+}
+
+void
+spl_vmem_fini(void)
+{
+}
diff --git a/module/spl/spl-vnode.c b/module/spl/spl-vnode.c
new file mode 100644
index 000000000..28ce21276
--- /dev/null
+++ b/module/spl/spl-vnode.c
@@ -0,0 +1,779 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <[email protected]>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Solaris Porting Layer (SPL) Vnode Implementation.
+ */
+
+#include <sys/cred.h>
+#include <sys/vnode.h>
+#include <sys/kmem_cache.h>
+#include <linux/falloc.h>
+#include <linux/fs.h>
+#include <linux/uaccess.h>
+#ifdef HAVE_FDTABLE_HEADER
+#include <linux/fdtable.h>
+#endif
+
+vnode_t *rootdir = (vnode_t *)0xabcd1234;
+EXPORT_SYMBOL(rootdir);
+
+static spl_kmem_cache_t *vn_cache;
+static spl_kmem_cache_t *vn_file_cache;
+
+static DEFINE_SPINLOCK(vn_file_lock);
+static LIST_HEAD(vn_file_list);
+
+static int
+spl_filp_fallocate(struct file *fp, int mode, loff_t offset, loff_t len)
+{
+ int error = -EOPNOTSUPP;
+
+#ifdef HAVE_FILE_FALLOCATE
+ if (fp->f_op->fallocate)
+ error = fp->f_op->fallocate(fp, mode, offset, len);
+#else
+#ifdef HAVE_INODE_FALLOCATE
+ if (fp->f_dentry && fp->f_dentry->d_inode &&
+ fp->f_dentry->d_inode->i_op->fallocate)
+ error = fp->f_dentry->d_inode->i_op->fallocate(
+ fp->f_dentry->d_inode, mode, offset, len);
+#endif /* HAVE_INODE_FALLOCATE */
+#endif /* HAVE_FILE_FALLOCATE */
+
+ return (error);
+}
+
+static int
+spl_filp_fsync(struct file *fp, int sync)
+{
+#ifdef HAVE_2ARGS_VFS_FSYNC
+ return (vfs_fsync(fp, sync));
+#else
+ return (vfs_fsync(fp, (fp)->f_dentry, sync));
+#endif /* HAVE_2ARGS_VFS_FSYNC */
+}
+
+static ssize_t
+spl_kernel_write(struct file *file, const void *buf, size_t count, loff_t *pos)
+{
+#if defined(HAVE_KERNEL_WRITE_PPOS)
+ return (kernel_write(file, buf, count, pos));
+#else
+ mm_segment_t saved_fs;
+ ssize_t ret;
+
+ saved_fs = get_fs();
+ set_fs(get_ds());
+
+ ret = vfs_write(file, (__force const char __user *)buf, count, pos);
+
+ set_fs(saved_fs);
+
+ return (ret);
+#endif
+}
+
+static ssize_t
+spl_kernel_read(struct file *file, void *buf, size_t count, loff_t *pos)
+{
+#if defined(HAVE_KERNEL_READ_PPOS)
+ return (kernel_read(file, buf, count, pos));
+#else
+ mm_segment_t saved_fs;
+ ssize_t ret;
+
+ saved_fs = get_fs();
+ set_fs(get_ds());
+
+ ret = vfs_read(file, (void __user *)buf, count, pos);
+
+ set_fs(saved_fs);
+
+ return (ret);
+#endif
+}
+
+vtype_t
+vn_mode_to_vtype(mode_t mode)
+{
+ if (S_ISREG(mode))
+ return (VREG);
+
+ if (S_ISDIR(mode))
+ return (VDIR);
+
+ if (S_ISCHR(mode))
+ return (VCHR);
+
+ if (S_ISBLK(mode))
+ return (VBLK);
+
+ if (S_ISFIFO(mode))
+ return (VFIFO);
+
+ if (S_ISLNK(mode))
+ return (VLNK);
+
+ if (S_ISSOCK(mode))
+ return (VSOCK);
+
+ return (VNON);
+} /* vn_mode_to_vtype() */
+EXPORT_SYMBOL(vn_mode_to_vtype);
+
+mode_t
+vn_vtype_to_mode(vtype_t vtype)
+{
+ if (vtype == VREG)
+ return (S_IFREG);
+
+ if (vtype == VDIR)
+ return (S_IFDIR);
+
+ if (vtype == VCHR)
+ return (S_IFCHR);
+
+ if (vtype == VBLK)
+ return (S_IFBLK);
+
+ if (vtype == VFIFO)
+ return (S_IFIFO);
+
+ if (vtype == VLNK)
+ return (S_IFLNK);
+
+ if (vtype == VSOCK)
+ return (S_IFSOCK);
+
+ return (VNON);
+} /* vn_vtype_to_mode() */
+EXPORT_SYMBOL(vn_vtype_to_mode);
+
+vnode_t *
+vn_alloc(int flag)
+{
+ vnode_t *vp;
+
+ vp = kmem_cache_alloc(vn_cache, flag);
+ if (vp != NULL) {
+ vp->v_file = NULL;
+ vp->v_type = 0;
+ }
+
+ return (vp);
+} /* vn_alloc() */
+EXPORT_SYMBOL(vn_alloc);
+
+void
+vn_free(vnode_t *vp)
+{
+ kmem_cache_free(vn_cache, vp);
+} /* vn_free() */
+EXPORT_SYMBOL(vn_free);
+
+int
+vn_open(const char *path, uio_seg_t seg, int flags, int mode, vnode_t **vpp,
+ int x1, void *x2)
+{
+ struct file *fp;
+ struct kstat stat;
+ int rc, saved_umask = 0;
+ gfp_t saved_gfp;
+ vnode_t *vp;
+
+ ASSERT(flags & (FWRITE | FREAD));
+ ASSERT(seg == UIO_SYSSPACE);
+ ASSERT(vpp);
+ *vpp = NULL;
+
+ if (!(flags & FCREAT) && (flags & FWRITE))
+ flags |= FEXCL;
+
+ /*
+ * Note for filp_open() the two low bits must be remapped to mean:
+ * 01 - read-only -> 00 read-only
+ * 10 - write-only -> 01 write-only
+ * 11 - read-write -> 10 read-write
+ */
+ flags--;
+
+ if (flags & FCREAT)
+ saved_umask = xchg(&current->fs->umask, 0);
+
+ fp = filp_open(path, flags, mode);
+
+ if (flags & FCREAT)
+ (void) xchg(&current->fs->umask, saved_umask);
+
+ if (IS_ERR(fp))
+ return (-PTR_ERR(fp));
+
+#if defined(HAVE_4ARGS_VFS_GETATTR)
+ rc = vfs_getattr(&fp->f_path, &stat, STATX_TYPE, AT_STATX_SYNC_AS_STAT);
+#elif defined(HAVE_2ARGS_VFS_GETATTR)
+ rc = vfs_getattr(&fp->f_path, &stat);
+#else
+ rc = vfs_getattr(fp->f_path.mnt, fp->f_dentry, &stat);
+#endif
+ if (rc) {
+ filp_close(fp, 0);
+ return (-rc);
+ }
+
+ vp = vn_alloc(KM_SLEEP);
+ if (!vp) {
+ filp_close(fp, 0);
+ return (ENOMEM);
+ }
+
+ saved_gfp = mapping_gfp_mask(fp->f_mapping);
+ mapping_set_gfp_mask(fp->f_mapping, saved_gfp & ~(__GFP_IO|__GFP_FS));
+
+ mutex_enter(&vp->v_lock);
+ vp->v_type = vn_mode_to_vtype(stat.mode);
+ vp->v_file = fp;
+ vp->v_gfp_mask = saved_gfp;
+ *vpp = vp;
+ mutex_exit(&vp->v_lock);
+
+ return (0);
+} /* vn_open() */
+EXPORT_SYMBOL(vn_open);
+
+int
+vn_openat(const char *path, uio_seg_t seg, int flags, int mode,
+ vnode_t **vpp, int x1, void *x2, vnode_t *vp, int fd)
+{
+ char *realpath;
+ int len, rc;
+
+ ASSERT(vp == rootdir);
+
+ len = strlen(path) + 2;
+ realpath = kmalloc(len, kmem_flags_convert(KM_SLEEP));
+ if (!realpath)
+ return (ENOMEM);
+
+ (void) snprintf(realpath, len, "/%s", path);
+ rc = vn_open(realpath, seg, flags, mode, vpp, x1, x2);
+ kfree(realpath);
+
+ return (rc);
+} /* vn_openat() */
+EXPORT_SYMBOL(vn_openat);
+
+int
+vn_rdwr(uio_rw_t uio, vnode_t *vp, void *addr, ssize_t len, offset_t off,
+ uio_seg_t seg, int ioflag, rlim64_t x2, void *x3, ssize_t *residp)
+{
+ struct file *fp = vp->v_file;
+ loff_t offset = off;
+ int rc;
+
+ ASSERT(uio == UIO_WRITE || uio == UIO_READ);
+ ASSERT(seg == UIO_SYSSPACE);
+ ASSERT((ioflag & ~FAPPEND) == 0);
+
+ if (ioflag & FAPPEND)
+ offset = fp->f_pos;
+
+ if (uio & UIO_WRITE)
+ rc = spl_kernel_write(fp, addr, len, &offset);
+ else
+ rc = spl_kernel_read(fp, addr, len, &offset);
+
+ fp->f_pos = offset;
+
+ if (rc < 0)
+ return (-rc);
+
+ if (residp) {
+ *residp = len - rc;
+ } else {
+ if (rc != len)
+ return (EIO);
+ }
+
+ return (0);
+} /* vn_rdwr() */
+EXPORT_SYMBOL(vn_rdwr);
+
+int
+vn_close(vnode_t *vp, int flags, int x1, int x2, void *x3, void *x4)
+{
+ int rc;
+
+ ASSERT(vp);
+ ASSERT(vp->v_file);
+
+ mapping_set_gfp_mask(vp->v_file->f_mapping, vp->v_gfp_mask);
+ rc = filp_close(vp->v_file, 0);
+ vn_free(vp);
+
+ return (-rc);
+} /* vn_close() */
+EXPORT_SYMBOL(vn_close);
+
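/*
 * Illustrative sketch (editorial, not part of the patch): reading the
 * beginning of a file with the vnode wrappers above. The path is a
 * hypothetical placeholder; the ignored rlimit/cred style arguments are
 * passed as zeros and NULLs.
 */
static int
example_vn_read(char *buf, ssize_t len)
{
        vnode_t *vp;
        ssize_t resid = 0;
        int error;

        error = vn_open("/etc/example.conf", UIO_SYSSPACE, FREAD, 0,
            &vp, 0, NULL);
        if (error)
                return (error);

        error = vn_rdwr(UIO_READ, vp, buf, len, 0, UIO_SYSSPACE, 0,
            0, NULL, &resid);

        (void) vn_close(vp, FREAD, 0, 0, NULL, NULL);

        return (error);
}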
+/*
+ * vn_seek() does not actually seek; it only performs bounds checking on the
+ * proposed seek. We perform minimal checking and allow vn_rdwr() to catch
+ * anything more serious.
+ */
+int
+vn_seek(vnode_t *vp, offset_t ooff, offset_t *noffp, void *ct)
+{
+ return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0);
+}
+EXPORT_SYMBOL(vn_seek);
+
+int
+vn_getattr(vnode_t *vp, vattr_t *vap, int flags, void *x3, void *x4)
+{
+ struct file *fp;
+ struct kstat stat;
+ int rc;
+
+ ASSERT(vp);
+ ASSERT(vp->v_file);
+ ASSERT(vap);
+
+ fp = vp->v_file;
+
+#if defined(HAVE_4ARGS_VFS_GETATTR)
+ rc = vfs_getattr(&fp->f_path, &stat, STATX_BASIC_STATS,
+ AT_STATX_SYNC_AS_STAT);
+#elif defined(HAVE_2ARGS_VFS_GETATTR)
+ rc = vfs_getattr(&fp->f_path, &stat);
+#else
+ rc = vfs_getattr(fp->f_path.mnt, fp->f_dentry, &stat);
+#endif
+ if (rc)
+ return (-rc);
+
+ vap->va_type = vn_mode_to_vtype(stat.mode);
+ vap->va_mode = stat.mode;
+ vap->va_uid = KUID_TO_SUID(stat.uid);
+ vap->va_gid = KGID_TO_SGID(stat.gid);
+ vap->va_fsid = 0;
+ vap->va_nodeid = stat.ino;
+ vap->va_nlink = stat.nlink;
+ vap->va_size = stat.size;
+ vap->va_blksize = stat.blksize;
+ vap->va_atime = stat.atime;
+ vap->va_mtime = stat.mtime;
+ vap->va_ctime = stat.ctime;
+ vap->va_rdev = stat.rdev;
+ vap->va_nblocks = stat.blocks;
+
+ return (0);
+}
+EXPORT_SYMBOL(vn_getattr);
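+
+/*
+ * For example, the size of an open vnode can be read back with a sketch
+ * like the following (vp is assumed to be a vnode returned by vn_open()):
+ *
+ * vattr_t vap;
+ * offset_t len = 0;
+ *
+ * if (vn_getattr(vp, &vap, 0, NULL, NULL) == 0)
+ *     len = vap.va_size;
+ */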
+
+int
+vn_fsync(vnode_t *vp, int flags, void *x3, void *x4)
+{
+ int datasync = 0;
+ int error;
+ int fstrans;
+
+ ASSERT(vp);
+ ASSERT(vp->v_file);
+
+ if (flags & FDSYNC)
+ datasync = 1;
+
+ /*
+ * May enter XFS, which generates a warning when PF_FSTRANS is set.
+ * To avoid this, the flag is cleared over the fsync call and then restored.
+ */
+ fstrans = __spl_pf_fstrans_check();
+ if (fstrans)
+ current->flags &= ~(__SPL_PF_FSTRANS);
+
+ error = -spl_filp_fsync(vp->v_file, datasync);
+ if (fstrans)
+ current->flags |= __SPL_PF_FSTRANS;
+
+ return (error);
+} /* vn_fsync() */
+EXPORT_SYMBOL(vn_fsync);
+
+int vn_space(vnode_t *vp, int cmd, struct flock *bfp, int flag,
+ offset_t offset, void *x6, void *x7)
+{
+ int error = EOPNOTSUPP;
+#ifdef FALLOC_FL_PUNCH_HOLE
+ int fstrans;
+#endif
+
+ if (cmd != F_FREESP || bfp->l_whence != 0)
+ return (EOPNOTSUPP);
+
+ ASSERT(vp);
+ ASSERT(vp->v_file);
+ ASSERT(bfp->l_start >= 0 && bfp->l_len > 0);
+
+#ifdef FALLOC_FL_PUNCH_HOLE
+ /*
+ * May enter XFS, which generates a warning when PF_FSTRANS is set.
+ * To avoid this, the flag is cleared over the fallocate call and then
+ * restored.
+ */
+ fstrans = __spl_pf_fstrans_check();
+ if (fstrans)
+ current->flags &= ~(__SPL_PF_FSTRANS);
+
+ /*
+ * When supported by the underlying file system, preferentially use
+ * the fallocate() callback to punch a hole in the requested range.
+ */
+ error = -spl_filp_fallocate(vp->v_file,
+ FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE,
+ bfp->l_start, bfp->l_len);
+
+ if (fstrans)
+ current->flags |= __SPL_PF_FSTRANS;
+
+ if (error == 0)
+ return (0);
+#endif
+
+#ifdef HAVE_INODE_TRUNCATE_RANGE
+ if (vp->v_file->f_dentry && vp->v_file->f_dentry->d_inode &&
+ vp->v_file->f_dentry->d_inode->i_op &&
+ vp->v_file->f_dentry->d_inode->i_op->truncate_range) {
+ off_t end = bfp->l_start + bfp->l_len;
+ /*
+ * Judging from the code in shmem_truncate_range(),
+ * it seems the kernel expects the end offset to be
+ * inclusive and aligned to the end of a page.
+ */
+ if (end % PAGE_SIZE != 0) {
+ end &= ~(off_t)(PAGE_SIZE - 1);
+ if (end <= bfp->l_start)
+ return (0);
+ }
+ --end;
+
+ vp->v_file->f_dentry->d_inode->i_op->truncate_range(
+ vp->v_file->f_dentry->d_inode, bfp->l_start, end);
+
+ return (0);
+ }
+#endif
+
+ return (error);
+}
+EXPORT_SYMBOL(vn_space);
+
+/* Function must be called while holding the vn_file_lock */
+static file_t *
+file_find(int fd, struct task_struct *task)
+{
+ file_t *fp;
+
+ list_for_each_entry(fp, &vn_file_list, f_list) {
+ if (fd == fp->f_fd && fp->f_task == task) {
+ ASSERT(atomic_read(&fp->f_ref) != 0);
+ return (fp);
+ }
+ }
+
+ return (NULL);
+} /* file_find() */
+
+file_t *
+vn_getf(int fd)
+{
+ struct kstat stat;
+ struct file *lfp;
+ file_t *fp;
+ vnode_t *vp;
+ int rc = 0;
+
+ if (fd < 0)
+ return (NULL);
+
+ /* Already open, just take an extra reference */
+ spin_lock(&vn_file_lock);
+
+ fp = file_find(fd, current);
+ if (fp) {
+ lfp = fget(fd);
+ fput(fp->f_file);
+ /*
+ * areleasef() can cause us to see a stale reference when
+ * userspace has reused a file descriptor before areleasef()
+ * has run. fput() the stale reference and replace it. We
+ * retain the original reference count such that the concurrent
+ * areleasef() will decrement its reference and terminate.
+ */
+ if (lfp != fp->f_file) {
+ fp->f_file = lfp;
+ fp->f_vnode->v_file = lfp;
+ }
+ atomic_inc(&fp->f_ref);
+ spin_unlock(&vn_file_lock);
+ return (fp);
+ }
+
+ spin_unlock(&vn_file_lock);
+
+ /* File was not yet opened, create the object and set it up */
+ fp = kmem_cache_alloc(vn_file_cache, KM_SLEEP);
+ if (fp == NULL)
+ goto out;
+
+ mutex_enter(&fp->f_lock);
+
+ fp->f_fd = fd;
+ fp->f_task = current;
+ fp->f_offset = 0;
+ atomic_inc(&fp->f_ref);
+
+ lfp = fget(fd);
+ if (lfp == NULL)
+ goto out_mutex;
+
+ vp = vn_alloc(KM_SLEEP);
+ if (vp == NULL)
+ goto out_fget;
+
+#if defined(HAVE_4ARGS_VFS_GETATTR)
+ rc = vfs_getattr(&lfp->f_path, &stat, STATX_TYPE,
+ AT_STATX_SYNC_AS_STAT);
+#elif defined(HAVE_2ARGS_VFS_GETATTR)
+ rc = vfs_getattr(&lfp->f_path, &stat);
+#else
+ rc = vfs_getattr(lfp->f_path.mnt, lfp->f_dentry, &stat);
+#endif
+ if (rc)
+ goto out_vnode;
+
+ mutex_enter(&vp->v_lock);
+ vp->v_type = vn_mode_to_vtype(stat.mode);
+ vp->v_file = lfp;
+ mutex_exit(&vp->v_lock);
+
+ fp->f_vnode = vp;
+ fp->f_file = lfp;
+
+ /* Put it on the tracking list */
+ spin_lock(&vn_file_lock);
+ list_add(&fp->f_list, &vn_file_list);
+ spin_unlock(&vn_file_lock);
+
+ mutex_exit(&fp->f_lock);
+ return (fp);
+
+out_vnode:
+ vn_free(vp);
+out_fget:
+ fput(lfp);
+out_mutex:
+ mutex_exit(&fp->f_lock);
+ kmem_cache_free(vn_file_cache, fp);
+out:
+ return (NULL);
+} /* getf() */
+EXPORT_SYMBOL(getf);
+
+static void releasef_locked(file_t *fp)
+{
+ ASSERT(fp->f_file);
+ ASSERT(fp->f_vnode);
+
+ /* Unlinked from list, no refs, safe to free outside mutex */
+ fput(fp->f_file);
+ vn_free(fp->f_vnode);
+
+ kmem_cache_free(vn_file_cache, fp);
+}
+
+void
+vn_releasef(int fd)
+{
+ areleasef(fd, P_FINFO(current));
+}
+EXPORT_SYMBOL(releasef);
+
+void
+vn_areleasef(int fd, uf_info_t *fip)
+{
+ file_t *fp;
+ struct task_struct *task = (struct task_struct *)fip;
+
+ if (fd < 0)
+ return;
+
+ spin_lock(&vn_file_lock);
+ fp = file_find(fd, task);
+ if (fp) {
+ atomic_dec(&fp->f_ref);
+ if (atomic_read(&fp->f_ref) > 0) {
+ spin_unlock(&vn_file_lock);
+ return;
+ }
+
+ list_del(&fp->f_list);
+ releasef_locked(fp);
+ }
+ spin_unlock(&vn_file_lock);
+} /* releasef() */
+EXPORT_SYMBOL(areleasef);
+
+
+static void
+#ifdef HAVE_SET_FS_PWD_WITH_CONST
+vn_set_fs_pwd(struct fs_struct *fs, const struct path *path)
+#else
+vn_set_fs_pwd(struct fs_struct *fs, struct path *path)
+#endif /* HAVE_SET_FS_PWD_WITH_CONST */
+{
+ struct path old_pwd;
+
+#ifdef HAVE_FS_STRUCT_SPINLOCK
+ spin_lock(&fs->lock);
+ old_pwd = fs->pwd;
+ fs->pwd = *path;
+ path_get(path);
+ spin_unlock(&fs->lock);
+#else
+ write_lock(&fs->lock);
+ old_pwd = fs->pwd;
+ fs->pwd = *path;
+ path_get(path);
+ write_unlock(&fs->lock);
+#endif /* HAVE_FS_STRUCT_SPINLOCK */
+
+ if (old_pwd.dentry)
+ path_put(&old_pwd);
+}
+
+int
+vn_set_pwd(const char *filename)
+{
+ struct path path;
+ mm_segment_t saved_fs;
+ int rc;
+
+ /*
+ * user_path_dir() and __user_walk() both expect 'filename' to be
+ * a user space address, so we must briefly increase the data segment
+ * size to ensure strncpy_from_user() does not fail with -EFAULT.
+ */
+ saved_fs = get_fs();
+ set_fs(get_ds());
+
+ rc = user_path_dir(filename, &path);
+ if (rc)
+ goto out;
+
+ rc = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_ACCESS);
+ if (rc)
+ goto dput_and_out;
+
+ vn_set_fs_pwd(current->fs, &path);
+
+dput_and_out:
+ path_put(&path);
+out:
+ set_fs(saved_fs);
+
+ return (-rc);
+} /* vn_set_pwd() */
+EXPORT_SYMBOL(vn_set_pwd);
+
+static int
+vn_cache_constructor(void *buf, void *cdrarg, int kmflags)
+{
+ struct vnode *vp = buf;
+
+ mutex_init(&vp->v_lock, NULL, MUTEX_DEFAULT, NULL);
+
+ return (0);
+} /* vn_cache_constructor() */
+
+static void
+vn_cache_destructor(void *buf, void *cdrarg)
+{
+ struct vnode *vp = buf;
+
+ mutex_destroy(&vp->v_lock);
+} /* vn_cache_destructor() */
+
+static int
+vn_file_cache_constructor(void *buf, void *cdrarg, int kmflags)
+{
+ file_t *fp = buf;
+
+ atomic_set(&fp->f_ref, 0);
+ mutex_init(&fp->f_lock, NULL, MUTEX_DEFAULT, NULL);
+ INIT_LIST_HEAD(&fp->f_list);
+
+ return (0);
+} /* vn_file_cache_constructor() */
+
+static void
+vn_file_cache_destructor(void *buf, void *cdrarg)
+{
+ file_t *fp = buf;
+
+ mutex_destroy(&fp->f_lock);
+} /* vn_file_cache_destructor() */
+
+int
+spl_vn_init(void)
+{
+ vn_cache = kmem_cache_create("spl_vn_cache",
+ sizeof (struct vnode), 64, vn_cache_constructor,
+ vn_cache_destructor, NULL, NULL, NULL, 0);
+
+ vn_file_cache = kmem_cache_create("spl_vn_file_cache",
+ sizeof (file_t), 64, vn_file_cache_constructor,
+ vn_file_cache_destructor, NULL, NULL, NULL, 0);
+
+ return (0);
+} /* spl_vn_init() */
+
+void
+spl_vn_fini(void)
+{
+ file_t *fp, *next_fp;
+ int leaked = 0;
+
+ spin_lock(&vn_file_lock);
+
+ list_for_each_entry_safe(fp, next_fp, &vn_file_list, f_list) {
+ list_del(&fp->f_list);
+ releasef_locked(fp);
+ leaked++;
+ }
+
+ spin_unlock(&vn_file_lock);
+
+ if (leaked > 0)
+ printk(KERN_WARNING "WARNING: %d vnode files leaked\n", leaked);
+
+ kmem_cache_destroy(vn_file_cache);
+ kmem_cache_destroy(vn_cache);
+} /* spl_vn_fini() */
diff --git a/module/spl/spl-xdr.c b/module/spl/spl-xdr.c
new file mode 100644
index 000000000..2cc3e2a03
--- /dev/null
+++ b/module/spl/spl-xdr.c
@@ -0,0 +1,515 @@
+/*
+ * Copyright (c) 2008-2010 Sun Microsystems, Inc.
+ * Written by Ricardo Correia <[email protected]>
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Solaris Porting Layer (SPL) XDR Implementation.
+ */
+
+#include <linux/string.h>
+#include <sys/kmem.h>
+#include <sys/debug.h>
+#include <sys/types.h>
+#include <sys/sysmacros.h>
+#include <rpc/xdr.h>
+
+/*
+ * SPL's XDR mem implementation.
+ *
+ * This is used by libnvpair to serialize/deserialize the name-value pair data
+ * structures into byte arrays in a well-defined and portable manner.
+ *
+ * These data structures are used by the DMU/ZFS to flexibly manipulate various
+ * information in memory and later serialize it/deserialize it to disk.
+ * Examples of usages include the pool configuration, lists of pool and dataset
+ * properties, etc.
+ *
+ * Reference documentation for the XDR representation and XDR operations can be
+ * found in RFC 1832 and xdr(3), respectively.
+ *
+ * === Implementation shortcomings ===
+ *
+ * It is assumed that the following C types have the following sizes:
+ *
+ * char/unsigned char: 1 byte
+ * short/unsigned short: 2 bytes
+ * int/unsigned int: 4 bytes
+ * longlong_t/u_longlong_t: 8 bytes
+ *
+ * The C standard allows these types to be larger (and in the case of ints,
+ * shorter), so if that is the case on some compiler/architecture, the build
+ * will fail (on purpose).
+ *
+ * If someone wants to fix the code to work properly in such environments, then:
+ *
+ * 1) Preconditions should be added to xdrmem_enc functions to make sure the
+ * caller doesn't pass arguments which exceed the expected range.
+ * 2) Functions which take signed integers should be changed to properly do
+ * sign extension.
+ * 3) For ints with fewer than 32 bits, well... I suspect you'll have bigger
+ * problems than this implementation.
+ *
+ * It is also assumed that:
+ *
+ * 1) Chars have 8 bits.
+ * 2) We can always do 32-bit-aligned int memory accesses and byte-aligned
+ * memcpy, memset and memcmp.
+ * 3) Arrays passed to xdr_array() are packed and the compiler/architecture
+ * supports element-sized-aligned memory accesses.
+ * 4) Negative integers are natively stored in two's complement binary
+ * representation.
+ *
+ * No checks are done for the 4 assumptions above, though.
+ *
+ * === Caller expectations ===
+ *
+ * Existing documentation does not describe the semantics of XDR operations very
+ * well. Therefore, some assumptions about failure semantics will be made and
+ * will be described below:
+ *
+ * 1) If any encoding operation fails (e.g., due to lack of buffer space), the
+ * stream should be considered valid only up to the encoding operation
+ * preceding the one that first failed. However, the stream size as returned
+ * by xdr_control() cannot be considered to be strictly correct (it may be
+ * bigger).
+ *
+ * Putting it another way, if there is an encoding failure it's undefined
+ * whether anything is added to the stream in that operation and therefore
+ * neither xdr_control() nor future encoding operations on the same stream can
+ * be relied upon to produce correct results.
+ *
+ * 2) If a decoding operation fails, it's undefined whether anything will be
+ * decoded into passed buffers/pointers during that operation, or what the
+ * values on those buffers will look like.
+ *
+ * Future decoding operations on the same stream will also have similar
+ * undefined behavior.
+ *
+ * 3) When the first decoding operation fails it is OK to trust the results of
+ * previous decoding operations on the same stream, as long as the caller
+ * expects a failure to be possible (e.g. due to end-of-stream).
+ *
+ * However, this is highly discouraged because the caller should know the
+ * stream size and should be coded to expect any decoding failure to be data
+ * corruption due to hardware, accidental or even malicious causes, which should
+ * be handled gracefully in all cases.
+ *
+ * In very rare situations where there are strong reasons to believe the data
+ * can be trusted to be valid and non-tampered with, then the caller may assume
+ * a decoding failure to be a bug (e.g. due to mismatched data types) and may
+ * fail non-gracefully.
+ *
+ * 4) Non-zero padding bytes will cause the decoding operation to fail.
+ *
+ * 5) Zero bytes on string types will also cause the decoding operation to fail.
+ *
+ * 6) It is assumed that either the pointer to the stream buffer given by the
+ * caller is 32-bit aligned or the architecture supports non-32-bit-aligned int
+ * memory accesses.
+ *
+ * 7) The stream buffer and encoding/decoding buffers/ptrs should not overlap.
+ *
+ * 8) If a caller passes pointers to non-kernel memory (e.g., pointers to user
+ * space or MMIO space), the computer may explode.
+ */
+
+static struct xdr_ops xdrmem_encode_ops;
+static struct xdr_ops xdrmem_decode_ops;
+
+typedef int bool_t;
+
+void
+xdrmem_create(XDR *xdrs, const caddr_t addr, const uint_t size,
+ const enum xdr_op op)
+{
+ switch (op) {
+ case XDR_ENCODE:
+ xdrs->x_ops = &xdrmem_encode_ops;
+ break;
+ case XDR_DECODE:
+ xdrs->x_ops = &xdrmem_decode_ops;
+ break;
+ default:
+ xdrs->x_ops = NULL; /* Let the caller know we failed */
+ return;
+ }
+
+ xdrs->x_op = op;
+ xdrs->x_addr = addr;
+ xdrs->x_addr_end = addr + size;
+
+ if (xdrs->x_addr_end < xdrs->x_addr) {
+ xdrs->x_ops = NULL;
+ }
+}
+EXPORT_SYMBOL(xdrmem_create);
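+
+/*
+ * A minimal encode/decode round trip over a memory stream (a sketch; the
+ * buffer size and value are arbitrary, and callers normally reach these
+ * routines through the xdr_*() wrappers rather than x_ops directly):
+ *
+ * char buf[16];
+ * unsigned out = 42, in = 0;
+ * XDR xe, xd;
+ *
+ * xdrmem_create(&xe, buf, sizeof (buf), XDR_ENCODE);
+ * (void) xe.x_ops->xdr_u_int(&xe, &out);
+ *
+ * xdrmem_create(&xd, buf, sizeof (buf), XDR_DECODE);
+ * (void) xd.x_ops->xdr_u_int(&xd, &in);
+ *
+ * in now holds 42.
+ */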
+
+static bool_t
+xdrmem_control(XDR *xdrs, int req, void *info)
+{
+ struct xdr_bytesrec *rec = (struct xdr_bytesrec *)info;
+
+ if (req != XDR_GET_BYTES_AVAIL)
+ return (FALSE);
+
+ rec->xc_is_last_record = TRUE; /* always TRUE in xdrmem streams */
+ rec->xc_num_avail = xdrs->x_addr_end - xdrs->x_addr;
+
+ return (TRUE);
+}
+
+static bool_t
+xdrmem_enc_bytes(XDR *xdrs, caddr_t cp, const uint_t cnt)
+{
+ uint_t size = roundup(cnt, 4);
+ uint_t pad;
+
+ if (size < cnt)
+ return (FALSE); /* Integer overflow */
+
+ if (xdrs->x_addr > xdrs->x_addr_end)
+ return (FALSE);
+
+ if (xdrs->x_addr_end - xdrs->x_addr < size)
+ return (FALSE);
+
+ memcpy(xdrs->x_addr, cp, cnt);
+
+ xdrs->x_addr += cnt;
+
+ pad = size - cnt;
+ if (pad > 0) {
+ memset(xdrs->x_addr, 0, pad);
+ xdrs->x_addr += pad;
+ }
+
+ return (TRUE);
+}
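+
+/*
+ * For example, encoding cnt == 5 bytes advances the stream by
+ * roundup(5, 4) == 8 bytes: the 5 data bytes followed by 3 zero pad bytes.
+ */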
+
+static bool_t
+xdrmem_dec_bytes(XDR *xdrs, caddr_t cp, const uint_t cnt)
+{
+ static uint32_t zero = 0;
+ uint_t size = roundup(cnt, 4);
+ uint_t pad;
+
+ if (size < cnt)
+ return (FALSE); /* Integer overflow */
+
+ if (xdrs->x_addr > xdrs->x_addr_end)
+ return (FALSE);
+
+ if (xdrs->x_addr_end - xdrs->x_addr < size)
+ return (FALSE);
+
+ memcpy(cp, xdrs->x_addr, cnt);
+ xdrs->x_addr += cnt;
+
+ pad = size - cnt;
+ if (pad > 0) {
+ /* An inverted memchr() would be useful here... */
+ if (memcmp(&zero, xdrs->x_addr, pad) != 0)
+ return (FALSE);
+
+ xdrs->x_addr += pad;
+ }
+
+ return (TRUE);
+}
+
+static bool_t
+xdrmem_enc_uint32(XDR *xdrs, uint32_t val)
+{
+ if (xdrs->x_addr + sizeof (uint32_t) > xdrs->x_addr_end)
+ return (FALSE);
+
+ *((uint32_t *)xdrs->x_addr) = cpu_to_be32(val);
+
+ xdrs->x_addr += sizeof (uint32_t);
+
+ return (TRUE);
+}
+
+static bool_t
+xdrmem_dec_uint32(XDR *xdrs, uint32_t *val)
+{
+ if (xdrs->x_addr + sizeof (uint32_t) > xdrs->x_addr_end)
+ return (FALSE);
+
+ *val = be32_to_cpu(*((uint32_t *)xdrs->x_addr));
+
+ xdrs->x_addr += sizeof (uint32_t);
+
+ return (TRUE);
+}
+
+static bool_t
+xdrmem_enc_char(XDR *xdrs, char *cp)
+{
+ uint32_t val;
+
+ BUILD_BUG_ON(sizeof (char) != 1);
+ val = *((unsigned char *) cp);
+
+ return (xdrmem_enc_uint32(xdrs, val));
+}
+
+static bool_t
+xdrmem_dec_char(XDR *xdrs, char *cp)
+{
+ uint32_t val;
+
+ BUILD_BUG_ON(sizeof (char) != 1);
+
+ if (!xdrmem_dec_uint32(xdrs, &val))
+ return (FALSE);
+
+ /*
+ * If any of the 3 other bytes are non-zero then val will be greater
+ * than 0xff and we fail because according to the RFC, this block does
+ * not have a char encoded in it.
+ */
+ if (val > 0xff)
+ return (FALSE);
+
+ *((unsigned char *) cp) = val;
+
+ return (TRUE);
+}
+
+static bool_t
+xdrmem_enc_ushort(XDR *xdrs, unsigned short *usp)
+{
+ BUILD_BUG_ON(sizeof (unsigned short) != 2);
+
+ return (xdrmem_enc_uint32(xdrs, *usp));
+}
+
+static bool_t
+xdrmem_dec_ushort(XDR *xdrs, unsigned short *usp)
+{
+ uint32_t val;
+
+ BUILD_BUG_ON(sizeof (unsigned short) != 2);
+
+ if (!xdrmem_dec_uint32(xdrs, &val))
+ return (FALSE);
+
+ /*
+ * Short ints are not in the RFC, but we assume similar logic as in
+ * xdrmem_dec_char().
+ */
+ if (val > 0xffff)
+ return (FALSE);
+
+ *usp = val;
+
+ return (TRUE);
+}
+
+static bool_t
+xdrmem_enc_uint(XDR *xdrs, unsigned *up)
+{
+ BUILD_BUG_ON(sizeof (unsigned) != 4);
+
+ return (xdrmem_enc_uint32(xdrs, *up));
+}
+
+static bool_t
+xdrmem_dec_uint(XDR *xdrs, unsigned *up)
+{
+ BUILD_BUG_ON(sizeof (unsigned) != 4);
+
+ return (xdrmem_dec_uint32(xdrs, (uint32_t *)up));
+}
+
+static bool_t
+xdrmem_enc_ulonglong(XDR *xdrs, u_longlong_t *ullp)
+{
+ BUILD_BUG_ON(sizeof (u_longlong_t) != 8);
+
+ if (!xdrmem_enc_uint32(xdrs, *ullp >> 32))
+ return (FALSE);
+
+ return (xdrmem_enc_uint32(xdrs, *ullp & 0xffffffff));
+}
+
+static bool_t
+xdrmem_dec_ulonglong(XDR *xdrs, u_longlong_t *ullp)
+{
+ uint32_t low, high;
+
+ BUILD_BUG_ON(sizeof (u_longlong_t) != 8);
+
+ if (!xdrmem_dec_uint32(xdrs, &high))
+ return (FALSE);
+ if (!xdrmem_dec_uint32(xdrs, &low))
+ return (FALSE);
+
+ *ullp = ((u_longlong_t)high << 32) | low;
+
+ return (TRUE);
+}
+
+static bool_t
+xdr_enc_array(XDR *xdrs, caddr_t *arrp, uint_t *sizep, const uint_t maxsize,
+ const uint_t elsize, const xdrproc_t elproc)
+{
+ uint_t i;
+ caddr_t addr = *arrp;
+
+ if (*sizep > maxsize || *sizep > UINT_MAX / elsize)
+ return (FALSE);
+
+ if (!xdrmem_enc_uint(xdrs, sizep))
+ return (FALSE);
+
+ for (i = 0; i < *sizep; i++) {
+ if (!elproc(xdrs, addr))
+ return (FALSE);
+ addr += elsize;
+ }
+
+ return (TRUE);
+}
+
+static bool_t
+xdr_dec_array(XDR *xdrs, caddr_t *arrp, uint_t *sizep, const uint_t maxsize,
+ const uint_t elsize, const xdrproc_t elproc)
+{
+ uint_t i, size;
+ bool_t alloc = FALSE;
+ caddr_t addr;
+
+ if (!xdrmem_dec_uint(xdrs, sizep))
+ return (FALSE);
+
+ size = *sizep;
+
+ if (size > maxsize || size > UINT_MAX / elsize)
+ return (FALSE);
+
+ /*
+ * The Solaris man page says: "If *arrp is NULL when decoding,
+ * xdr_array() allocates memory and *arrp points to it".
+ */
+ if (*arrp == NULL) {
+ BUILD_BUG_ON(sizeof (uint_t) > sizeof (size_t));
+
+ *arrp = kmem_alloc(size * elsize, KM_NOSLEEP);
+ if (*arrp == NULL)
+ return (FALSE);
+
+ alloc = TRUE;
+ }
+
+ addr = *arrp;
+
+ for (i = 0; i < size; i++) {
+ if (!elproc(xdrs, addr)) {
+ if (alloc)
+ kmem_free(*arrp, size * elsize);
+ return (FALSE);
+ }
+ addr += elsize;
+ }
+
+ return (TRUE);
+}
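+
+/*
+ * For example, decoding an array of unsigned ints into an unallocated
+ * destination (a sketch; the maximum element count of 16 is arbitrary and
+ * callers reach this routine through xdrs->x_ops->xdr_array()):
+ *
+ * caddr_t arr = NULL;
+ * uint_t n;
+ *
+ * if (xdrs->x_ops->xdr_array(xdrs, &arr, &n, 16,
+ *     sizeof (unsigned), (xdrproc_t)xdrmem_dec_uint))
+ *     arr points to n freshly kmem_alloc()'d, decoded values.
+ */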
+
+static bool_t
+xdr_enc_string(XDR *xdrs, char **sp, const uint_t maxsize)
+{
+ size_t slen = strlen(*sp);
+ uint_t len;
+
+ if (slen > maxsize)
+ return (FALSE);
+
+ len = slen;
+
+ if (!xdrmem_enc_uint(xdrs, &len))
+ return (FALSE);
+
+ return (xdrmem_enc_bytes(xdrs, *sp, len));
+}
+
+static bool_t
+xdr_dec_string(XDR *xdrs, char **sp, const uint_t maxsize)
+{
+ uint_t size;
+ bool_t alloc = FALSE;
+
+ if (!xdrmem_dec_uint(xdrs, &size))
+ return (FALSE);
+
+ if (size > maxsize || size > UINT_MAX - 1)
+ return (FALSE);
+
+ /*
+ * Solaris man page: "If *sp is NULL when decoding, xdr_string()
+ * allocates memory and *sp points to it".
+ */
+ if (*sp == NULL) {
+ BUILD_BUG_ON(sizeof (uint_t) > sizeof (size_t));
+
+ *sp = kmem_alloc(size + 1, KM_NOSLEEP);
+ if (*sp == NULL)
+ return (FALSE);
+
+ alloc = TRUE;
+ }
+
+ if (!xdrmem_dec_bytes(xdrs, *sp, size))
+ goto fail;
+
+ if (memchr(*sp, 0, size) != NULL)
+ goto fail;
+
+ (*sp)[size] = '\0';
+
+ return (TRUE);
+
+fail:
+ if (alloc)
+ kmem_free(*sp, size + 1);
+
+ return (FALSE);
+}
+
+static struct xdr_ops xdrmem_encode_ops = {
+ .xdr_control = xdrmem_control,
+ .xdr_char = xdrmem_enc_char,
+ .xdr_u_short = xdrmem_enc_ushort,
+ .xdr_u_int = xdrmem_enc_uint,
+ .xdr_u_longlong_t = xdrmem_enc_ulonglong,
+ .xdr_opaque = xdrmem_enc_bytes,
+ .xdr_string = xdr_enc_string,
+ .xdr_array = xdr_enc_array
+};
+
+static struct xdr_ops xdrmem_decode_ops = {
+ .xdr_control = xdrmem_control,
+ .xdr_char = xdrmem_dec_char,
+ .xdr_u_short = xdrmem_dec_ushort,
+ .xdr_u_int = xdrmem_dec_uint,
+ .xdr_u_longlong_t = xdrmem_dec_ulonglong,
+ .xdr_opaque = xdrmem_dec_bytes,
+ .xdr_string = xdr_dec_string,
+ .xdr_array = xdr_dec_array
+};
diff --git a/module/spl/spl-zlib.c b/module/spl/spl-zlib.c
new file mode 100644
index 000000000..229e6a44b
--- /dev/null
+++ b/module/spl/spl-zlib.c
@@ -0,0 +1,217 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <[email protected]>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ *
+ *
+ * z_compress_level/z_uncompress are nearly identical copies of the
+ * compress2/uncompress functions provided by the official zlib package
+ * available at http://zlib.net/. The only changes made were to slightly
+ * adapt the functions called to match the Linux kernel implementation
+ * of zlib. The full zlib license follows:
+ *
+ * zlib.h -- interface of the 'zlib' general purpose compression library
+ * version 1.2.5, April 19th, 2010
+ *
+ * Copyright (C) 1995-2010 Jean-loup Gailly and Mark Adler
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ *
+ * Jean-loup Gailly
+ * Mark Adler
+ */
+
+
+#include <sys/kmem.h>
+#include <sys/kmem_cache.h>
+#include <sys/zmod.h>
+
+static spl_kmem_cache_t *zlib_workspace_cache;
+
+/*
+ * A kmem_cache is used for the zlib workspaces to avoid having to vmalloc
+ * and vfree for every call. Using a kmem_cache also has the advantage
+ * of improving the odds that the memory used will be local to this CPU.
+ * To further improve things, it might be wise to create a dedicated per-cpu
+ * workspace for use. This would take some additional care because we then
+ * must disable preemption around the critical section, and verify that
+ * zlib_deflate* and zlib_inflate* never internally call schedule().
+ */
+static void *
+zlib_workspace_alloc(int flags)
+{
+ return (kmem_cache_alloc(zlib_workspace_cache, flags & ~(__GFP_FS)));
+}
+
+static void
+zlib_workspace_free(void *workspace)
+{
+ kmem_cache_free(zlib_workspace_cache, workspace);
+}
+
+/*
+ * Compresses the source buffer into the destination buffer. The level
+ * parameter has the same meaning as in deflateInit. sourceLen is the byte
+ * length of the source buffer. Upon entry, destLen is the total size of the
+ * destination buffer, which must be at least 0.1% larger than sourceLen plus
+ * 12 bytes. Upon exit, destLen is the actual size of the compressed buffer.
+ *
+ * compress2 returns Z_OK if success, Z_MEM_ERROR if there was not enough
+ * memory, Z_BUF_ERROR if there was not enough room in the output buffer,
+ * Z_STREAM_ERROR if the level parameter is invalid.
+ */
+int
+z_compress_level(void *dest, size_t *destLen, const void *source,
+ size_t sourceLen, int level)
+{
+ z_stream stream;
+ int err;
+
+ stream.next_in = (Byte *)source;
+ stream.avail_in = (uInt)sourceLen;
+ stream.next_out = dest;
+ stream.avail_out = (uInt)*destLen;
+
+ if ((size_t)stream.avail_out != *destLen)
+ return (Z_BUF_ERROR);
+
+ stream.workspace = zlib_workspace_alloc(KM_SLEEP);
+ if (!stream.workspace)
+ return (Z_MEM_ERROR);
+
+ err = zlib_deflateInit(&stream, level);
+ if (err != Z_OK) {
+ zlib_workspace_free(stream.workspace);
+ return (err);
+ }
+
+ err = zlib_deflate(&stream, Z_FINISH);
+ if (err != Z_STREAM_END) {
+ zlib_deflateEnd(&stream);
+ zlib_workspace_free(stream.workspace);
+ return (err == Z_OK ? Z_BUF_ERROR : err);
+ }
+ *destLen = stream.total_out;
+
+ err = zlib_deflateEnd(&stream);
+ zlib_workspace_free(stream.workspace);
+
+ return (err);
+}
+EXPORT_SYMBOL(z_compress_level);
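+
+/*
+ * A minimal compression sketch using the sizing rule above (src and slen
+ * are assumed to exist; error handling is omitted):
+ *
+ * size_t dlen = slen + (slen / 1000) + 12;
+ * void *dst = kmem_alloc(dlen, KM_SLEEP);
+ * int rc = z_compress_level(dst, &dlen, src, slen, 6);
+ *
+ * On success dlen holds the compressed size; decompressing later with
+ * z_uncompress() additionally requires the original uncompressed length
+ * to size the destination buffer.
+ */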
+
+/*
+ * Decompresses the source buffer into the destination buffer. sourceLen is
+ * the byte length of the source buffer. Upon entry, destLen is the total
+ * size of the destination buffer, which must be large enough to hold the
+ * entire uncompressed data. (The size of the uncompressed data must have
+ * been saved previously by the compressor and transmitted to the decompressor
+ * by some mechanism outside the scope of this compression library.)
+ * Upon exit, destLen is the actual size of the uncompressed data.
+ * This function can be used to decompress a whole file at once if the
+ * input file is mmap'ed.
+ *
+ * uncompress returns Z_OK if success, Z_MEM_ERROR if there was not
+ * enough memory, Z_BUF_ERROR if there was not enough room in the output
+ * buffer, or Z_DATA_ERROR if the input data was corrupted.
+ */
+int
+z_uncompress(void *dest, size_t *destLen, const void *source, size_t sourceLen)
+{
+ z_stream stream;
+ int err;
+
+ stream.next_in = (Byte *)source;
+ stream.avail_in = (uInt)sourceLen;
+ stream.next_out = dest;
+ stream.avail_out = (uInt)*destLen;
+
+ if ((size_t)stream.avail_out != *destLen)
+ return (Z_BUF_ERROR);
+
+ stream.workspace = zlib_workspace_alloc(KM_SLEEP);
+ if (!stream.workspace)
+ return (Z_MEM_ERROR);
+
+ err = zlib_inflateInit(&stream);
+ if (err != Z_OK) {
+ zlib_workspace_free(stream.workspace);
+ return (err);
+ }
+
+ err = zlib_inflate(&stream, Z_FINISH);
+ if (err != Z_STREAM_END) {
+ zlib_inflateEnd(&stream);
+ zlib_workspace_free(stream.workspace);
+
+ if (err == Z_NEED_DICT ||
+ (err == Z_BUF_ERROR && stream.avail_in == 0))
+ return (Z_DATA_ERROR);
+
+ return (err);
+ }
+ *destLen = stream.total_out;
+
+ err = zlib_inflateEnd(&stream);
+ zlib_workspace_free(stream.workspace);
+
+ return (err);
+}
+EXPORT_SYMBOL(z_uncompress);
+
+int
+spl_zlib_init(void)
+{
+ int size;
+
+ size = MAX(spl_zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL),
+ zlib_inflate_workspacesize());
+
+ zlib_workspace_cache = kmem_cache_create(
+ "spl_zlib_workspace_cache",
+ size, 0, NULL, NULL, NULL, NULL, NULL,
+ KMC_VMEM | KMC_NOEMERGENCY);
+ if (!zlib_workspace_cache)
+ return (1);
+
+ return (0);
+}
+
+void
+spl_zlib_fini(void)
+{
+ kmem_cache_destroy(zlib_workspace_cache);
+ zlib_workspace_cache = NULL;
+}