-rw-r--r--  config/kernel-ctl-table-name.m4                 18
-rw-r--r--  config/kernel-fallocate-pax.m4                  19
-rw-r--r--  config/kernel-group-info.m4                     21
-rw-r--r--  config/kernel-inode-lock.m4                     23
-rw-r--r--  config/kernel-kmem-cache.m4                     72
-rw-r--r--  config/kernel-kmem.m4                           58
-rw-r--r--  config/kernel-kuidgid.m4                        28
-rw-r--r--  config/kernel-pde-data.m4                       17
-rw-r--r--  config/kernel-rw.m4                             57
-rw-r--r--  config/kernel-rwsem.m4                          75
-rw-r--r--  config/kernel-sched.m4                          56
-rw-r--r--  config/kernel-set-fs-pwd.m4                     39
-rw-r--r--  config/kernel-shrinker.m4                      125
-rw-r--r--  config/kernel-spinlock.m4                       24
-rw-r--r--  config/kernel-timer.m4                          32
-rw-r--r--  config/kernel-trim-unused-symbols.m4            19
-rw-r--r--  config/kernel-urange-sleep.m4                   21
-rw-r--r--  config/kernel-vfs-fsync.m4                      17
-rw-r--r--  config/kernel-vfs-getattr.m4                    62
-rw-r--r--  config/kernel-wait.m4                           76
-rw-r--r--  config/kernel-zlib.m4                           63
-rw-r--r--  include/spl/rpc/xdr.h                          156
-rw-r--r--  include/spl/sys/acl.h                          119
-rw-r--r--  include/spl/sys/atomic.h                        79
-rw-r--r--  include/spl/sys/byteorder.h                     78
-rw-r--r--  include/spl/sys/callb.h                         54
-rw-r--r--  include/spl/sys/callo.h                         52
-rw-r--r--  include/spl/sys/cmn_err.h                       42
-rw-r--r--  include/spl/sys/condvar.h                       80
-rw-r--r--  include/spl/sys/console.h                       44
-rw-r--r--  include/spl/sys/cred.h                          75
-rw-r--r--  include/spl/sys/ctype.h                         30
-rw-r--r--  include/spl/sys/debug.h                        131
-rw-r--r--  include/spl/sys/disp.h                          34
-rw-r--r--  include/spl/sys/dkio.h                          40
-rw-r--r--  include/spl/sys/dkioc_free_util.h               58
-rw-r--r--  include/spl/sys/fcntl.h                         37
-rw-r--r--  include/spl/sys/file.h                          52
-rw-r--r--  include/spl/sys/inttypes.h                      28
-rw-r--r--  include/spl/sys/isa_defs.h                     229
-rw-r--r--  include/spl/sys/kmem.h                         185
-rw-r--r--  include/spl/sys/kmem_cache.h                   240
-rw-r--r--  include/spl/sys/kobj.h                          42
-rw-r--r--  include/spl/sys/kstat.h                        208
-rw-r--r--  include/spl/sys/list.h                         208
-rw-r--r--  include/spl/sys/mode.h                          32
-rw-r--r--  include/spl/sys/mutex.h                        184
-rw-r--r--  include/spl/sys/param.h                         36
-rw-r--r--  include/spl/sys/proc.h                          35
-rw-r--r--  include/spl/sys/processor.h                     32
-rw-r--r--  include/spl/sys/random.h                        40
-rw-r--r--  include/spl/sys/rwlock.h                       273
-rw-r--r--  include/spl/sys/shrinker.h                     209
-rw-r--r--  include/spl/sys/sid.h                           61
-rw-r--r--  include/spl/sys/signal.h                        55
-rw-r--r--  include/spl/sys/stat.h                          30
-rw-r--r--  include/spl/sys/strings.h                       31
-rw-r--r--  include/spl/sys/sunddi.h                        58
-rw-r--r--  include/spl/sys/sysmacros.h                    228
-rw-r--r--  include/spl/sys/systeminfo.h                    36
-rw-r--r--  include/spl/sys/taskq.h                        163
-rw-r--r--  include/spl/sys/thread.h                        69
-rw-r--r--  include/spl/sys/time.h                          82
-rw-r--r--  include/spl/sys/timer.h                         75
-rw-r--r--  include/spl/sys/tsd.h                           46
-rw-r--r--  include/spl/sys/types.h                         70
-rw-r--r--  include/spl/sys/types32.h                       35
-rw-r--r--  include/spl/sys/uio.h                          106
-rw-r--r--  include/spl/sys/user.h                          42
-rw-r--r--  include/spl/sys/vfs.h                           51
-rw-r--r--  include/spl/sys/vmem.h                         109
-rw-r--r--  include/spl/sys/vmsystm.h                       84
-rw-r--r--  include/spl/sys/vnode.h                        204
-rw-r--r--  include/spl/sys/wait.h                          55
-rw-r--r--  include/spl/sys/zmod.h                          78
-rw-r--r--  include/spl/sys/zone.h                          36
-rw-r--r--  man/man5/spl-module-parameters.5               357
-rw-r--r--  module/spl/THIRDPARTYLICENSE.gplv2             339
-rw-r--r--  module/spl/THIRDPARTYLICENSE.gplv2.descrip       1
-rw-r--r--  module/spl/spl-atomic.c                         36
-rw-r--r--  module/spl/spl-condvar.c                       410
-rw-r--r--  module/spl/spl-cred.c                          200
-rw-r--r--  module/spl/spl-err.c                           133
-rw-r--r--  module/spl/spl-generic.c                       775
-rw-r--r--  module/spl/spl-kmem-cache.c                   1769
-rw-r--r--  module/spl/spl-kmem.c                          567
-rw-r--r--  module/spl/spl-kobj.c                           86
-rw-r--r--  module/spl/spl-kstat.c                         733
-rw-r--r--  module/spl/spl-mutex.c                          30
-rw-r--r--  module/spl/spl-proc.c                          782
-rw-r--r--  module/spl/spl-rwlock.c                        114
-rw-r--r--  module/spl/spl-taskq.c                        1305
-rw-r--r--  module/spl/spl-thread.c                        160
-rw-r--r--  module/spl/spl-tsd.c                           720
-rw-r--r--  module/spl/spl-vmem.c                          135
-rw-r--r--  module/spl/spl-vnode.c                         779
-rw-r--r--  module/spl/spl-xdr.c                           515
-rw-r--r--  module/spl/spl-zlib.c                          217
98 files changed, 16031 insertions(+), 0 deletions(-)
diff --git a/config/kernel-ctl-table-name.m4 b/config/kernel-ctl-table-name.m4
new file mode 100644
index 000000000..8dd2e77cb
--- /dev/null
+++ b/config/kernel-ctl-table-name.m4
@@ -0,0 +1,18 @@
+dnl #
+dnl # 2.6.33 API change,
+dnl # Removed .ctl_name from struct ctl_table.
+dnl #
+AC_DEFUN([SPL_AC_CTL_NAME], [
+ AC_MSG_CHECKING([whether struct ctl_table has ctl_name])
+ SPL_LINUX_TRY_COMPILE([
+ #include <linux/sysctl.h>
+ ],[
+ struct ctl_table ctl __attribute__ ((unused));
+ ctl.ctl_name = 0;
+ ],[
+ AC_MSG_RESULT(yes)
+ AC_DEFINE(HAVE_CTL_NAME, 1, [struct ctl_table has ctl_name])
+ ],[
+ AC_MSG_RESULT(no)
+ ])
+])
diff --git a/config/kernel-fallocate-pax.m4 b/config/kernel-fallocate-pax.m4
new file mode 100644
index 000000000..ac75a4c8e
--- /dev/null
+++ b/config/kernel-fallocate-pax.m4
@@ -0,0 +1,19 @@
+dnl #
+dnl # PaX Linux 2.6.38 - 3.x API
+dnl #
+AC_DEFUN([SPL_AC_PAX_KERNEL_FILE_FALLOCATE], [
+ AC_MSG_CHECKING([whether fops->fallocate() exists])
+ SPL_LINUX_TRY_COMPILE([
+ #include <linux/fs.h>
+ ],[
+ long (*fallocate) (struct file *, int, loff_t, loff_t) = NULL;
+ struct file_operations_no_const fops __attribute__ ((unused)) = {
+ .fallocate = fallocate,
+ };
+ ],[
+ AC_MSG_RESULT(yes)
+ AC_DEFINE(HAVE_FILE_FALLOCATE, 1, [fops->fallocate() exists])
+ ],[
+ AC_MSG_RESULT(no)
+ ])
+])
diff --git a/config/kernel-group-info.m4 b/config/kernel-group-info.m4
new file mode 100644
index 000000000..4db2bba5c
--- /dev/null
+++ b/config/kernel-group-info.m4
@@ -0,0 +1,21 @@
+dnl #
+dnl # 4.9 API change
+dnl # group_info changed from a 2-D array via ->blocks to a 1-D array via ->gid
+dnl #
+AC_DEFUN([SPL_AC_GROUP_INFO_GID], [
+ AC_MSG_CHECKING([whether group_info->gid exists])
+ tmp_flags="$EXTRA_KCFLAGS"
+ EXTRA_KCFLAGS="-Werror"
+ SPL_LINUX_TRY_COMPILE([
+ #include <linux/cred.h>
+ ],[
+ struct group_info *gi = groups_alloc(1);
+ gi->gid[0] = KGIDT_INIT(0);
+ ],[
+ AC_MSG_RESULT(yes)
+ AC_DEFINE(HAVE_GROUP_INFO_GID, 1, [group_info->gid exists])
+ ],[
+ AC_MSG_RESULT(no)
+ ])
+ EXTRA_KCFLAGS="$tmp_flags"
+])
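For illustration, a consumer of the HAVE_GROUP_INFO_GID result might look like
the following minimal sketch; spl_get_gid() is a hypothetical helper name, not
SPL API, and a kernel providing kgid_t is assumed:

    #include <linux/cred.h>

    /* Fetch the i-th supplementary gid under either group_info layout. */
    static inline kgid_t
    spl_get_gid(const struct group_info *gi, int i)
    {
    #ifdef HAVE_GROUP_INFO_GID
            return (gi->gid[i]);            /* 4.9+: flat ->gid[] array */
    #else
            return (GROUP_AT(gi, i));       /* older: 2-D ->blocks via GROUP_AT() */
    #endif
    }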
diff --git a/config/kernel-inode-lock.m4 b/config/kernel-inode-lock.m4
new file mode 100644
index 000000000..2cc06a5ec
--- /dev/null
+++ b/config/kernel-inode-lock.m4
@@ -0,0 +1,23 @@
+dnl #
+dnl # 4.7 API change
+dnl # i_mutex is changed to i_rwsem. Instead of directly using
+dnl # i_mutex/i_rwsem, we should use inode_lock() and inode_lock_shared().
+dnl # We test inode_lock_shared() because inode_lock() was introduced earlier.
+dnl #
+AC_DEFUN([SPL_AC_INODE_LOCK], [
+ AC_MSG_CHECKING([whether inode_lock_shared() exists])
+ tmp_flags="$EXTRA_KCFLAGS"
+ EXTRA_KCFLAGS="-Werror"
+ SPL_LINUX_TRY_COMPILE([
+ #include <linux/fs.h>
+ ],[
+ struct inode *inode = NULL;
+ inode_lock_shared(inode);
+ ],[
+ AC_MSG_RESULT(yes)
+ AC_DEFINE(HAVE_INODE_LOCK_SHARED, 1, [yes])
+ ],[
+ AC_MSG_RESULT(no)
+ ])
+ EXTRA_KCFLAGS="$tmp_flags"
+])
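Code keyed on HAVE_INODE_LOCK_SHARED typically falls back to the exclusive
i_mutex on older kernels, since no shared variant existed there. A minimal
sketch, with spl_inode_lock_shared() as a hypothetical name:

    #include <linux/fs.h>

    static inline void
    spl_inode_lock_shared(struct inode *ip)
    {
    #ifdef HAVE_INODE_LOCK_SHARED
            inode_lock_shared(ip);          /* 4.7+: down_read(&ip->i_rwsem) */
    #else
            mutex_lock(&ip->i_mutex);       /* pre-4.7: exclusive i_mutex only */
    #endif
    }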
diff --git a/config/kernel-kmem-cache.m4 b/config/kernel-kmem-cache.m4
new file mode 100644
index 000000000..50a7fdb4b
--- /dev/null
+++ b/config/kernel-kmem-cache.m4
@@ -0,0 +1,72 @@
+dnl #
+dnl # 2.6.35 API change,
+dnl # The cachep->gfpflags member was renamed cachep->allocflags. These are
+dnl # private allocation flags which are applied when allocating a new slab
+dnl # in kmem_getpages(). Unfortunately there is no public API for setting
+dnl # non-default flags.
+dnl #
+AC_DEFUN([SPL_AC_KMEM_CACHE_ALLOCFLAGS], [
+ AC_MSG_CHECKING([whether struct kmem_cache has allocflags])
+ SPL_LINUX_TRY_COMPILE([
+ #include <linux/slab.h>
+ ],[
+ struct kmem_cache cachep __attribute__ ((unused));
+ cachep.allocflags = GFP_KERNEL;
+ ],[
+ AC_MSG_RESULT(yes)
+ AC_DEFINE(HAVE_KMEM_CACHE_ALLOCFLAGS, 1,
+ [struct kmem_cache has allocflags])
+ ],[
+ AC_MSG_RESULT(no)
+
+ AC_MSG_CHECKING([whether struct kmem_cache has gfpflags])
+ SPL_LINUX_TRY_COMPILE([
+ #include <linux/slab.h>
+ ],[
+ struct kmem_cache cachep __attribute__ ((unused));
+ cachep.gfpflags = GFP_KERNEL;
+ ],[
+ AC_MSG_RESULT(yes)
+ AC_DEFINE(HAVE_KMEM_CACHE_GFPFLAGS, 1,
+ [struct kmem_cache has gfpflags])
+ ],[
+ AC_MSG_RESULT(no)
+ ])
+ ])
+])
+
+dnl #
+dnl # grsecurity API change,
+dnl # kmem_cache_create() with SLAB_USERCOPY flag replaced by
+dnl # kmem_cache_create_usercopy().
+dnl #
+AC_DEFUN([SPL_AC_KMEM_CACHE_CREATE_USERCOPY], [
+ AC_MSG_CHECKING([whether kmem_cache_create_usercopy() exists])
+ tmp_flags="$EXTRA_KCFLAGS"
+ EXTRA_KCFLAGS="-Werror"
+ SPL_LINUX_TRY_COMPILE([
+ #include <linux/slab.h>
+ static void ctor(void *foo)
+ {
+ // fake ctor
+ }
+ ],[
+ struct kmem_cache *skc_linux_cache;
+ const char *name = "test";
+ size_t size = 4096;
+ size_t align = 8;
+ unsigned long flags = 0;
+ size_t useroffset = 0;
+ size_t usersize = size - useroffset;
+
+ skc_linux_cache = kmem_cache_create_usercopy(
+ name, size, align, flags, useroffset, usersize, ctor);
+ ],[
+ AC_MSG_RESULT(yes)
+ AC_DEFINE(HAVE_KMEM_CACHE_CREATE_USERCOPY, 1,
+ [kmem_cache_create_usercopy() exists])
+ ],[
+ AC_MSG_RESULT(no)
+ ])
+ EXTRA_KCFLAGS="$tmp_flags"
+])
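A caller choosing between the two cache-creation APIs detected above might be
wrapped as in this sketch; spl_cache_create() is a hypothetical name, and for
simplicity the whole object is whitelisted for user copies:

    #include <linux/slab.h>

    static struct kmem_cache *
    spl_cache_create(const char *name, size_t size)
    {
    #ifdef HAVE_KMEM_CACHE_CREATE_USERCOPY
            /* useroffset = 0, usersize = size: whitelist the entire object */
            return (kmem_cache_create_usercopy(name, size, 0, 0, 0, size, NULL));
    #else
            return (kmem_cache_create(name, size, 0, 0, NULL));
    #endif
    }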
diff --git a/config/kernel-kmem.m4 b/config/kernel-kmem.m4
new file mode 100644
index 000000000..cc055e530
--- /dev/null
+++ b/config/kernel-kmem.m4
@@ -0,0 +1,58 @@
+dnl #
+dnl # Disabled by default; when enabled it provides a minimal level of
+dnl # memory tracking. A total count of bytes allocated is kept for each
+dnl # alloc and free. Then at module unload time a report will be printed
+dnl # to the console if memory was leaked.
+dnl #
+AC_DEFUN([SPL_AC_DEBUG_KMEM], [
+ AC_ARG_ENABLE([debug-kmem],
+ [AS_HELP_STRING([--enable-debug-kmem],
+ [Enable basic kmem accounting @<:@default=no@:>@])],
+ [],
+ [enable_debug_kmem=no])
+
+ AS_IF([test "x$enable_debug_kmem" = xyes],
+ [
+ KERNELCPPFLAGS="${KERNELCPPFLAGS} -DDEBUG_KMEM"
+ DEBUG_KMEM="_with_debug_kmem"
+ AC_DEFINE([DEBUG_KMEM], [1],
+ [Define to 1 to enable basic kmem accounting])
+ ], [
+ DEBUG_KMEM="_without_debug_kmem"
+ ])
+
+ AC_SUBST(DEBUG_KMEM)
+ AC_MSG_CHECKING([whether basic kmem accounting is enabled])
+ AC_MSG_RESULT([$enable_debug_kmem])
+])
+
+dnl #
+dnl # Disabled by default, it provides detailed memory tracking. This
+dnl # feature also requires --enable-debug-kmem to be set. When enabled,
+dnl # not only will total bytes be tracked but also the location of every
+dnl # alloc and free. When the SPL module is unloaded a list of all leaked
+dnl # addresses and where they were allocated will be dumped to the console.
+dnl # Enabling this feature has a significant impact on performance but it
+dnl # makes finding memory leaks straightforward.
+dnl #
+AC_DEFUN([SPL_AC_DEBUG_KMEM_TRACKING], [
+ AC_ARG_ENABLE([debug-kmem-tracking],
+ [AS_HELP_STRING([--enable-debug-kmem-tracking],
+ [Enable detailed kmem tracking @<:@default=no@:>@])],
+ [],
+ [enable_debug_kmem_tracking=no])
+
+ AS_IF([test "x$enable_debug_kmem_tracking" = xyes],
+ [
+ KERNELCPPFLAGS="${KERNELCPPFLAGS} -DDEBUG_KMEM_TRACKING"
+ DEBUG_KMEM_TRACKING="_with_debug_kmem_tracking"
+ AC_DEFINE([DEBUG_KMEM_TRACKING], [1],
+ [Define to 1 to enable detailed kmem tracking])
+ ], [
+ DEBUG_KMEM_TRACKING="_without_debug_kmem_tracking"
+ ])
+
+ AC_SUBST(DEBUG_KMEM_TRACKING)
+ AC_MSG_CHECKING([whether detailed kmem tracking is enabled])
+ AC_MSG_RESULT([$enable_debug_kmem_tracking])
+])
diff --git a/config/kernel-kuidgid.m4 b/config/kernel-kuidgid.m4
new file mode 100644
index 000000000..47d193783
--- /dev/null
+++ b/config/kernel-kuidgid.m4
@@ -0,0 +1,28 @@
+dnl #
+dnl # User namespaces: use kuid_t/kgid_t in place of uid_t/gid_t
+dnl # where available. Not strictly a user-namespace requirement,
+dnl # but it should prevent surprises.
+dnl #
+AC_DEFUN([SPL_AC_KUIDGID_T], [
+ AC_MSG_CHECKING([whether kuid_t/kgid_t is available])
+ SPL_LINUX_TRY_COMPILE([
+ #include <linux/uidgid.h>
+ ], [
+ kuid_t userid = KUIDT_INIT(0);
+ kgid_t groupid = KGIDT_INIT(0);
+ ],[
+ SPL_LINUX_TRY_COMPILE([
+ #include <linux/uidgid.h>
+ ], [
+ kuid_t userid = 0;
+ kgid_t groupid = 0;
+ ],[
+ AC_MSG_RESULT(yes; optional)
+ ],[
+ AC_MSG_RESULT(yes; mandatory)
+ AC_DEFINE(HAVE_KUIDGID_T, 1, [kuid_t/kgid_t in use])
+ ])
+ ],[
+ AC_MSG_RESULT(no)
+ ])
+])
diff --git a/config/kernel-pde-data.m4 b/config/kernel-pde-data.m4
new file mode 100644
index 000000000..6aa5765c3
--- /dev/null
+++ b/config/kernel-pde-data.m4
@@ -0,0 +1,17 @@
+dnl #
+dnl # 3.10 API change,
+dnl # PDE is replaced by PDE_DATA
+dnl #
+AC_DEFUN([SPL_AC_PDE_DATA], [
+ AC_MSG_CHECKING([whether PDE_DATA() is available])
+ SPL_LINUX_TRY_COMPILE_SYMBOL([
+ #include <linux/proc_fs.h>
+ ], [
+ PDE_DATA(NULL);
+ ], [PDE_DATA], [], [
+ AC_MSG_RESULT(yes)
+ AC_DEFINE(HAVE_PDE_DATA, 1, [yes])
+ ],[
+ AC_MSG_RESULT(no)
+ ])
+])
diff --git a/config/kernel-rw.m4 b/config/kernel-rw.m4
new file mode 100644
index 000000000..23c14b70f
--- /dev/null
+++ b/config/kernel-rw.m4
@@ -0,0 +1,57 @@
+dnl #
+dnl # 4.14 API change
+dnl # kernel_write(), which was introduced in 3.9, was updated to take
+dnl # the offset as a pointer, which is needed by vn_rdwr().
+dnl #
+AC_DEFUN([SPL_AC_KERNEL_WRITE], [
+ AC_MSG_CHECKING([whether kernel_write() takes loff_t pointer])
+ tmp_flags="$EXTRA_KCFLAGS"
+ EXTRA_KCFLAGS="-Werror"
+ SPL_LINUX_TRY_COMPILE([
+ #include <linux/fs.h>
+ ],[
+ struct file *file = NULL;
+ const void *buf = NULL;
+ size_t count = 0;
+ loff_t *pos = NULL;
+ ssize_t ret;
+
+ ret = kernel_write(file, buf, count, pos);
+ ],[
+ AC_MSG_RESULT(yes)
+ AC_DEFINE(HAVE_KERNEL_WRITE_PPOS, 1,
+ [kernel_write() takes loff_t pointer])
+ ],[
+ AC_MSG_RESULT(no)
+ ])
+ EXTRA_KCFLAGS="$tmp_flags"
+])
+
+dnl #
+dnl # 4.14 API change
+dnl # kernel_read(), which has existed forever, was updated to take
+dnl # the offset as a pointer, which is needed by vn_rdwr().
+dnl #
+AC_DEFUN([SPL_AC_KERNEL_READ], [
+ AC_MSG_CHECKING([whether kernel_read() takes loff_t pointer])
+ tmp_flags="$EXTRA_KCFLAGS"
+ EXTRA_KCFLAGS="-Werror"
+ SPL_LINUX_TRY_COMPILE([
+ #include <linux/fs.h>
+ ],[
+ struct file *file = NULL;
+ void *buf = NULL;
+ size_t count = 0;
+ loff_t *pos = NULL;
+ ssize_t ret;
+
+ ret = kernel_read(file, buf, count, pos);
+ ],[
+ AC_MSG_RESULT(yes)
+ AC_DEFINE(HAVE_KERNEL_READ_PPOS, 1,
+ [kernel_read() takes loff_t pointer])
+ ],[
+ AC_MSG_RESULT(no)
+ ])
+ EXTRA_KCFLAGS="$tmp_flags"
+])
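Both results are typically consumed by a wrapper that presents the
pointer-offset calling convention everywhere, as in this sketch for the read
side (spl_kernel_read() is a hypothetical name; the real consumer in this
commit is vn_rdwr()):

    #include <linux/fs.h>

    static ssize_t
    spl_kernel_read(struct file *fp, void *buf, size_t count, loff_t *pos)
    {
    #ifdef HAVE_KERNEL_READ_PPOS
            return (kernel_read(fp, buf, count, pos));      /* 4.14+ */
    #else
            /* pre-4.14: the offset is passed by value and not advanced */
            ssize_t ret = kernel_read(fp, *pos, (char *)buf, count);
            if (ret > 0)
                    *pos += ret;
            return (ret);
    #endif
    }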
diff --git a/config/kernel-rwsem.m4 b/config/kernel-rwsem.m4
new file mode 100644
index 000000000..aee20ae90
--- /dev/null
+++ b/config/kernel-rwsem.m4
@@ -0,0 +1,75 @@
+dnl #
+dnl # 3.1 API Change
+dnl #
+dnl # The rw_semaphore.wait_lock member was changed from spinlock_t to
+dnl # raw_spinlock_t at commit ddb6c9b58a19edcfac93ac670b066c836ff729f1.
+dnl #
+AC_DEFUN([SPL_AC_RWSEM_SPINLOCK_IS_RAW], [
+ AC_MSG_CHECKING([whether struct rw_semaphore member wait_lock is raw])
+ tmp_flags="$EXTRA_KCFLAGS"
+ EXTRA_KCFLAGS="-Werror"
+ SPL_LINUX_TRY_COMPILE([
+ #include <linux/rwsem.h>
+ ],[
+ struct rw_semaphore dummy_semaphore __attribute__ ((unused));
+ raw_spinlock_t dummy_lock __attribute__ ((unused)) =
+ __RAW_SPIN_LOCK_INITIALIZER(dummy_lock);
+ dummy_semaphore.wait_lock = dummy_lock;
+ ],[
+ AC_MSG_RESULT(yes)
+ AC_DEFINE(RWSEM_SPINLOCK_IS_RAW, 1,
+ [struct rw_semaphore member wait_lock is raw_spinlock_t])
+ ],[
+ AC_MSG_RESULT(no)
+ ])
+ EXTRA_KCFLAGS="$tmp_flags"
+])
+
+dnl #
+dnl # 3.16 API Change
+dnl #
+dnl # rwsem-spinlock "->activity" changed to "->count"
+dnl #
+AC_DEFUN([SPL_AC_RWSEM_ACTIVITY], [
+ AC_MSG_CHECKING([whether struct rw_semaphore has member activity])
+ tmp_flags="$EXTRA_KCFLAGS"
+ EXTRA_KCFLAGS="-Werror"
+ SPL_LINUX_TRY_COMPILE([
+ #include <linux/rwsem.h>
+ ],[
+ struct rw_semaphore dummy_semaphore __attribute__ ((unused));
+ dummy_semaphore.activity = 0;
+ ],[
+ AC_MSG_RESULT(yes)
+ AC_DEFINE(HAVE_RWSEM_ACTIVITY, 1,
+ [struct rw_semaphore has member activity])
+ ],[
+ AC_MSG_RESULT(no)
+ ])
+ EXTRA_KCFLAGS="$tmp_flags"
+])
+
+dnl #
+dnl # 4.8 API Change
+dnl #
+dnl # rwsem "->count" changed to atomic_long_t type
+dnl #
+AC_DEFUN([SPL_AC_RWSEM_ATOMIC_LONG_COUNT], [
+ AC_MSG_CHECKING(
+ [whether struct rw_semaphore has atomic_long_t member count])
+ tmp_flags="$EXTRA_KCFLAGS"
+ EXTRA_KCFLAGS="-Werror"
+ SPL_LINUX_TRY_COMPILE([
+ #include <linux/rwsem.h>
+ ],[
+ DECLARE_RWSEM(dummy_semaphore);
+ (void) atomic_long_read(&dummy_semaphore.count);
+ ],[
+ AC_MSG_RESULT(yes)
+ AC_DEFINE(HAVE_RWSEM_ATOMIC_LONG_COUNT, 1,
+ [struct rw_semaphore has atomic_long_t member count])
+ ],[
+ AC_MSG_RESULT(no)
+ ])
+ EXTRA_KCFLAGS="$tmp_flags"
+])
diff --git a/config/kernel-sched.m4 b/config/kernel-sched.m4
new file mode 100644
index 000000000..5ae21676e
--- /dev/null
+++ b/config/kernel-sched.m4
@@ -0,0 +1,56 @@
+dnl #
+dnl # 3.9 API change,
+dnl # Moved things from linux/sched.h to linux/sched/rt.h
+dnl #
+AC_DEFUN([SPL_AC_SCHED_RT_HEADER],
+ [AC_MSG_CHECKING([whether header linux/sched/rt.h exists])
+ SPL_LINUX_TRY_COMPILE([
+ #include <linux/sched.h>
+ #include <linux/sched/rt.h>
+ ],[
+ return 0;
+ ],[
+ AC_DEFINE(HAVE_SCHED_RT_HEADER, 1, [linux/sched/rt.h exists])
+ AC_MSG_RESULT(yes)
+ ],[
+ AC_MSG_RESULT(no)
+ ])
+])
+
+dnl #
+dnl # 4.11 API change,
+dnl # Moved things from linux/sched.h to linux/sched/signal.h
+dnl #
+AC_DEFUN([SPL_AC_SCHED_SIGNAL_HEADER],
+ [AC_MSG_CHECKING([whether header linux/sched/signal.h exists])
+ SPL_LINUX_TRY_COMPILE([
+ #include <linux/sched.h>
+ #include <linux/sched/signal.h>
+ ],[
+ return 0;
+ ],[
+ AC_DEFINE(HAVE_SCHED_SIGNAL_HEADER, 1, [linux/sched/signal.h exists])
+ AC_MSG_RESULT(yes)
+ ],[
+ AC_MSG_RESULT(no)
+ ])
+])
+dnl #
+dnl # 3.19 API change
+dnl # The io_schedule_timeout() function is present in all 2.6.32 kernels
+dnl # but it was not exported until Linux 3.19. The RHEL 7.x kernels, which
+dnl # are based on a 3.10 kernel, do export this symbol.
+dnl #
+AC_DEFUN([SPL_AC_IO_SCHEDULE_TIMEOUT], [
+ AC_MSG_CHECKING([whether io_schedule_timeout() is available])
+ SPL_LINUX_TRY_COMPILE_SYMBOL([
+ #include <linux/sched.h>
+ ], [
+ (void) io_schedule_timeout(1);
+ ], [io_schedule_timeout], [], [
+ AC_MSG_RESULT(yes)
+ AC_DEFINE(HAVE_IO_SCHEDULE_TIMEOUT, 1, [yes])
+ ],[
+ AC_MSG_RESULT(no)
+ ])
+])
diff --git a/config/kernel-set-fs-pwd.m4 b/config/kernel-set-fs-pwd.m4
new file mode 100644
index 000000000..849e7e6cb
--- /dev/null
+++ b/config/kernel-set-fs-pwd.m4
@@ -0,0 +1,39 @@
+dnl #
+dnl # 3.9 API change
+dnl # set_fs_pwd takes const struct path *
+dnl #
+AC_DEFUN([SPL_AC_SET_FS_PWD_WITH_CONST],
+ tmp_flags="$EXTRA_KCFLAGS"
+ EXTRA_KCFLAGS="-Werror"
+ [AC_MSG_CHECKING([whether set_fs_pwd() requires const struct path *])
+ SPL_LINUX_TRY_COMPILE([
+ #include <linux/spinlock.h>
+ #include <linux/fs_struct.h>
+ #include <linux/path.h>
+ void (*const set_fs_pwd_func)
+ (struct fs_struct *, const struct path *)
+ = set_fs_pwd;
+ ],[
+ return 0;
+ ],[
+ AC_MSG_RESULT(yes)
+ AC_DEFINE(HAVE_SET_FS_PWD_WITH_CONST, 1,
+ [set_fs_pwd() needs const path *])
+ ],[
+ SPL_LINUX_TRY_COMPILE([
+ #include <linux/spinlock.h>
+ #include <linux/fs_struct.h>
+ #include <linux/path.h>
+ void (*const set_fs_pwd_func)
+ (struct fs_struct *, struct path *)
+ = set_fs_pwd;
+ ],[
+ return 0;
+ ],[
+ AC_MSG_RESULT(no)
+ ],[
+ AC_MSG_ERROR(unknown)
+ ])
+ ])
+ EXTRA_KCFLAGS="$tmp_flags"
+])
diff --git a/config/kernel-shrinker.m4 b/config/kernel-shrinker.m4
new file mode 100644
index 000000000..6fc9b5422
--- /dev/null
+++ b/config/kernel-shrinker.m4
@@ -0,0 +1,125 @@
+AC_DEFUN([SPL_AC_SHRINKER_CALLBACK],[
+ tmp_flags="$EXTRA_KCFLAGS"
+ EXTRA_KCFLAGS="-Werror"
+ dnl #
+ dnl # 2.6.23 to 2.6.34 API change
+ dnl # ->shrink(int nr_to_scan, gfp_t gfp_mask)
+ dnl #
+ AC_MSG_CHECKING([whether old 2-argument shrinker exists])
+ SPL_LINUX_TRY_COMPILE([
+ #include <linux/mm.h>
+
+ int shrinker_cb(int nr_to_scan, gfp_t gfp_mask);
+ ],[
+ struct shrinker cache_shrinker = {
+ .shrink = shrinker_cb,
+ .seeks = DEFAULT_SEEKS,
+ };
+ register_shrinker(&cache_shrinker);
+ ],[
+ AC_MSG_RESULT(yes)
+ AC_DEFINE(HAVE_2ARGS_OLD_SHRINKER_CALLBACK, 1,
+ [old shrinker callback wants 2 args])
+ ],[
+ AC_MSG_RESULT(no)
+ dnl #
+ dnl # 2.6.35 - 2.6.39 API change
+ dnl # ->shrink(struct shrinker *,
+ dnl # int nr_to_scan, gfp_t gfp_mask)
+ dnl #
+ AC_MSG_CHECKING([whether old 3-argument shrinker exists])
+ SPL_LINUX_TRY_COMPILE([
+ #include <linux/mm.h>
+
+ int shrinker_cb(struct shrinker *, int nr_to_scan,
+ gfp_t gfp_mask);
+ ],[
+ struct shrinker cache_shrinker = {
+ .shrink = shrinker_cb,
+ .seeks = DEFAULT_SEEKS,
+ };
+ register_shrinker(&cache_shrinker);
+ ],[
+ AC_MSG_RESULT(yes)
+ AC_DEFINE(HAVE_3ARGS_SHRINKER_CALLBACK, 1,
+ [old shrinker callback wants 3 args])
+ ],[
+ AC_MSG_RESULT(no)
+ dnl #
+ dnl # 3.0 - 3.11 API change
+ dnl # ->shrink(struct shrinker *,
+ dnl # struct shrink_control *sc)
+ dnl #
+ AC_MSG_CHECKING(
+ [whether new 2-argument shrinker exists])
+ SPL_LINUX_TRY_COMPILE([
+ #include <linux/mm.h>
+
+ int shrinker_cb(struct shrinker *,
+ struct shrink_control *sc);
+ ],[
+ struct shrinker cache_shrinker = {
+ .shrink = shrinker_cb,
+ .seeks = DEFAULT_SEEKS,
+ };
+ register_shrinker(&cache_shrinker);
+ ],[
+ AC_MSG_RESULT(yes)
+ AC_DEFINE(HAVE_2ARGS_NEW_SHRINKER_CALLBACK, 1,
+ [new shrinker callback wants 2 args])
+ ],[
+ AC_MSG_RESULT(no)
+ dnl #
+ dnl # 3.12 API change,
+ dnl # ->shrink() is logically split in to
+ dnl # ->count_objects() and ->scan_objects()
+ dnl #
+ AC_MSG_CHECKING(
+ [whether ->count_objects callback exists])
+ SPL_LINUX_TRY_COMPILE([
+ #include <linux/mm.h>
+
+ unsigned long shrinker_cb(
+ struct shrinker *,
+ struct shrink_control *sc);
+ ],[
+ struct shrinker cache_shrinker = {
+ .count_objects = shrinker_cb,
+ .scan_objects = shrinker_cb,
+ .seeks = DEFAULT_SEEKS,
+ };
+ register_shrinker(&cache_shrinker);
+ ],[
+ AC_MSG_RESULT(yes)
+ AC_DEFINE(HAVE_SPLIT_SHRINKER_CALLBACK,
+ 1, [->count_objects exists])
+ ],[
+ AC_MSG_ERROR(error)
+ ])
+ ])
+ ])
+ ])
+ EXTRA_KCFLAGS="$tmp_flags"
+])
+
+dnl #
+dnl # 2.6.39 API change,
+dnl # Shrinkers adjusted to use the common shrink_control structure.
+dnl #
+AC_DEFUN([SPL_AC_SHRINK_CONTROL_STRUCT], [
+ AC_MSG_CHECKING([whether struct shrink_control exists])
+ SPL_LINUX_TRY_COMPILE([
+ #include <linux/mm.h>
+ ],[
+ struct shrink_control sc __attribute__ ((unused));
+
+ sc.nr_to_scan = 0;
+ sc.gfp_mask = GFP_KERNEL;
+ ],[
+ AC_MSG_RESULT(yes)
+ AC_DEFINE(HAVE_SHRINK_CONTROL_STRUCT, 1,
+ [struct shrink_control exists])
+ ],[
+ AC_MSG_RESULT(no)
+ ])
+])
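On 3.12+ kernels the split interface detected above is used roughly as in this
sketch; the count/scan bodies are placeholders rather than a real cache:

    #include <linux/mm.h>
    #include <linux/shrinker.h>

    static unsigned long
    spl_cache_count(struct shrinker *s, struct shrink_control *sc)
    {
            return (0);             /* freeable object count */
    }

    static unsigned long
    spl_cache_scan(struct shrinker *s, struct shrink_control *sc)
    {
            return (SHRINK_STOP);   /* nothing could be freed */
    }

    static struct shrinker spl_cache_shrinker = {
            .count_objects  = spl_cache_count,
            .scan_objects   = spl_cache_scan,
            .seeks          = DEFAULT_SEEKS,
    };

    /* registered at module init with register_shrinker(&spl_cache_shrinker) */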
diff --git a/config/kernel-spinlock.m4 b/config/kernel-spinlock.m4
new file mode 100644
index 000000000..136262d0e
--- /dev/null
+++ b/config/kernel-spinlock.m4
@@ -0,0 +1,24 @@
+dnl #
+dnl # 2.6.36 API change,
+dnl # The 'struct fs_struct->lock' was changed from a rwlock_t to
+dnl # a spinlock_t to improve the fastpath performance.
+dnl #
+AC_DEFUN([SPL_AC_FS_STRUCT_SPINLOCK], [
+ AC_MSG_CHECKING([whether struct fs_struct uses spinlock_t])
+ tmp_flags="$EXTRA_KCFLAGS"
+ EXTRA_KCFLAGS="-Werror"
+ SPL_LINUX_TRY_COMPILE([
+ #include <linux/sched.h>
+ #include <linux/fs_struct.h>
+ ],[
+ static struct fs_struct fs;
+ spin_lock_init(&fs.lock);
+ ],[
+ AC_MSG_RESULT(yes)
+ AC_DEFINE(HAVE_FS_STRUCT_SPINLOCK, 1,
+ [struct fs_struct uses spinlock_t])
+ ],[
+ AC_MSG_RESULT(no)
+ ])
+ EXTRA_KCFLAGS="$tmp_flags"
+])
diff --git a/config/kernel-timer.m4 b/config/kernel-timer.m4
new file mode 100644
index 000000000..93b5158b9
--- /dev/null
+++ b/config/kernel-timer.m4
@@ -0,0 +1,32 @@
+dnl #
+dnl # 4.15 API change
+dnl # https://lkml.org/lkml/2017/11/25/90
+dnl # Check if timer_list.func gets passed a timer_list or an unsigned long
+dnl # (older kernels). Also sanity-check that the from_timer() and
+dnl # timer_setup() macros are available, since they will be used in the
+dnl # same newer kernels that support the new timer_list.func signature.
+dnl #
+AC_DEFUN([SPL_AC_KERNEL_TIMER_FUNCTION_TIMER_LIST], [
+ AC_MSG_CHECKING([whether timer_list.function gets a timer_list])
+ tmp_flags="$EXTRA_KCFLAGS"
+ EXTRA_KCFLAGS="-Werror"
+ SPL_LINUX_TRY_COMPILE([
+ #include <linux/timer.h>
+ void task_expire(struct timer_list *tl) {}
+ ],[
+ #ifndef from_timer
+ #error "No from_timer() macro"
+ #endif
+
+ struct timer_list timer;
+ timer.function = task_expire;
+ timer_setup(&timer, NULL, 0);
+ ],[
+ AC_MSG_RESULT(yes)
+ AC_DEFINE(HAVE_KERNEL_TIMER_FUNCTION_TIMER_LIST, 1,
+ [timer_list.function gets a timer_list])
+ ],[
+ AC_MSG_RESULT(no)
+ ])
+ EXTRA_KCFLAGS="$tmp_flags"
+])
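The 4.15+ convention this macro detects pairs timer_setup() with from_timer()
to recover the enclosing object, as in this sketch (struct spl_task is
hypothetical):

    #include <linux/timer.h>

    struct spl_task {
            struct timer_list       st_timer;
            int                     st_expired;
    };

    static void
    spl_task_expire(struct timer_list *tl)
    {
            /* container_of() back to the object embedding the timer */
            struct spl_task *st = from_timer(st, tl, st_timer);

            st->st_expired = 1;
    }

    /* setup: timer_setup(&st->st_timer, spl_task_expire, 0); */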
diff --git a/config/kernel-trim-unused-symbols.m4 b/config/kernel-trim-unused-symbols.m4
new file mode 100644
index 000000000..d1ac2f3c8
--- /dev/null
+++ b/config/kernel-trim-unused-symbols.m4
@@ -0,0 +1,19 @@
+dnl #
+dnl # config trim unused symbols,
+dnl # Verify the kernel has CONFIG_TRIM_UNUSED_KSYMS DISABLED.
+dnl #
+AC_DEFUN([SPL_AC_CONFIG_TRIM_UNUSED_KSYMS], [
+ AC_MSG_CHECKING([whether CONFIG_TRIM_UNUSED_KSYMS is disabled])
+ SPL_LINUX_TRY_COMPILE([
+ #if defined(CONFIG_TRIM_UNUSED_KSYMS)
+ #error CONFIG_TRIM_UNUSED_KSYMS is defined
+ #endif
+ ],[ ],[
+ AC_MSG_RESULT([yes])
+ ],[
+ AC_MSG_RESULT([no])
+ AC_MSG_ERROR([
+ *** This kernel has unused symbols trimming enabled, please disable.
+ *** Rebuild the kernel with CONFIG_TRIM_UNUSED_KSYMS=n set.])
+ ])
+])
diff --git a/config/kernel-urange-sleep.m4 b/config/kernel-urange-sleep.m4
new file mode 100644
index 000000000..85beca6dd
--- /dev/null
+++ b/config/kernel-urange-sleep.m4
@@ -0,0 +1,21 @@
+dnl #
+dnl # 2.6.36 API compatibility.
+dnl # Added the usleep_range() timer.
+dnl # usleep_range() is a finer-precision implementation of msleep()
+dnl # designed to be a drop-in replacement for udelay() where a precise
+dnl # sleep / busy-wait is unnecessary.
+dnl #
+AC_DEFUN([SPL_AC_USLEEP_RANGE], [
+ AC_MSG_CHECKING([whether usleep_range() is available])
+ SPL_LINUX_TRY_COMPILE([
+ #include <linux/delay.h>
+ ],[
+ usleep_range(0, 0);
+ ],[
+ AC_MSG_RESULT(yes)
+ AC_DEFINE(HAVE_USLEEP_RANGE, 1,
+ [usleep_range is available])
+ ],[
+ AC_MSG_RESULT(no)
+ ])
+])
diff --git a/config/kernel-vfs-fsync.m4 b/config/kernel-vfs-fsync.m4
new file mode 100644
index 000000000..3c42bf1a0
--- /dev/null
+++ b/config/kernel-vfs-fsync.m4
@@ -0,0 +1,17 @@
+dnl #
+dnl # 2.6.35 API change,
+dnl # Unused 'struct dentry *' removed from vfs_fsync() prototype.
+dnl #
+AC_DEFUN([SPL_AC_2ARGS_VFS_FSYNC], [
+ AC_MSG_CHECKING([whether vfs_fsync() wants 2 args])
+ SPL_LINUX_TRY_COMPILE([
+ #include <linux/fs.h>
+ ],[
+ vfs_fsync(NULL, 0);
+ ],[
+ AC_MSG_RESULT(yes)
+ AC_DEFINE(HAVE_2ARGS_VFS_FSYNC, 1, [vfs_fsync() wants 2 args])
+ ],[
+ AC_MSG_RESULT(no)
+ ])
+])
diff --git a/config/kernel-vfs-getattr.m4 b/config/kernel-vfs-getattr.m4
new file mode 100644
index 000000000..7772cb514
--- /dev/null
+++ b/config/kernel-vfs-getattr.m4
@@ -0,0 +1,62 @@
+dnl #
+dnl # 4.11 API, a528d35e@torvalds/linux
+dnl # vfs_getattr(const struct path *p, struct kstat *s, u32 m, unsigned int f)
+dnl #
+AC_DEFUN([SPL_AC_4ARGS_VFS_GETATTR], [
+ AC_MSG_CHECKING([whether vfs_getattr() wants 4 args])
+ SPL_LINUX_TRY_COMPILE([
+ #include <linux/fs.h>
+ ],[
+ vfs_getattr((const struct path *)NULL,
+ (struct kstat *)NULL,
+ (u32)0,
+ (unsigned int)0);
+ ],[
+ AC_MSG_RESULT(yes)
+ AC_DEFINE(HAVE_4ARGS_VFS_GETATTR, 1,
+ [vfs_getattr wants 4 args])
+ ],[
+ AC_MSG_RESULT(no)
+ ])
+])
+
+dnl #
+dnl # 3.9 API
+dnl # vfs_getattr(struct path *p, struct kstat *s)
+dnl #
+AC_DEFUN([SPL_AC_2ARGS_VFS_GETATTR], [
+ AC_MSG_CHECKING([whether vfs_getattr() wants 2 args])
+ SPL_LINUX_TRY_COMPILE([
+ #include <linux/fs.h>
+ ],[
+ vfs_getattr((struct path *) NULL,
+ (struct kstat *)NULL);
+ ],[
+ AC_MSG_RESULT(yes)
+ AC_DEFINE(HAVE_2ARGS_VFS_GETATTR, 1,
+ [vfs_getattr wants 2 args])
+ ],[
+ AC_MSG_RESULT(no)
+ ])
+])
+
+dnl #
+dnl # <3.9 API
+dnl # vfs_getattr(struct vfsmount *v, struct dentry *d, struct kstat *k)
+dnl #
+AC_DEFUN([SPL_AC_3ARGS_VFS_GETATTR], [
+ AC_MSG_CHECKING([whether vfs_getattr() wants 3 args])
+ SPL_LINUX_TRY_COMPILE([
+ #include <linux/fs.h>
+ ],[
+ vfs_getattr((struct vfsmount *)NULL,
+ (struct dentry *)NULL,
+ (struct kstat *)NULL);
+ ],[
+ AC_MSG_RESULT(yes)
+ AC_DEFINE(HAVE_3ARGS_VFS_GETATTR, 1,
+ [vfs_getattr wants 3 args])
+ ],[
+ AC_MSG_RESULT(no)
+ ])
+])
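The three signatures fold naturally into one helper, sketched below;
spl_getattr() is a hypothetical name, and the 4.11 branch assumes the usual
STATX_BASIC_STATS / AT_STATX_SYNC_AS_STAT arguments:

    #include <linux/fs.h>
    #include <linux/stat.h>
    #include <linux/fcntl.h>

    static int
    spl_getattr(struct path *path, struct kstat *stat)
    {
    #if defined(HAVE_4ARGS_VFS_GETATTR)
            return (vfs_getattr(path, stat, STATX_BASIC_STATS,
                AT_STATX_SYNC_AS_STAT));
    #elif defined(HAVE_2ARGS_VFS_GETATTR)
            return (vfs_getattr(path, stat));
    #else
            return (vfs_getattr(path->mnt, path->dentry, stat));
    #endif
    }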
diff --git a/config/kernel-wait.m4 b/config/kernel-wait.m4
new file mode 100644
index 000000000..5f718a160
--- /dev/null
+++ b/config/kernel-wait.m4
@@ -0,0 +1,76 @@
+dnl #
+dnl # 3.17 API change,
+dnl # wait_on_bit() no longer requires an action argument. The former
+dnl # "wait_on_bit" interface required an 'action' function to be provided
+dnl # which does the actual waiting. There were over 20 such functions in the
+dnl # kernel, many of them identical, though most cases can be satisfied by one
+dnl # of just two functions: one which uses io_schedule() and one which just
+dnl # uses schedule(). This API change was made to consolidate all of those
+dnl # redundant wait functions.
+dnl #
+AC_DEFUN([SPL_AC_WAIT_ON_BIT], [
+ AC_MSG_CHECKING([whether wait_on_bit() takes an action])
+ SPL_LINUX_TRY_COMPILE([
+ #include <linux/wait.h>
+ ],[
+ int (*action)(void *) = NULL;
+ wait_on_bit(NULL, 0, action, 0);
+ ],[
+ AC_MSG_RESULT(yes)
+ AC_DEFINE(HAVE_WAIT_ON_BIT_ACTION, 1, [yes])
+ ],[
+ AC_MSG_RESULT(no)
+ ])
+])
+dnl #
+dnl # 4.13 API change
+dnl # Renamed struct wait_queue -> struct wait_queue_entry.
+dnl #
+AC_DEFUN([SPL_AC_WAIT_QUEUE_ENTRY_T], [
+ AC_MSG_CHECKING([whether wait_queue_entry_t exists])
+ SPL_LINUX_TRY_COMPILE([
+ #include <linux/wait.h>
+ ],[
+ wait_queue_entry_t *entry __attribute__ ((unused));
+ ],[
+ AC_MSG_RESULT(yes)
+ AC_DEFINE(HAVE_WAIT_QUEUE_ENTRY_T, 1,
+ [wait_queue_entry_t exists])
+ ],[
+ AC_MSG_RESULT(no)
+ ])
+])
+
+dnl #
+dnl # 4.13 API change
+dnl # Renamed wait_queue_head::task_list -> wait_queue_head::head
+dnl # Renamed wait_queue_entry::task_list -> wait_queue_entry::entry
+dnl #
+AC_DEFUN([SPL_AC_WAIT_QUEUE_HEAD_ENTRY], [
+ AC_MSG_CHECKING([whether wq_head->head and wq_entry->entry exist])
+ SPL_LINUX_TRY_COMPILE([
+ #include <linux/wait.h>
+
+ #ifdef HAVE_WAIT_QUEUE_ENTRY_T
+ typedef wait_queue_head_t spl_wait_queue_head_t;
+ typedef wait_queue_entry_t spl_wait_queue_entry_t;
+ #else
+ typedef wait_queue_head_t spl_wait_queue_head_t;
+ typedef wait_queue_t spl_wait_queue_entry_t;
+ #endif
+ ],[
+ spl_wait_queue_head_t wq_head;
+ spl_wait_queue_entry_t wq_entry;
+ struct list_head *head __attribute__ ((unused));
+ struct list_head *entry __attribute__ ((unused));
+
+ head = &wq_head.head;
+ entry = &wq_entry.entry;
+ ],[
+ AC_MSG_RESULT(yes)
+ AC_DEFINE(HAVE_WAIT_QUEUE_HEAD_ENTRY, 1,
+ [wq_head->head and wq_entry->entry exist])
+ ],[
+ AC_MSG_RESULT(no)
+ ])
+])
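Putting the first of these results to use, a bit-wait under either API might
be sketched as follows (spl_wait_bit() and spl_bit_wait() are hypothetical;
note that 4.13+ kernels moved wait_on_bit() to <linux/wait_bit.h>):

    #include <linux/wait.h>
    #include <linux/sched.h>

    /* pre-3.17 'action' callback: just schedule until the bit clears */
    static int
    spl_bit_wait(void *word)
    {
            schedule();
            return (0);
    }

    static int
    spl_wait_bit(unsigned long *word)
    {
    #ifdef HAVE_WAIT_ON_BIT_ACTION
            return (wait_on_bit(word, 0, spl_bit_wait, TASK_UNINTERRUPTIBLE));
    #else
            return (wait_on_bit(word, 0, TASK_UNINTERRUPTIBLE));
    #endif
    }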
diff --git a/config/kernel-zlib.m4 b/config/kernel-zlib.m4
new file mode 100644
index 000000000..bb236466a
--- /dev/null
+++ b/config/kernel-zlib.m4
@@ -0,0 +1,63 @@
+dnl #
+dnl # zlib inflate compat,
+dnl # Verify the kernel has CONFIG_ZLIB_INFLATE support enabled.
+dnl #
+AC_DEFUN([SPL_AC_CONFIG_ZLIB_INFLATE], [
+ AC_MSG_CHECKING([whether CONFIG_ZLIB_INFLATE is defined])
+ SPL_LINUX_TRY_COMPILE([
+ #if !defined(CONFIG_ZLIB_INFLATE) && \
+ !defined(CONFIG_ZLIB_INFLATE_MODULE)
+ #error CONFIG_ZLIB_INFLATE not defined
+ #endif
+ ],[ ],[
+ AC_MSG_RESULT([yes])
+ ],[
+ AC_MSG_RESULT([no])
+ AC_MSG_ERROR([
+ *** This kernel does not include the required zlib inflate support.
+ *** Rebuild the kernel with CONFIG_ZLIB_INFLATE=y|m set.])
+ ])
+])
+
+dnl #
+dnl # zlib deflate compat,
+dnl # Verify the kernel has CONFIG_ZLIB_DEFLATE support enabled.
+dnl #
+AC_DEFUN([SPL_AC_CONFIG_ZLIB_DEFLATE], [
+ AC_MSG_CHECKING([whether CONFIG_ZLIB_DEFLATE is defined])
+ SPL_LINUX_TRY_COMPILE([
+ #if !defined(CONFIG_ZLIB_DEFLATE) && \
+ !defined(CONFIG_ZLIB_DEFLATE_MODULE)
+ #error CONFIG_ZLIB_DEFLATE not defined
+ #endif
+ ],[ ],[
+ AC_MSG_RESULT([yes])
+ ],[
+ AC_MSG_RESULT([no])
+ AC_MSG_ERROR([
+ *** This kernel does not include the required zlib deflate support.
+ *** Rebuild the kernel with CONFIG_ZLIB_DEFLATE=y|m set.])
+ ])
+])
+
+dnl #
+dnl # 2.6.39 API compat,
+dnl # The function zlib_deflate_workspacesize() now takes 2 arguments.
+dnl # This was done to avoid always having to allocate the maximum size
+dnl # workspace (268K). The caller can now specify the windowBits and
+dnl # memLevel compression parameters to get a smaller workspace.
+dnl #
+AC_DEFUN([SPL_AC_2ARGS_ZLIB_DEFLATE_WORKSPACESIZE],
+ [AC_MSG_CHECKING([whether zlib_deflate_workspacesize() wants 2 args])
+ SPL_LINUX_TRY_COMPILE([
+ #include <linux/zlib.h>
+ ],[
+ return zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL);
+ ],[
+ AC_MSG_RESULT(yes)
+ AC_DEFINE(HAVE_2ARGS_ZLIB_DEFLATE_WORKSPACESIZE, 1,
+ [zlib_deflate_workspacesize() wants 2 args])
+ ],[
+ AC_MSG_RESULT(no)
+ ])
+])
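The workspace-size result is consumed when sizing the deflate scratch buffer,
roughly as in this sketch (spl_zlib_workspace() is a hypothetical name):

    #include <linux/zlib.h>
    #include <linux/vmalloc.h>

    static void *
    spl_zlib_workspace(void)
    {
            int size;

    #ifdef HAVE_2ARGS_ZLIB_DEFLATE_WORKSPACESIZE
            /* 2.6.39+: request only what the default parameters need */
            size = zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL);
    #else
            size = zlib_deflate_workspacesize();
    #endif
            return (vmalloc(size));
    }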
diff --git a/include/spl/rpc/xdr.h b/include/spl/rpc/xdr.h
new file mode 100644
index 000000000..0b39b46cf
--- /dev/null
+++ b/include/spl/rpc/xdr.h
@@ -0,0 +1,156 @@
+/*
+ * Copyright (c) 2008 Sun Microsystems, Inc.
+ * Written by Ricardo Correia <Ricardo.M.Correia@Sun.COM>
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _SPL_RPC_XDR_H
+#define _SPL_RPC_XDR_H
+
+#include <sys/types.h>
+
+typedef int bool_t;
+
+/*
+ * XDR enums and types.
+ */
+enum xdr_op {
+ XDR_ENCODE,
+ XDR_DECODE
+};
+
+struct xdr_ops;
+
+typedef struct {
+ struct xdr_ops *x_ops; /* Let caller know xdrmem_create() succeeds */
+ caddr_t x_addr; /* Current buffer addr */
+ caddr_t x_addr_end; /* End of the buffer */
+ enum xdr_op x_op; /* Stream direction */
+} XDR;
+
+typedef bool_t (*xdrproc_t)(XDR *xdrs, void *ptr);
+
+struct xdr_ops {
+ bool_t (*xdr_control)(XDR *, int, void *);
+
+ bool_t (*xdr_char)(XDR *, char *);
+ bool_t (*xdr_u_short)(XDR *, unsigned short *);
+ bool_t (*xdr_u_int)(XDR *, unsigned *);
+ bool_t (*xdr_u_longlong_t)(XDR *, u_longlong_t *);
+
+ bool_t (*xdr_opaque)(XDR *, caddr_t, const uint_t);
+ bool_t (*xdr_string)(XDR *, char **, const uint_t);
+ bool_t (*xdr_array)(XDR *, caddr_t *, uint_t *, const uint_t,
+ const uint_t, const xdrproc_t);
+};
+
+/*
+ * XDR control operator.
+ */
+#define XDR_GET_BYTES_AVAIL 1
+
+struct xdr_bytesrec {
+ bool_t xc_is_last_record;
+ size_t xc_num_avail;
+};
+
+/*
+ * XDR functions.
+ */
+void xdrmem_create(XDR *xdrs, const caddr_t addr, const uint_t size,
+ const enum xdr_op op);
+
+/* Currently not needed. If needed later, we'll add it to struct xdr_ops */
+#define xdr_destroy(xdrs) ((void) 0)
+
+#define xdr_control(xdrs, req, info) \
+ (xdrs)->x_ops->xdr_control((xdrs), (req), (info))
+
+/*
+ * As a precaution, the following are defined as static inlines instead of
+ * macros to get some measure of type safety.
+ *
+ * Macros also wouldn't work where typecasting is done, because it
+ * must be possible to reference the functions' addresses by these names.
+ */
+static inline bool_t xdr_char(XDR *xdrs, char *cp)
+{
+ return (xdrs->x_ops->xdr_char(xdrs, cp));
+}
+
+static inline bool_t xdr_u_short(XDR *xdrs, unsigned short *usp)
+{
+ return (xdrs->x_ops->xdr_u_short(xdrs, usp));
+}
+
+static inline bool_t xdr_short(XDR *xdrs, short *sp)
+{
+ BUILD_BUG_ON(sizeof (short) != 2);
+ return (xdrs->x_ops->xdr_u_short(xdrs, (unsigned short *) sp));
+}
+
+static inline bool_t xdr_u_int(XDR *xdrs, unsigned *up)
+{
+ return (xdrs->x_ops->xdr_u_int(xdrs, up));
+}
+
+static inline bool_t xdr_int(XDR *xdrs, int *ip)
+{
+ BUILD_BUG_ON(sizeof (int) != 4);
+ return (xdrs->x_ops->xdr_u_int(xdrs, (unsigned *)ip));
+}
+
+static inline bool_t xdr_u_longlong_t(XDR *xdrs, u_longlong_t *ullp)
+{
+ return (xdrs->x_ops->xdr_u_longlong_t(xdrs, ullp));
+}
+
+static inline bool_t xdr_longlong_t(XDR *xdrs, longlong_t *llp)
+{
+ BUILD_BUG_ON(sizeof (longlong_t) != 8);
+ return (xdrs->x_ops->xdr_u_longlong_t(xdrs, (u_longlong_t *)llp));
+}
+
+/*
+ * Fixed-length opaque data.
+ */
+static inline bool_t xdr_opaque(XDR *xdrs, caddr_t cp, const uint_t cnt)
+{
+ return (xdrs->x_ops->xdr_opaque(xdrs, cp, cnt));
+}
+
+/*
+ * Variable-length string.
+ * The *sp buffer must have (maxsize + 1) bytes.
+ */
+static inline bool_t xdr_string(XDR *xdrs, char **sp, const uint_t maxsize)
+{
+ return (xdrs->x_ops->xdr_string(xdrs, sp, maxsize));
+}
+
+/*
+ * Variable-length arrays.
+ */
+static inline bool_t xdr_array(XDR *xdrs, caddr_t *arrp, uint_t *sizep,
+ const uint_t maxsize, const uint_t elsize, const xdrproc_t elproc)
+{
+ return xdrs->x_ops->xdr_array(xdrs, arrp, sizep, maxsize, elsize,
+ elproc);
+}
+
+#endif /* _SPL_RPC_XDR_H */
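A round trip through this interface looks like the following sketch; error
checking is elided for brevity:

    /* Encode one 32-bit value into a buffer, then decode it back. */
    static void
    spl_xdr_example(void)
    {
            char buf[16];
            unsigned int in = 42, out = 0;
            XDR xdrs;

            xdrmem_create(&xdrs, buf, sizeof (buf), XDR_ENCODE);
            (void) xdr_u_int(&xdrs, &in);

            xdrmem_create(&xdrs, buf, sizeof (buf), XDR_DECODE);
            (void) xdr_u_int(&xdrs, &out);  /* out == 42 */
    }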
diff --git a/include/spl/sys/acl.h b/include/spl/sys/acl.h
new file mode 100644
index 000000000..9fc79c025
--- /dev/null
+++ b/include/spl/sys/acl.h
@@ -0,0 +1,119 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _SPL_ACL_H
+#define _SPL_ACL_H
+
+#include <sys/types.h>
+
+typedef struct ace {
+ uid_t a_who;
+ uint32_t a_access_mask;
+ uint16_t a_flags;
+ uint16_t a_type;
+} ace_t;
+
+typedef struct ace_object {
+ uid_t a_who; /* uid or gid */
+ uint32_t a_access_mask; /* read,write,... */
+ uint16_t a_flags; /* see below */
+ uint16_t a_type; /* allow or deny */
+ uint8_t a_obj_type[16]; /* obj type */
+ uint8_t a_inherit_obj_type[16]; /* inherit obj */
+} ace_object_t;
+
+#define MAX_ACL_ENTRIES 1024
+
+#define ACE_READ_DATA 0x00000001
+#define ACE_LIST_DIRECTORY 0x00000001
+#define ACE_WRITE_DATA 0x00000002
+#define ACE_ADD_FILE 0x00000002
+#define ACE_APPEND_DATA 0x00000004
+#define ACE_ADD_SUBDIRECTORY 0x00000004
+#define ACE_READ_NAMED_ATTRS 0x00000008
+#define ACE_WRITE_NAMED_ATTRS 0x00000010
+#define ACE_EXECUTE 0x00000020
+#define ACE_DELETE_CHILD 0x00000040
+#define ACE_READ_ATTRIBUTES 0x00000080
+#define ACE_WRITE_ATTRIBUTES 0x00000100
+#define ACE_DELETE 0x00010000
+#define ACE_READ_ACL 0x00020000
+#define ACE_WRITE_ACL 0x00040000
+#define ACE_WRITE_OWNER 0x00080000
+#define ACE_SYNCHRONIZE 0x00100000
+
+#define ACE_FILE_INHERIT_ACE 0x0001
+#define ACE_DIRECTORY_INHERIT_ACE 0x0002
+#define ACE_NO_PROPAGATE_INHERIT_ACE 0x0004
+#define ACE_INHERIT_ONLY_ACE 0x0008
+#define ACE_SUCCESSFUL_ACCESS_ACE_FLAG 0x0010
+#define ACE_FAILED_ACCESS_ACE_FLAG 0x0020
+#define ACE_IDENTIFIER_GROUP 0x0040
+#define ACE_INHERITED_ACE 0x0080
+#define ACE_OWNER 0x1000
+#define ACE_GROUP 0x2000
+#define ACE_EVERYONE 0x4000
+
+#define ACE_ACCESS_ALLOWED_ACE_TYPE 0x0000
+#define ACE_ACCESS_DENIED_ACE_TYPE 0x0001
+#define ACE_SYSTEM_AUDIT_ACE_TYPE 0x0002
+#define ACE_SYSTEM_ALARM_ACE_TYPE 0x0003
+
+#define ACL_AUTO_INHERIT 0x0001
+#define ACL_PROTECTED 0x0002
+#define ACL_DEFAULTED 0x0004
+#define ACL_FLAGS_ALL (ACL_AUTO_INHERIT|ACL_PROTECTED|ACL_DEFAULTED)
+
+#define ACE_ACCESS_ALLOWED_COMPOUND_ACE_TYPE 0x04
+#define ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE 0x05
+#define ACE_ACCESS_DENIED_OBJECT_ACE_TYPE 0x06
+#define ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE 0x07
+#define ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE 0x08
+#define ACE_ACCESS_ALLOWED_CALLBACK_ACE_TYPE 0x09
+#define ACE_ACCESS_DENIED_CALLBACK_ACE_TYPE 0x0A
+#define ACE_ACCESS_ALLOWED_CALLBACK_OBJECT_ACE_TYPE 0x0B
+#define ACE_ACCESS_DENIED_CALLBACK_OBJECT_ACE_TYPE 0x0C
+#define ACE_SYSTEM_AUDIT_CALLBACK_ACE_TYPE 0x0D
+#define ACE_SYSTEM_ALARM_CALLBACK_ACE_TYPE 0x0E
+#define ACE_SYSTEM_AUDIT_CALLBACK_OBJECT_ACE_TYPE 0x0F
+#define ACE_SYSTEM_ALARM_CALLBACK_OBJECT_ACE_TYPE 0x10
+
+#define ACE_ALL_TYPES 0x001F
+
+#define ACE_TYPE_FLAGS (ACE_OWNER|ACE_GROUP|ACE_EVERYONE|ACE_IDENTIFIER_GROUP)
+
+/* BEGIN CSTYLED */
+#define ACE_ALL_PERMS (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \
+ ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_READ_NAMED_ATTRS| \
+ ACE_WRITE_NAMED_ATTRS|ACE_EXECUTE|ACE_DELETE_CHILD|ACE_READ_ATTRIBUTES| \
+ ACE_WRITE_ATTRIBUTES|ACE_DELETE|ACE_READ_ACL|ACE_WRITE_ACL| \
+ ACE_WRITE_OWNER|ACE_SYNCHRONIZE)
+/* END CSTYLED */
+
+#define VSA_ACE 0x0010
+#define VSA_ACECNT 0x0020
+#define VSA_ACE_ALLTYPES 0x0040
+#define VSA_ACE_ACLFLAGS 0x0080
+
+#endif /* _SPL_ACL_H */
diff --git a/include/spl/sys/atomic.h b/include/spl/sys/atomic.h
new file mode 100644
index 000000000..51b547923
--- /dev/null
+++ b/include/spl/sys/atomic.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _SPL_ATOMIC_H
+#define _SPL_ATOMIC_H
+
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <sys/types.h>
+
+/*
+ * Map the atomic_* functions to the Linux counterparts. This relies on the
+ * fact that the atomic types are internally really a uint32 or uint64. If
+ * this were to change an alternate approach would be needed.
+ *
+ * N.B. Due to the limitations of the original API atomicity is not strictly
+ * preserved when using the 64-bit functions on a 32-bit system. In order
+ * to support this all consumers would need to be updated to use the Linux
+ * provided atomic_t and atomic64_t types.
+ */
+#define atomic_inc_32(v) atomic_inc((atomic_t *)(v))
+#define atomic_dec_32(v) atomic_dec((atomic_t *)(v))
+#define atomic_add_32(v, i) atomic_add((i), (atomic_t *)(v))
+#define atomic_sub_32(v, i) atomic_sub((i), (atomic_t *)(v))
+#define atomic_inc_32_nv(v) atomic_inc_return((atomic_t *)(v))
+#define atomic_dec_32_nv(v) atomic_dec_return((atomic_t *)(v))
+#define atomic_add_32_nv(v, i) atomic_add_return((i), (atomic_t *)(v))
+#define atomic_sub_32_nv(v, i) atomic_sub_return((i), (atomic_t *)(v))
+#define atomic_cas_32(v, x, y) atomic_cmpxchg((atomic_t *)(v), x, y)
+#define atomic_swap_32(v, x) atomic_xchg((atomic_t *)(v), x)
+#define atomic_inc_64(v) atomic64_inc((atomic64_t *)(v))
+#define atomic_dec_64(v) atomic64_dec((atomic64_t *)(v))
+#define atomic_add_64(v, i) atomic64_add((i), (atomic64_t *)(v))
+#define atomic_sub_64(v, i) atomic64_sub((i), (atomic64_t *)(v))
+#define atomic_inc_64_nv(v) atomic64_inc_return((atomic64_t *)(v))
+#define atomic_dec_64_nv(v) atomic64_dec_return((atomic64_t *)(v))
+#define atomic_add_64_nv(v, i) atomic64_add_return((i), (atomic64_t *)(v))
+#define atomic_sub_64_nv(v, i) atomic64_sub_return((i), (atomic64_t *)(v))
+#define atomic_cas_64(v, x, y) atomic64_cmpxchg((atomic64_t *)(v), x, y)
+#define atomic_swap_64(v, x) atomic64_xchg((atomic64_t *)(v), x)
+
+#ifdef _LP64
+static __inline__ void *
+atomic_cas_ptr(volatile void *target, void *cmp, void *newval)
+{
+ return ((void *)atomic_cas_64((volatile uint64_t *)target,
+ (uint64_t)cmp, (uint64_t)newval));
+}
+#else /* _LP64 */
+static __inline__ void *
+atomic_cas_ptr(volatile void *target, void *cmp, void *newval)
+{
+ return ((void *)atomic_cas_32((volatile uint32_t *)target,
+ (uint32_t)cmp, (uint32_t)newval));
+}
+#endif /* _LP64 */
+
+#endif /* _SPL_ATOMIC_H */
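For illustration, the Solaris-style wrappers above behave as in this minimal
sketch (the spl_example_* names are hypothetical):

    static uint64_t spl_example_count = 0;

    static uint64_t
    spl_example_bump(void)
    {
            atomic_add_64(&spl_example_count, 2);
            /* the *_nv variants return the new value: initial + 3 here */
            return (atomic_inc_64_nv(&spl_example_count));
    }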
diff --git a/include/spl/sys/byteorder.h b/include/spl/sys/byteorder.h
new file mode 100644
index 000000000..477707996
--- /dev/null
+++ b/include/spl/sys/byteorder.h
@@ -0,0 +1,78 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _SPL_BYTEORDER_H
+#define _SPL_BYTEORDER_H
+
+#include <asm/byteorder.h>
+#include <sys/isa_defs.h>
+
+#define BSWAP_8(x) ((x) & 0xff)
+#define BSWAP_16(x) ((BSWAP_8(x) << 8) | BSWAP_8((x) >> 8))
+#define BSWAP_32(x) ((BSWAP_16(x) << 16) | BSWAP_16((x) >> 16))
+#define BSWAP_64(x) ((BSWAP_32(x) << 32) | BSWAP_32((x) >> 32))
+
+#define LE_16(x) cpu_to_le16(x)
+#define LE_32(x) cpu_to_le32(x)
+#define LE_64(x) cpu_to_le64(x)
+#define BE_16(x) cpu_to_be16(x)
+#define BE_32(x) cpu_to_be32(x)
+#define BE_64(x) cpu_to_be64(x)
+
+#define BE_IN8(xa) \
+ *((uint8_t *)(xa))
+
+#define BE_IN16(xa) \
+ (((uint16_t)BE_IN8(xa) << 8) | BE_IN8((uint8_t *)(xa)+1))
+
+#define BE_IN32(xa) \
+ (((uint32_t)BE_IN16(xa) << 16) | BE_IN16((uint8_t *)(xa)+2))
+
+#ifdef _BIG_ENDIAN
+static __inline__ uint64_t
+htonll(uint64_t n)
+{
+ return (n);
+}
+
+static __inline__ uint64_t
+ntohll(uint64_t n)
+{
+ return (n);
+}
+#else
+static __inline__ uint64_t
+htonll(uint64_t n)
+{
+ return ((((uint64_t)htonl(n)) << 32) + htonl(n >> 32));
+}
+
+static __inline__ uint64_t
+ntohll(uint64_t n)
+{
+ return ((((uint64_t)ntohl(n)) << 32) + ntohl(n >> 32));
+}
+#endif
+
+#endif /* _SPL_BYTEORDER_H */
diff --git a/include/spl/sys/callb.h b/include/spl/sys/callb.h
new file mode 100644
index 000000000..f1826bfd3
--- /dev/null
+++ b/include/spl/sys/callb.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _SPL_CALLB_H
+#define _SPL_CALLB_H
+
+#include <linux/module.h>
+#include <sys/mutex.h>
+
+#define CALLB_CPR_ASSERT(cp) ASSERT(MUTEX_HELD((cp)->cc_lockp));
+
+typedef struct callb_cpr {
+ kmutex_t *cc_lockp;
+} callb_cpr_t;
+
+#define CALLB_CPR_INIT(cp, lockp, func, name) { \
+ (cp)->cc_lockp = lockp; \
+}
+
+#define CALLB_CPR_SAFE_BEGIN(cp) { \
+ CALLB_CPR_ASSERT(cp); \
+}
+
+#define CALLB_CPR_SAFE_END(cp, lockp) { \
+ CALLB_CPR_ASSERT(cp); \
+}
+
+#define CALLB_CPR_EXIT(cp) { \
+ ASSERT(MUTEX_HELD((cp)->cc_lockp)); \
+ mutex_exit((cp)->cc_lockp); \
+}
+
+#endif /* _SPL_CALLB_H */
diff --git a/include/spl/sys/callo.h b/include/spl/sys/callo.h
new file mode 100644
index 000000000..c43ac92e7
--- /dev/null
+++ b/include/spl/sys/callo.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright (C) 2007-2013 Lawrence Livermore National Security, LLC.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _SPL_CALLO_H
+#define _SPL_CALLO_H
+
+/*
+ * Callout flags:
+ *
+ * CALLOUT_FLAG_ROUNDUP
+ * Roundup the expiration time to the next resolution boundary.
+ * If this flag is not specified, the expiration time is rounded down.
+ * CALLOUT_FLAG_ABSOLUTE
+ * Normally, the expiration passed to the timeout API functions is an
+ * expiration interval. If this flag is specified, then it is
+ * interpreted as the expiration time itself.
+ * CALLOUT_FLAG_HRESTIME
+ * Normally, callouts are not affected by changes to system time
+ * (hrestime). This flag is used to create a callout that is affected
+ * by system time. If system time changes, these timers must be
+ * handled in a special way (see callout.c). These are used by condition
+ * variables and LWP timers that need this behavior.
+ * CALLOUT_FLAG_32BIT
+ * Legacy interfaces timeout() and realtime_timeout() pass this flag
+ * to timeout_generic() to indicate that a 32-bit ID should be allocated.
+ */
+#define CALLOUT_FLAG_ROUNDUP 0x1
+#define CALLOUT_FLAG_ABSOLUTE 0x2
+#define CALLOUT_FLAG_HRESTIME 0x4
+#define CALLOUT_FLAG_32BIT 0x8
+
+#endif /* _SPL_CALLO_H */
diff --git a/include/spl/sys/cmn_err.h b/include/spl/sys/cmn_err.h
new file mode 100644
index 000000000..be57358b0
--- /dev/null
+++ b/include/spl/sys/cmn_err.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _SPL_CMN_ERR_H
+#define _SPL_CMN_ERR_H
+
+#include <stdarg.h>
+
+#define CE_CONT 0 /* continuation */
+#define CE_NOTE 1 /* notice */
+#define CE_WARN 2 /* warning */
+#define CE_PANIC 3 /* panic */
+#define CE_IGNORE 4 /* print nothing */
+
+extern void cmn_err(int, const char *, ...);
+extern void vcmn_err(int, const char *, va_list);
+extern void vpanic(const char *, va_list);
+
+#define fm_panic panic
+
+#endif /* _SPL_CMN_ERR_H */
diff --git a/include/spl/sys/condvar.h b/include/spl/sys/condvar.h
new file mode 100644
index 000000000..1d47cdd96
--- /dev/null
+++ b/include/spl/sys/condvar.h
@@ -0,0 +1,80 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _SPL_CONDVAR_H
+#define _SPL_CONDVAR_H
+
+#include <linux/module.h>
+#include <sys/kmem.h>
+#include <sys/mutex.h>
+#include <sys/callo.h>
+#include <sys/wait.h>
+
+/*
+ * The kcondvar_t struct is protected by a mutex taken externally before
+ * calling any of the wait/signal functions and passed into the wait
+ * functions.
+ */
+#define CV_MAGIC 0x346545f4
+#define CV_DESTROY 0x346545f5
+
+typedef struct {
+ int cv_magic;
+ spl_wait_queue_head_t cv_event;
+ spl_wait_queue_head_t cv_destroy;
+ atomic_t cv_refs;
+ atomic_t cv_waiters;
+ kmutex_t *cv_mutex;
+} kcondvar_t;
+
+typedef enum { CV_DEFAULT = 0, CV_DRIVER } kcv_type_t;
+
+extern void __cv_init(kcondvar_t *, char *, kcv_type_t, void *);
+extern void __cv_destroy(kcondvar_t *);
+extern void __cv_wait(kcondvar_t *, kmutex_t *);
+extern void __cv_wait_io(kcondvar_t *, kmutex_t *);
+extern void __cv_wait_sig(kcondvar_t *, kmutex_t *);
+extern clock_t __cv_timedwait(kcondvar_t *, kmutex_t *, clock_t);
+extern clock_t __cv_timedwait_io(kcondvar_t *, kmutex_t *, clock_t);
+extern clock_t __cv_timedwait_sig(kcondvar_t *, kmutex_t *, clock_t);
+extern clock_t cv_timedwait_hires(kcondvar_t *, kmutex_t *, hrtime_t,
+ hrtime_t res, int flag);
+extern clock_t cv_timedwait_sig_hires(kcondvar_t *, kmutex_t *, hrtime_t,
+ hrtime_t res, int flag);
+extern void __cv_signal(kcondvar_t *);
+extern void __cv_broadcast(kcondvar_t *c);
+
+#define cv_init(cvp, name, type, arg) __cv_init(cvp, name, type, arg)
+#define cv_destroy(cvp) __cv_destroy(cvp)
+#define cv_wait(cvp, mp) __cv_wait(cvp, mp)
+#define cv_wait_io(cvp, mp) __cv_wait_io(cvp, mp)
+#define cv_wait_sig(cvp, mp) __cv_wait_sig(cvp, mp)
+#define cv_wait_interruptible(cvp, mp) cv_wait_sig(cvp, mp)
+#define cv_timedwait(cvp, mp, t) __cv_timedwait(cvp, mp, t)
+#define cv_timedwait_io(cvp, mp, t) __cv_timedwait_io(cvp, mp, t)
+#define cv_timedwait_sig(cvp, mp, t) __cv_timedwait_sig(cvp, mp, t)
+#define cv_timedwait_interruptible(cvp, mp, t) cv_timedwait_sig(cvp, mp, t)
+#define cv_signal(cvp) __cv_signal(cvp)
+#define cv_broadcast(cvp) __cv_broadcast(cvp)
+
+#endif /* _SPL_CONDVAR_H */
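The externally-held-mutex contract noted above yields the classic wait
pattern sketched here (spl_lock, spl_cv, and spl_ready are hypothetical, and
their mutex_init()/cv_init() calls are elided):

    static kmutex_t spl_lock;
    static kcondvar_t spl_cv;
    static int spl_ready;

    static void
    spl_wait_ready(void)
    {
            mutex_enter(&spl_lock);
            while (!spl_ready)
                    cv_wait(&spl_cv, &spl_lock);    /* drops and retakes spl_lock */
            mutex_exit(&spl_lock);
    }

    /* a producer sets spl_ready under spl_lock and calls cv_broadcast(&spl_cv) */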
diff --git a/include/spl/sys/console.h b/include/spl/sys/console.h
new file mode 100644
index 000000000..3469cb762
--- /dev/null
+++ b/include/spl/sys/console.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _SPL_CONSOLE_H
+#define _SPL_CONSOLE_H
+
+#include <linux/kernel.h>
+
+/*
+ * Defined static inline so this header can be included by multiple
+ * translation units without producing duplicate symbol definitions.
+ */
+static inline void
+console_vprintf(const char *fmt, va_list args)
+{
+	vprintk(fmt, args);
+}
+
+static inline void
+console_printf(const char *fmt, ...)
+{
+	va_list args;
+
+	va_start(args, fmt);
+	console_vprintf(fmt, args);
+	va_end(args);
+}
+
+#endif /* _SPL_CONSOLE_H */
diff --git a/include/spl/sys/cred.h b/include/spl/sys/cred.h
new file mode 100644
index 000000000..fd063399b
--- /dev/null
+++ b/include/spl/sys/cred.h
@@ -0,0 +1,75 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _SPL_CRED_H
+#define _SPL_CRED_H
+
+#include <linux/module.h>
+#include <linux/cred.h>
+#include <sys/types.h>
+#include <sys/vfs.h>
+
+typedef struct cred cred_t;
+
+#define kcred ((cred_t *)(init_task.cred))
+#define CRED() ((cred_t *)current_cred())
+
+/* Linux 4.9 API change, GROUP_AT was removed */
+#ifndef GROUP_AT
+#define GROUP_AT(gi, i) ((gi)->gid[i])
+#endif
+
+#ifdef HAVE_KUIDGID_T
+
+#define KUID_TO_SUID(x) (__kuid_val(x))
+#define KGID_TO_SGID(x) (__kgid_val(x))
+#define SUID_TO_KUID(x) (KUIDT_INIT(x))
+#define SGID_TO_KGID(x) (KGIDT_INIT(x))
+#define KGIDP_TO_SGIDP(x) (&(x)->val)
+
+#else /* HAVE_KUIDGID_T */
+
+#define KUID_TO_SUID(x) (x)
+#define KGID_TO_SGID(x) (x)
+#define SUID_TO_KUID(x) (x)
+#define SGID_TO_KGID(x) (x)
+#define KGIDP_TO_SGIDP(x) (x)
+
+#endif /* HAVE_KUIDGID_T */
+
+extern void crhold(cred_t *cr);
+extern void crfree(cred_t *cr);
+extern uid_t crgetuid(const cred_t *cr);
+extern uid_t crgetruid(const cred_t *cr);
+extern uid_t crgetsuid(const cred_t *cr);
+extern uid_t crgetfsuid(const cred_t *cr);
+extern gid_t crgetgid(const cred_t *cr);
+extern gid_t crgetrgid(const cred_t *cr);
+extern gid_t crgetsgid(const cred_t *cr);
+extern gid_t crgetfsgid(const cred_t *cr);
+extern int crgetngroups(const cred_t *cr);
+extern gid_t *crgetgroups(const cred_t *cr);
+extern int groupmember(gid_t gid, const cred_t *cr);
+
+#endif /* _SPL_CRED_H */
diff --git a/include/spl/sys/ctype.h b/include/spl/sys/ctype.h
new file mode 100644
index 000000000..18beb1daa
--- /dev/null
+++ b/include/spl/sys/ctype.h
@@ -0,0 +1,30 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _SPL_CTYPE_H
+#define _SPL_CTYPE_H
+
+#include <linux/ctype.h>
+
+#endif /* _SPL_CTYPE_H */
diff --git a/include/spl/sys/debug.h b/include/spl/sys/debug.h
new file mode 100644
index 000000000..a4a458066
--- /dev/null
+++ b/include/spl/sys/debug.h
@@ -0,0 +1,131 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/*
+ * Available Solaris debug functions. All of the ASSERT() macros are
+ * compiled out when NDEBUG is defined; this is the default behavior for
+ * the SPL. To enable assertions use the --enable-debug option with
+ * configure. The VERIFY() functions are never compiled out and cannot
+ * be disabled.
+ *
+ * PANIC() - Panic the node and print message.
+ * ASSERT() - Assert X is true, if not panic.
+ * ASSERTV() - Wraps a variable declaration which is only used by ASSERT().
+ * ASSERT3B() - Assert boolean X OP Y is true, if not panic.
+ * ASSERT3S() - Assert signed X OP Y is true, if not panic.
+ * ASSERT3U() - Assert unsigned X OP Y is true, if not panic.
+ * ASSERT3P() - Assert pointer X OP Y is true, if not panic.
+ * ASSERT0() - Assert value is zero, if not panic.
+ * VERIFY() - Verify X is true, if not panic.
+ * VERIFY3B() - Verify boolean X OP Y is true, if not panic.
+ * VERIFY3S() - Verify signed X OP Y is true, if not panic.
+ * VERIFY3U() - Verify unsigned X OP Y is true, if not panic.
+ * VERIFY3P() - Verify pointer X OP Y is true, if not panic.
+ * VERIFY0() - Verify value is zero, if not panic.
+ */
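+
+/*
+ * Usage sketch (hypothetical variables): ASSERT*() compiles away when
+ * NDEBUG is set, so its arguments must be free of side effects, while
+ * VERIFY*() is always evaluated.
+ *
+ *	ASSERT3U(offset, <=, size);	assertion, debug builds only
+ *	VERIFY3P(ptr, !=, NULL);	checked in all builds
+ *	VERIFY0(error);			panics unless error == 0
+ */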
+
+#ifndef _SPL_DEBUG_H
+#define _SPL_DEBUG_H
+
+/*
+ * Common DEBUG functionality.
+ */
+int spl_panic(const char *file, const char *func, int line,
+ const char *fmt, ...);
+void spl_dumpstack(void);
+
+/* BEGIN CSTYLED */
+#define PANIC(fmt, a...) \
+ spl_panic(__FILE__, __FUNCTION__, __LINE__, fmt, ## a)
+
+#define VERIFY(cond) \
+ (void) (unlikely(!(cond)) && \
+ spl_panic(__FILE__, __FUNCTION__, __LINE__, \
+ "%s", "VERIFY(" #cond ") failed\n"))
+
+#define VERIFY3_IMPL(LEFT, OP, RIGHT, TYPE, FMT, CAST) do { \
+ TYPE _verify3_left = (TYPE)(LEFT); \
+ TYPE _verify3_right = (TYPE)(RIGHT); \
+ if (!(_verify3_left OP _verify3_right)) \
+ spl_panic(__FILE__, __FUNCTION__, __LINE__, \
+ "VERIFY3(" #LEFT " " #OP " " #RIGHT ") " \
+ "failed (" FMT " " #OP " " FMT ")\n", \
+ CAST (_verify3_left), CAST (_verify3_right)); \
+ } while (0)
+
+#define VERIFY3B(x,y,z) VERIFY3_IMPL(x, y, z, boolean_t, "%d", (boolean_t))
+#define VERIFY3S(x,y,z) VERIFY3_IMPL(x, y, z, int64_t, "%lld", (long long))
+#define VERIFY3U(x,y,z) VERIFY3_IMPL(x, y, z, uint64_t, "%llu", \
+ (unsigned long long))
+#define VERIFY3P(x,y,z) VERIFY3_IMPL(x, y, z, uintptr_t, "%p", (void *))
+#define VERIFY0(x) VERIFY3_IMPL(0, ==, x, int64_t, "%lld", (long long))
+
+#define CTASSERT_GLOBAL(x) _CTASSERT(x, __LINE__)
+#define CTASSERT(x) { _CTASSERT(x, __LINE__); }
+#define _CTASSERT(x, y) __CTASSERT(x, y)
+#define __CTASSERT(x, y) \
+ typedef char __attribute__ ((unused)) \
+ __compile_time_assertion__ ## y[(x) ? 1 : -1]
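+
+/*
+ * Usage sketch: a false condition yields a negative array size and
+ * therefore a compile-time failure; the conditions below are examples.
+ *
+ *	CTASSERT(sizeof (uint64_t) == 8);
+ *	CTASSERT_GLOBAL(sizeof (void *) <= 8);
+ */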
+
+/*
+ * Debugging disabled (--disable-debug)
+ */
+#ifdef NDEBUG
+
+#define SPL_DEBUG_STR ""
+#define ASSERT(x) ((void)0)
+#define ASSERTV(x)
+#define ASSERT3B(x,y,z) ((void)0)
+#define ASSERT3S(x,y,z) ((void)0)
+#define ASSERT3U(x,y,z) ((void)0)
+#define ASSERT3P(x,y,z) ((void)0)
+#define ASSERT0(x) ((void)0)
+#define IMPLY(A, B) ((void)0)
+#define EQUIV(A, B) ((void)0)
+
+/*
+ * Debugging enabled (--enable-debug)
+ */
+#else
+
+#define SPL_DEBUG_STR " (DEBUG mode)"
+#define ASSERT(cond) VERIFY(cond)
+#define ASSERTV(x) x
+#define ASSERT3B(x,y,z) VERIFY3B(x, y, z)
+#define ASSERT3S(x,y,z) VERIFY3S(x, y, z)
+#define ASSERT3U(x,y,z) VERIFY3U(x, y, z)
+#define ASSERT3P(x,y,z) VERIFY3P(x, y, z)
+#define ASSERT0(x) VERIFY0(x)
+#define IMPLY(A, B) \
+ ((void)(((!(A)) || (B)) || \
+ spl_panic(__FILE__, __FUNCTION__, __LINE__, \
+ "(" #A ") implies (" #B ")")))
+#define EQUIV(A, B) \
+ ((void)((!!(A) == !!(B)) || \
+ spl_panic(__FILE__, __FUNCTION__, __LINE__, \
+ "(" #A ") is equivalent to (" #B ")")))
+/* END CSTYLED */
+
+#endif /* NDEBUG */
+
+#endif /* _SPL_DEBUG_H */
diff --git a/include/spl/sys/disp.h b/include/spl/sys/disp.h
new file mode 100644
index 000000000..413b623c8
--- /dev/null
+++ b/include/spl/sys/disp.h
@@ -0,0 +1,34 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _SPL_DISP_H
+#define _SPL_DISP_H
+
+#include <linux/preempt.h>
+
+#define kpreempt(unused) schedule()
+#define kpreempt_disable() preempt_disable()
+#define kpreempt_enable() preempt_enable()
+
+#endif /* _SPL_DISP_H */
diff --git a/include/spl/sys/dkio.h b/include/spl/sys/dkio.h
new file mode 100644
index 000000000..49f166a9c
--- /dev/null
+++ b/include/spl/sys/dkio.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _SPL_DKIO_H
+#define _SPL_DKIO_H
+
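+/*
+ * Size in bytes of a dkioc_free_list_t holding num_exts extents; the
+ * struct already includes one extent, and each extent record is two
+ * uint64_t's (16 bytes), hence the (num_exts - 1) * 16 term.
+ */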
+#define DFL_SZ(num_exts) \
+ (sizeof (dkioc_free_list_t) + (num_exts - 1) * 16)
+
+#define DKIOC (0x04 << 8)
+#define DKIOCFLUSHWRITECACHE (DKIOC|34) /* flush cache to phys medium */
+
+/*
+ * ioctl to free space (e.g. SCSI UNMAP) off a disk.
+ * Pass a dkioc_free_list_t containing a list of extents to be freed.
+ */
+#define DKIOCFREE (DKIOC|50)
+
+#endif /* _SPL_DKIO_H */
diff --git a/include/spl/sys/dkioc_free_util.h b/include/spl/sys/dkioc_free_util.h
new file mode 100644
index 000000000..d519b2f8e
--- /dev/null
+++ b/include/spl/sys/dkioc_free_util.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _SPL_DKIOC_UTIL_H
+#define _SPL_DKIOC_UTIL_H
+
+#include <sys/dkio.h>
+#include <sys/vmem.h>
+
+typedef struct dkioc_free_list_ext_s {
+ uint64_t dfle_start;
+ uint64_t dfle_length;
+} dkioc_free_list_ext_t;
+
+typedef struct dkioc_free_list_s {
+ uint64_t dfl_flags;
+ uint64_t dfl_num_exts;
+ int64_t dfl_offset;
+
+ /*
+ * N.B. this is only an internal debugging API! This is only called
+ * from debug builds of sd for pre-release checking. Remove before GA!
+ */
+ void (*dfl_ck_func)(uint64_t, uint64_t, void *);
+ void *dfl_ck_arg;
+
+ dkioc_free_list_ext_t dfl_exts[1];
+} dkioc_free_list_t;
+
+static inline void
+dfl_free(dkioc_free_list_t *dfl)
+{
+	vmem_free(dfl, DFL_SZ(dfl->dfl_num_exts));
+}
+
+static inline dkioc_free_list_t *
+dfl_alloc(uint64_t dfl_num_exts, int flags)
+{
+	return (vmem_zalloc(DFL_SZ(dfl_num_exts), flags));
+}
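+
+/*
+ * Usage sketch (hypothetical offsets/lengths): build a two-extent free
+ * list for a DKIOCFREE-style request and release it afterwards; the
+ * caller is responsible for setting dfl_num_exts, which dfl_free()
+ * uses to recompute the allocation size.
+ *
+ *	dkioc_free_list_t *dfl = dfl_alloc(2, KM_SLEEP);
+ *	dfl->dfl_num_exts = 2;
+ *	dfl->dfl_exts[0].dfle_start = 0;
+ *	dfl->dfl_exts[0].dfle_length = 4096;
+ *	dfl->dfl_exts[1].dfle_start = 8192;
+ *	dfl->dfl_exts[1].dfle_length = 4096;
+ *	...
+ *	dfl_free(dfl);
+ */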
+
+#endif /* _SPL_DKIOC_UTIL_H */
diff --git a/include/spl/sys/fcntl.h b/include/spl/sys/fcntl.h
new file mode 100644
index 000000000..3faa5dad7
--- /dev/null
+++ b/include/spl/sys/fcntl.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (C) 2010 Lawrence Livermore National Security, LLC.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _SPL_FCNTL_H
+#define _SPL_FCNTL_H
+
+#include <asm/fcntl.h>
+
+#define F_FREESP 11
+
+#ifdef CONFIG_64BIT
+typedef struct flock flock64_t;
+#else
+typedef struct flock64 flock64_t;
+#endif /* CONFIG_64BIT */
+
+#endif /* _SPL_FCNTL_H */
diff --git a/include/spl/sys/file.h b/include/spl/sys/file.h
new file mode 100644
index 000000000..05dbc0814
--- /dev/null
+++ b/include/spl/sys/file.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _SPL_FILE_H
+#define _SPL_FILE_H
+
+#define FIGNORECASE 0x00080000
+#define FKIOCTL 0x80000000
+#define ED_CASE_CONFLICT 0x10
+
+#ifdef HAVE_INODE_LOCK_SHARED
+#define spl_inode_lock(ip) inode_lock(ip)
+#define spl_inode_unlock(ip) inode_unlock(ip)
+#define spl_inode_lock_shared(ip) inode_lock_shared(ip)
+#define spl_inode_unlock_shared(ip) inode_unlock_shared(ip)
+#define spl_inode_trylock(ip) inode_trylock(ip)
+#define spl_inode_trylock_shared(ip) inode_trylock_shared(ip)
+#define spl_inode_is_locked(ip) inode_is_locked(ip)
+#define spl_inode_lock_nested(ip, s) inode_lock_nested(ip, s)
+#else
+#define spl_inode_lock(ip) mutex_lock(&(ip)->i_mutex)
+#define spl_inode_unlock(ip) mutex_unlock(&(ip)->i_mutex)
+#define spl_inode_lock_shared(ip) mutex_lock(&(ip)->i_mutex)
+#define spl_inode_unlock_shared(ip) mutex_unlock(&(ip)->i_mutex)
+#define spl_inode_trylock(ip) mutex_trylock(&(ip)->i_mutex)
+#define spl_inode_trylock_shared(ip) mutex_trylock(&(ip)->i_mutex)
+#define spl_inode_is_locked(ip) mutex_is_locked(&(ip)->i_mutex)
+#define spl_inode_lock_nested(ip, s) mutex_lock_nested(&(ip)->i_mutex, s)
+#endif
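+
+/*
+ * Usage sketch (`ip` is a hypothetical struct inode pointer): the same
+ * call sites work whether or not the kernel provides inode_lock().
+ *
+ *	spl_inode_lock(ip);
+ *	... update the inode ...
+ *	spl_inode_unlock(ip);
+ */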
+
+#endif /* _SPL_FILE_H */
diff --git a/include/spl/sys/inttypes.h b/include/spl/sys/inttypes.h
new file mode 100644
index 000000000..92e76206b
--- /dev/null
+++ b/include/spl/sys/inttypes.h
@@ -0,0 +1,28 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _SPL_INTTYPES_H
+#define _SPL_INTTYPES_H
+
+#endif /* _SPL_INTTYPES_H */
diff --git a/include/spl/sys/isa_defs.h b/include/spl/sys/isa_defs.h
new file mode 100644
index 000000000..089ae0f85
--- /dev/null
+++ b/include/spl/sys/isa_defs.h
@@ -0,0 +1,229 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _SPL_ISA_DEFS_H
+#define _SPL_ISA_DEFS_H
+
+/* x86_64 arch specific defines */
+#if defined(__x86_64) || defined(__x86_64__)
+
+#if !defined(__x86_64)
+#define __x86_64
+#endif
+
+#if !defined(__amd64)
+#define __amd64
+#endif
+
+#if !defined(__x86)
+#define __x86
+#endif
+
+#if !defined(_LP64)
+#define _LP64
+#endif
+
+#define _ALIGNMENT_REQUIRED 1
+
+
+/* i386 arch specific defines */
+#elif defined(__i386) || defined(__i386__)
+
+#if !defined(__i386)
+#define __i386
+#endif
+
+#if !defined(__x86)
+#define __x86
+#endif
+
+#if !defined(_ILP32)
+#define _ILP32
+#endif
+
+#define _ALIGNMENT_REQUIRED 0
+
+/* powerpc (ppc64) arch specific defines */
+#elif defined(__powerpc) || defined(__powerpc__) || defined(__powerpc64__)
+
+#if !defined(__powerpc)
+#define __powerpc
+#endif
+
+#if !defined(__powerpc__)
+#define __powerpc__
+#endif
+
+#if defined(__powerpc64__)
+#if !defined(_LP64)
+#define _LP64
+#endif
+#else
+#if !defined(_ILP32)
+#define _ILP32
+#endif
+#endif
+
+/*
+ * Illumos doesn't define _ALIGNMENT_REQUIRED for PPC, so default to 1
+ * out of paranoia.
+ */
+#define _ALIGNMENT_REQUIRED 1
+
+/* arm arch specific defines */
+#elif defined(__arm) || defined(__arm__) || defined(__aarch64__)
+
+#if !defined(__arm)
+#define __arm
+#endif
+
+#if !defined(__arm__)
+#define __arm__
+#endif
+
+#if defined(__aarch64__)
+#if !defined(_LP64)
+#define _LP64
+#endif
+#else
+#if !defined(_ILP32)
+#define _ILP32
+#endif
+#endif
+
+#if defined(__ARMEL__) || defined(__AARCH64EL__)
+#define _LITTLE_ENDIAN
+#else
+#define _BIG_ENDIAN
+#endif
+
+/*
+ * Illumos doesn't define _ALIGNMENT_REQUIRED for ARM, so default to 1
+ * out of paranoia.
+ */
+#define _ALIGNMENT_REQUIRED 1
+
+/* sparc arch specific defines */
+#elif defined(__sparc) || defined(__sparc__)
+
+#if !defined(__sparc)
+#define __sparc
+#endif
+
+#if !defined(__sparc__)
+#define __sparc__
+#endif
+
+#if defined(__arch64__)
+#if !defined(_LP64)
+#define _LP64
+#endif
+#else
+#if !defined(_ILP32)
+#define _ILP32
+#endif
+#endif
+
+#define _BIG_ENDIAN
+#define _SUNOS_VTOC_16
+#define _ALIGNMENT_REQUIRED 1
+
+/* s390 arch specific defines */
+#elif defined(__s390__)
+#if defined(__s390x__)
+#if !defined(_LP64)
+#define _LP64
+#endif
+#else
+#if !defined(_ILP32)
+#define _ILP32
+#endif
+#endif
+
+#define _BIG_ENDIAN
+
+/*
+ * Illumos doesn't define _ALIGNMENT_REQUIRED for s390, so default to 1
+ * out of paranoia.
+ */
+#define _ALIGNMENT_REQUIRED 1
+
+/* MIPS arch specific defines */
+#elif defined(__mips__)
+
+#if defined(__MIPSEB__)
+#define _BIG_ENDIAN
+#elif defined(__MIPSEL__)
+#define _LITTLE_ENDIAN
+#else
+#error "MIPS: no endianness specified"
+#endif
+
+#ifndef _LP64
+#define _ILP32
+#endif
+
+#define _SUNOS_VTOC_16
+
+/*
+ * Illumos doesn't define _ALIGNMENT_REQUIRED for MIPS, so default to 1
+ * out of paranoia.
+ */
+#define _ALIGNMENT_REQUIRED 1
+
+#else
+/*
+ * Currently supported:
+ * x86_64, i386, arm, powerpc, s390, sparc, and mips
+ */
+#error "Unsupported ISA type"
+#endif
+
+#if defined(_ILP32) && defined(_LP64)
+#error "Both _ILP32 and _LP64 are defined"
+#endif
+
+#if !defined(_ILP32) && !defined(_LP64)
+#error "Neither _ILP32 or _LP64 are defined"
+#endif
+
+#include <sys/byteorder.h>
+
+#if defined(__LITTLE_ENDIAN) && !defined(_LITTLE_ENDIAN)
+#define _LITTLE_ENDIAN __LITTLE_ENDIAN
+#endif
+
+#if defined(__BIG_ENDIAN) && !defined(_BIG_ENDIAN)
+#define _BIG_ENDIAN __BIG_ENDIAN
+#endif
+
+#if defined(_LITTLE_ENDIAN) && defined(_BIG_ENDIAN)
+#error "Both _LITTLE_ENDIAN and _BIG_ENDIAN are defined"
+#endif
+
+#if !defined(_LITTLE_ENDIAN) && !defined(_BIG_ENDIAN)
+#error "Neither _LITTLE_ENDIAN or _BIG_ENDIAN are defined"
+#endif
+
+#endif /* _SPL_ISA_DEFS_H */
diff --git a/include/spl/sys/kmem.h b/include/spl/sys/kmem.h
new file mode 100644
index 000000000..d6b428551
--- /dev/null
+++ b/include/spl/sys/kmem.h
@@ -0,0 +1,185 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _SPL_KMEM_H
+#define _SPL_KMEM_H
+
+#include <sys/debug.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+
+extern int kmem_debugging(void);
+extern char *kmem_vasprintf(const char *fmt, va_list ap);
+extern char *kmem_asprintf(const char *fmt, ...);
+extern char *strdup(const char *str);
+extern void strfree(char *str);
+
+/*
+ * Memory allocation interfaces
+ */
+#define KM_SLEEP 0x0000 /* can block for memory; success guaranteed */
+#define KM_NOSLEEP 0x0001 /* cannot block for memory; may fail */
+#define KM_PUSHPAGE 0x0004 /* can block for memory; may use reserve */
+#define KM_ZERO 0x1000 /* zero the allocation */
+#define KM_VMEM 0x2000 /* caller is vmem_* wrapper */
+
+#define KM_PUBLIC_MASK (KM_SLEEP | KM_NOSLEEP | KM_PUSHPAGE)
+
+static inline int spl_fstrans_check(void);
+
+/*
+ * Convert a KM_* flags mask to its Linux GFP_* counterpart. The
+ * conversion is context aware, which means that KM_SLEEP allocations
+ * may be used safely in syncing contexts that have set PF_FSTRANS.
+ */
+static inline gfp_t
+kmem_flags_convert(int flags)
+{
+ gfp_t lflags = __GFP_NOWARN | __GFP_COMP;
+
+ if (flags & KM_NOSLEEP) {
+ lflags |= GFP_ATOMIC | __GFP_NORETRY;
+ } else {
+ lflags |= GFP_KERNEL;
+ if (spl_fstrans_check())
+ lflags &= ~(__GFP_IO|__GFP_FS);
+ }
+
+ if (flags & KM_PUSHPAGE)
+ lflags |= __GFP_HIGH;
+
+ if (flags & KM_ZERO)
+ lflags |= __GFP_ZERO;
+
+ return (lflags);
+}
+
+typedef struct {
+ struct task_struct *fstrans_thread;
+ unsigned int saved_flags;
+} fstrans_cookie_t;
+
+/*
+ * PF_MEMALLOC_NOIO was introduced in Linux 3.9. It cannot be relied on
+ * by itself before Linux 3.18 because it does not clear __GFP_FS as it
+ * should.
+ */
+#ifdef PF_MEMALLOC_NOIO
+#define __SPL_PF_MEMALLOC_NOIO (PF_MEMALLOC_NOIO)
+#else
+#define __SPL_PF_MEMALLOC_NOIO (0)
+#endif
+
+/*
+ * PF_FSTRANS was removed in Linux 4.12.
+ */
+#ifdef PF_FSTRANS
+#define __SPL_PF_FSTRANS (PF_FSTRANS)
+#else
+#define __SPL_PF_FSTRANS (0)
+#endif
+
+#define SPL_FSTRANS (__SPL_PF_FSTRANS|__SPL_PF_MEMALLOC_NOIO)
+
+static inline fstrans_cookie_t
+spl_fstrans_mark(void)
+{
+ fstrans_cookie_t cookie;
+
+ BUILD_BUG_ON(SPL_FSTRANS == 0);
+
+ cookie.fstrans_thread = current;
+ cookie.saved_flags = current->flags & SPL_FSTRANS;
+ current->flags |= SPL_FSTRANS;
+
+ return (cookie);
+}
+
+static inline void
+spl_fstrans_unmark(fstrans_cookie_t cookie)
+{
+ ASSERT3P(cookie.fstrans_thread, ==, current);
+ ASSERT((current->flags & SPL_FSTRANS) == SPL_FSTRANS);
+
+ current->flags &= ~SPL_FSTRANS;
+ current->flags |= cookie.saved_flags;
+}
+
+static inline int
+spl_fstrans_check(void)
+{
+ return (current->flags & SPL_FSTRANS);
+}
+
+/*
+ * Checks only the PF_FSTRANS flag; it cannot be relied on to detect
+ * contexts marked with spl_fstrans_mark().
+ */
+static inline int
+__spl_pf_fstrans_check(void)
+{
+ return (current->flags & __SPL_PF_FSTRANS);
+}
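+
+/*
+ * Usage sketch (hypothetical `size`): bracket allocations performed in
+ * a syncing context so a KM_SLEEP allocation cannot recurse back into
+ * the filesystem and deadlock.
+ *
+ *	fstrans_cookie_t cookie = spl_fstrans_mark();
+ *	void *buf = kmem_alloc(size, KM_SLEEP);
+ *	...
+ *	spl_fstrans_unmark(cookie);
+ */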
+
+#ifdef HAVE_ATOMIC64_T
+#define kmem_alloc_used_add(size) atomic64_add(size, &kmem_alloc_used)
+#define kmem_alloc_used_sub(size) atomic64_sub(size, &kmem_alloc_used)
+#define kmem_alloc_used_read() atomic64_read(&kmem_alloc_used)
+#define kmem_alloc_used_set(size) atomic64_set(&kmem_alloc_used, size)
+extern atomic64_t kmem_alloc_used;
+extern unsigned long long kmem_alloc_max;
+#else /* HAVE_ATOMIC64_T */
+#define kmem_alloc_used_add(size) atomic_add(size, &kmem_alloc_used)
+#define kmem_alloc_used_sub(size) atomic_sub(size, &kmem_alloc_used)
+#define kmem_alloc_used_read() atomic_read(&kmem_alloc_used)
+#define kmem_alloc_used_set(size) atomic_set(&kmem_alloc_used, size)
+extern atomic_t kmem_alloc_used;
+extern unsigned long long kmem_alloc_max;
+#endif /* HAVE_ATOMIC64_T */
+
+extern unsigned int spl_kmem_alloc_warn;
+extern unsigned int spl_kmem_alloc_max;
+
+#define kmem_alloc(sz, fl) spl_kmem_alloc((sz), (fl), __func__, __LINE__)
+#define kmem_zalloc(sz, fl) spl_kmem_zalloc((sz), (fl), __func__, __LINE__)
+#define kmem_free(ptr, sz) spl_kmem_free((ptr), (sz))
+
+extern void *spl_kmem_alloc(size_t sz, int fl, const char *func, int line);
+extern void *spl_kmem_zalloc(size_t sz, int fl, const char *func, int line);
+extern void spl_kmem_free(const void *ptr, size_t sz);
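+
+/*
+ * Usage sketch (hypothetical `len`): unlike Linux kfree(), the Solaris
+ * interface requires the caller to pass the original allocation size
+ * back to kmem_free().
+ *
+ *	void *buf = kmem_zalloc(len, KM_SLEEP);
+ *	...
+ *	kmem_free(buf, len);
+ */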
+
+/*
+ * The following functions are only available for internal use.
+ */
+extern void *spl_kmem_alloc_impl(size_t size, int flags, int node);
+extern void *spl_kmem_alloc_debug(size_t size, int flags, int node);
+extern void *spl_kmem_alloc_track(size_t size, int flags,
+ const char *func, int line, int node);
+extern void spl_kmem_free_impl(const void *buf, size_t size);
+extern void spl_kmem_free_debug(const void *buf, size_t size);
+extern void spl_kmem_free_track(const void *buf, size_t size);
+
+extern int spl_kmem_init(void);
+extern void spl_kmem_fini(void);
+
+#endif /* _SPL_KMEM_H */
diff --git a/include/spl/sys/kmem_cache.h b/include/spl/sys/kmem_cache.h
new file mode 100644
index 000000000..8fa14f67e
--- /dev/null
+++ b/include/spl/sys/kmem_cache.h
@@ -0,0 +1,240 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _SPL_KMEM_CACHE_H
+#define _SPL_KMEM_CACHE_H
+
+#include <sys/taskq.h>
+
+/*
+ * Slab allocation interfaces. The SPL slab differs from the standard
+ * Linux SLAB or SLUB primarily in that each cache may be backed by slabs
+ * allocated from the physical or virtual memory address space. The
+ * virtual slabs allow for good behavior when allocating large objects
+ * of identical size. This slab implementation also supports both
+ * constructors and destructors, which the Linux slab does not.
+ */
+enum {
+ KMC_BIT_NOTOUCH = 0, /* Don't update ages */
+ KMC_BIT_NODEBUG = 1, /* Default behavior */
+ KMC_BIT_NOMAGAZINE = 2, /* XXX: Unsupported */
+ KMC_BIT_NOHASH = 3, /* XXX: Unsupported */
+ KMC_BIT_QCACHE = 4, /* XXX: Unsupported */
+ KMC_BIT_KMEM = 5, /* Use kmem cache */
+ KMC_BIT_VMEM = 6, /* Use vmem cache */
+ KMC_BIT_SLAB = 7, /* Use Linux slab cache */
+ KMC_BIT_OFFSLAB = 8, /* Objects not on slab */
+ KMC_BIT_NOEMERGENCY = 9, /* Disable emergency objects */
+ KMC_BIT_DEADLOCKED = 14, /* Deadlock detected */
+ KMC_BIT_GROWING = 15, /* Growing in progress */
+ KMC_BIT_REAPING = 16, /* Reaping in progress */
+ KMC_BIT_DESTROY = 17, /* Destroy in progress */
+ KMC_BIT_TOTAL = 18, /* Proc handler helper bit */
+ KMC_BIT_ALLOC = 19, /* Proc handler helper bit */
+ KMC_BIT_MAX = 20, /* Proc handler helper bit */
+};
+
+/* kmem move callback return values */
+typedef enum kmem_cbrc {
+ KMEM_CBRC_YES = 0, /* Object moved */
+ KMEM_CBRC_NO = 1, /* Object not moved */
+ KMEM_CBRC_LATER = 2, /* Object not moved, try again later */
+ KMEM_CBRC_DONT_NEED = 3, /* Neither object is needed */
+ KMEM_CBRC_DONT_KNOW = 4, /* Object unknown */
+} kmem_cbrc_t;
+
+#define KMC_NOTOUCH (1 << KMC_BIT_NOTOUCH)
+#define KMC_NODEBUG (1 << KMC_BIT_NODEBUG)
+#define KMC_NOMAGAZINE (1 << KMC_BIT_NOMAGAZINE)
+#define KMC_NOHASH (1 << KMC_BIT_NOHASH)
+#define KMC_QCACHE (1 << KMC_BIT_QCACHE)
+#define KMC_KMEM (1 << KMC_BIT_KMEM)
+#define KMC_VMEM (1 << KMC_BIT_VMEM)
+#define KMC_SLAB (1 << KMC_BIT_SLAB)
+#define KMC_OFFSLAB (1 << KMC_BIT_OFFSLAB)
+#define KMC_NOEMERGENCY (1 << KMC_BIT_NOEMERGENCY)
+#define KMC_DEADLOCKED (1 << KMC_BIT_DEADLOCKED)
+#define KMC_GROWING (1 << KMC_BIT_GROWING)
+#define KMC_REAPING (1 << KMC_BIT_REAPING)
+#define KMC_DESTROY (1 << KMC_BIT_DESTROY)
+#define KMC_TOTAL (1 << KMC_BIT_TOTAL)
+#define KMC_ALLOC (1 << KMC_BIT_ALLOC)
+#define KMC_MAX (1 << KMC_BIT_MAX)
+
+#define KMC_REAP_CHUNK INT_MAX
+#define KMC_DEFAULT_SEEKS 1
+
+#define KMC_EXPIRE_AGE 0x1 /* Due to age */
+#define KMC_EXPIRE_MEM 0x2 /* Due to low memory */
+
+#define KMC_RECLAIM_ONCE 0x1 /* Force a single shrinker pass */
+
+extern unsigned int spl_kmem_cache_expire;
+extern struct list_head spl_kmem_cache_list;
+extern struct rw_semaphore spl_kmem_cache_sem;
+
+#define SKM_MAGIC 0x2e2e2e2e
+#define SKO_MAGIC 0x20202020
+#define SKS_MAGIC 0x22222222
+#define SKC_MAGIC 0x2c2c2c2c
+
+#define SPL_KMEM_CACHE_DELAY 15 /* Minimum slab release age */
+#define SPL_KMEM_CACHE_REAP 0 /* Default reap everything */
+#define SPL_KMEM_CACHE_OBJ_PER_SLAB 8 /* Target objects per slab */
+#define SPL_KMEM_CACHE_OBJ_PER_SLAB_MIN 1 /* Minimum objects per slab */
+#define SPL_KMEM_CACHE_ALIGN 8 /* Default object alignment */
+#ifdef _LP64
+#define SPL_KMEM_CACHE_MAX_SIZE 32 /* Max slab size in MB */
+#else
+#define SPL_KMEM_CACHE_MAX_SIZE 4 /* Max slab size in MB */
+#endif
+
+#define SPL_MAX_ORDER (MAX_ORDER - 3)
+#define SPL_MAX_ORDER_NR_PAGES (1 << (SPL_MAX_ORDER - 1))
+
+#ifdef CONFIG_SLUB
+#define SPL_MAX_KMEM_CACHE_ORDER PAGE_ALLOC_COSTLY_ORDER
+#define SPL_MAX_KMEM_ORDER_NR_PAGES (1 << (SPL_MAX_KMEM_CACHE_ORDER - 1))
+#else
+#define SPL_MAX_KMEM_ORDER_NR_PAGES (KMALLOC_MAX_SIZE >> PAGE_SHIFT)
+#endif
+
+#define POINTER_IS_VALID(p) 0 /* Unimplemented */
+#define POINTER_INVALIDATE(pp) /* Unimplemented */
+
+typedef int (*spl_kmem_ctor_t)(void *, void *, int);
+typedef void (*spl_kmem_dtor_t)(void *, void *);
+typedef void (*spl_kmem_reclaim_t)(void *);
+
+typedef struct spl_kmem_magazine {
+ uint32_t skm_magic; /* Sanity magic */
+ uint32_t skm_avail; /* Available objects */
+ uint32_t skm_size; /* Magazine size */
+ uint32_t skm_refill; /* Batch refill size */
+ struct spl_kmem_cache *skm_cache; /* Owned by cache */
+ unsigned long skm_age; /* Last cache access */
+ unsigned int skm_cpu; /* Owned by cpu */
+ void *skm_objs[0]; /* Object pointers */
+} spl_kmem_magazine_t;
+
+typedef struct spl_kmem_obj {
+ uint32_t sko_magic; /* Sanity magic */
+ void *sko_addr; /* Buffer address */
+ struct spl_kmem_slab *sko_slab; /* Owned by slab */
+ struct list_head sko_list; /* Free object list linkage */
+} spl_kmem_obj_t;
+
+typedef struct spl_kmem_slab {
+ uint32_t sks_magic; /* Sanity magic */
+ uint32_t sks_objs; /* Objects per slab */
+ struct spl_kmem_cache *sks_cache; /* Owned by cache */
+ struct list_head sks_list; /* Slab list linkage */
+ struct list_head sks_free_list; /* Free object list */
+ unsigned long sks_age; /* Last modify jiffie */
+ uint32_t sks_ref; /* Ref count used objects */
+} spl_kmem_slab_t;
+
+typedef struct spl_kmem_alloc {
+ struct spl_kmem_cache *ska_cache; /* Owned by cache */
+ int ska_flags; /* Allocation flags */
+ taskq_ent_t ska_tqe; /* Task queue entry */
+} spl_kmem_alloc_t;
+
+typedef struct spl_kmem_emergency {
+ struct rb_node ske_node; /* Emergency tree linkage */
+ unsigned long ske_obj; /* Buffer address */
+} spl_kmem_emergency_t;
+
+typedef struct spl_kmem_cache {
+ uint32_t skc_magic; /* Sanity magic */
+ uint32_t skc_name_size; /* Name length */
+ char *skc_name; /* Name string */
+ spl_kmem_magazine_t **skc_mag; /* Per-CPU warm cache */
+ uint32_t skc_mag_size; /* Magazine size */
+ uint32_t skc_mag_refill; /* Magazine refill count */
+ spl_kmem_ctor_t skc_ctor; /* Constructor */
+ spl_kmem_dtor_t skc_dtor; /* Destructor */
+ spl_kmem_reclaim_t skc_reclaim; /* Reclaimator */
+ void *skc_private; /* Private data */
+ void *skc_vmp; /* Unused */
+ struct kmem_cache *skc_linux_cache; /* Linux slab cache if used */
+ unsigned long skc_flags; /* Flags */
+ uint32_t skc_obj_size; /* Object size */
+ uint32_t skc_obj_align; /* Object alignment */
+ uint32_t skc_slab_objs; /* Objects per slab */
+ uint32_t skc_slab_size; /* Slab size */
+ uint32_t skc_delay; /* Slab reclaim interval */
+ uint32_t skc_reap; /* Slab reclaim count */
+ atomic_t skc_ref; /* Ref count callers */
+ taskqid_t skc_taskqid; /* Slab reclaim task */
+ struct list_head skc_list; /* List of caches linkage */
+ struct list_head skc_complete_list; /* Completely alloc'ed */
+ struct list_head skc_partial_list; /* Partially alloc'ed */
+ struct rb_root skc_emergency_tree; /* Min sized objects */
+ spinlock_t skc_lock; /* Cache lock */
+ spl_wait_queue_head_t skc_waitq; /* Allocation waiters */
+ uint64_t skc_slab_fail; /* Slab alloc failures */
+ uint64_t skc_slab_create; /* Slab creates */
+ uint64_t skc_slab_destroy; /* Slab destroys */
+ uint64_t skc_slab_total; /* Slab total current */
+ uint64_t skc_slab_alloc; /* Slab alloc current */
+ uint64_t skc_slab_max; /* Slab max historic */
+ uint64_t skc_obj_total; /* Obj total current */
+ uint64_t skc_obj_alloc; /* Obj alloc current */
+ uint64_t skc_obj_max; /* Obj max historic */
+ uint64_t skc_obj_deadlock; /* Obj emergency deadlocks */
+ uint64_t skc_obj_emergency; /* Obj emergency current */
+ uint64_t skc_obj_emergency_max; /* Obj emergency max */
+} spl_kmem_cache_t;
+#define kmem_cache_t spl_kmem_cache_t
+
+extern spl_kmem_cache_t *spl_kmem_cache_create(char *name, size_t size,
+ size_t align, spl_kmem_ctor_t ctor, spl_kmem_dtor_t dtor,
+ spl_kmem_reclaim_t reclaim, void *priv, void *vmp, int flags);
+extern void spl_kmem_cache_set_move(spl_kmem_cache_t *,
+ kmem_cbrc_t (*)(void *, void *, size_t, void *));
+extern void spl_kmem_cache_destroy(spl_kmem_cache_t *skc);
+extern void *spl_kmem_cache_alloc(spl_kmem_cache_t *skc, int flags);
+extern void spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj);
+extern void spl_kmem_cache_set_allocflags(spl_kmem_cache_t *skc, gfp_t flags);
+extern void spl_kmem_cache_reap_now(spl_kmem_cache_t *skc, int count);
+extern void spl_kmem_reap(void);
+
+#define kmem_cache_create(name, size, align, ctor, dtor, rclm, priv, vmp, fl) \
+ spl_kmem_cache_create(name, size, align, ctor, dtor, rclm, priv, vmp, fl)
+#define kmem_cache_set_move(skc, move) spl_kmem_cache_set_move(skc, move)
+#define kmem_cache_destroy(skc) spl_kmem_cache_destroy(skc)
+#define kmem_cache_alloc(skc, flags) spl_kmem_cache_alloc(skc, flags)
+#define kmem_cache_free(skc, obj) spl_kmem_cache_free(skc, obj)
+#define kmem_cache_reap_now(skc) \
+ spl_kmem_cache_reap_now(skc, skc->skc_reap)
+#define kmem_reap() spl_kmem_reap()
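+
+/*
+ * Usage sketch (hypothetical `my_obj_t`): typical cache lifecycle; the
+ * constructor, destructor, and reclaim callbacks may all be NULL.
+ *
+ *	kmem_cache_t *kc = kmem_cache_create("my_cache",
+ *	    sizeof (my_obj_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
+ *	my_obj_t *obj = kmem_cache_alloc(kc, KM_SLEEP);
+ *	...
+ *	kmem_cache_free(kc, obj);
+ *	kmem_cache_destroy(kc);
+ */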
+
+/*
+ * The following functions are only available for internal use.
+ */
+extern int spl_kmem_cache_init(void);
+extern void spl_kmem_cache_fini(void);
+
+#endif /* _SPL_KMEM_CACHE_H */
diff --git a/include/spl/sys/kobj.h b/include/spl/sys/kobj.h
new file mode 100644
index 000000000..558ec39a8
--- /dev/null
+++ b/include/spl/sys/kobj.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _SPL_KOBJ_H
+#define _SPL_KOBJ_H
+
+#include <sys/vnode.h>
+
+typedef struct _buf {
+ vnode_t *vp;
+} _buf_t;
+
+typedef struct _buf buf_t;
+
+extern struct _buf *kobj_open_file(const char *name);
+extern void kobj_close_file(struct _buf *file);
+extern int kobj_read_file(struct _buf *file, char *buf, unsigned size,
+ unsigned off);
+extern int kobj_get_filesize(struct _buf *file, uint64_t *size);
+
+#endif /* _SPL_KOBJ_H */
diff --git a/include/spl/sys/kstat.h b/include/spl/sys/kstat.h
new file mode 100644
index 000000000..9170fe24e
--- /dev/null
+++ b/include/spl/sys/kstat.h
@@ -0,0 +1,208 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _SPL_KSTAT_H
+#define _SPL_KSTAT_H
+
+#include <linux/module.h>
+#include <sys/types.h>
+#include <sys/time.h>
+#include <sys/kmem.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+
+#define KSTAT_STRLEN 255
+#define KSTAT_RAW_MAX (128*1024)
+
+/*
+ * For reference valid classes are:
+ * disk, tape, net, controller, vm, kvm, hat, streams, kstat, misc
+ */
+
+#define KSTAT_TYPE_RAW 0 /* can be anything; ks_ndata >= 1 */
+#define KSTAT_TYPE_NAMED 1 /* name/value pair; ks_ndata >= 1 */
+#define KSTAT_TYPE_INTR 2 /* interrupt stats; ks_ndata == 1 */
+#define KSTAT_TYPE_IO 3 /* I/O stats; ks_ndata == 1 */
+#define KSTAT_TYPE_TIMER 4 /* event timer; ks_ndata >= 1 */
+#define KSTAT_NUM_TYPES 5
+
+#define KSTAT_DATA_CHAR 0
+#define KSTAT_DATA_INT32 1
+#define KSTAT_DATA_UINT32 2
+#define KSTAT_DATA_INT64 3
+#define KSTAT_DATA_UINT64 4
+#define KSTAT_DATA_LONG 5
+#define KSTAT_DATA_ULONG 6
+#define KSTAT_DATA_STRING 7
+#define KSTAT_NUM_DATAS 8
+
+#define KSTAT_INTR_HARD 0
+#define KSTAT_INTR_SOFT 1
+#define KSTAT_INTR_WATCHDOG 2
+#define KSTAT_INTR_SPURIOUS 3
+#define KSTAT_INTR_MULTSVC 4
+#define KSTAT_NUM_INTRS 5
+
+#define KSTAT_FLAG_VIRTUAL 0x01
+#define KSTAT_FLAG_VAR_SIZE 0x02
+#define KSTAT_FLAG_WRITABLE 0x04
+#define KSTAT_FLAG_PERSISTENT 0x08
+#define KSTAT_FLAG_DORMANT 0x10
+#define KSTAT_FLAG_UNSUPPORTED \
+ (KSTAT_FLAG_VAR_SIZE | KSTAT_FLAG_WRITABLE | \
+ KSTAT_FLAG_PERSISTENT | KSTAT_FLAG_DORMANT)
+
+
+#define KS_MAGIC 0x9d9d9d9d
+
+/* Dynamic updates */
+#define KSTAT_READ 0
+#define KSTAT_WRITE 1
+
+struct kstat_s;
+typedef struct kstat_s kstat_t;
+
+typedef int kid_t; /* unique kstat id */
+typedef int kstat_update_t(struct kstat_s *, int); /* dynamic update cb */
+
+typedef struct kstat_module {
+ char ksm_name[KSTAT_STRLEN+1]; /* module name */
+ struct list_head ksm_module_list; /* module linkage */
+ struct list_head ksm_kstat_list; /* list of kstat entries */
+ struct proc_dir_entry *ksm_proc; /* proc entry */
+} kstat_module_t;
+
+typedef struct kstat_raw_ops {
+ int (*headers)(char *buf, size_t size);
+ int (*data)(char *buf, size_t size, void *data);
+ void *(*addr)(kstat_t *ksp, loff_t index);
+} kstat_raw_ops_t;
+
+struct kstat_s {
+ int ks_magic; /* magic value */
+ kid_t ks_kid; /* unique kstat ID */
+ hrtime_t ks_crtime; /* creation time */
+ hrtime_t ks_snaptime; /* last access time */
+ char ks_module[KSTAT_STRLEN+1]; /* provider module name */
+ int ks_instance; /* provider module instance */
+ char ks_name[KSTAT_STRLEN+1]; /* kstat name */
+ char ks_class[KSTAT_STRLEN+1]; /* kstat class */
+ uchar_t ks_type; /* kstat data type */
+ uchar_t ks_flags; /* kstat flags */
+ void *ks_data; /* kstat type-specific data */
+ uint_t ks_ndata; /* # of data records */
+ size_t ks_data_size; /* size of kstat data section */
+ struct proc_dir_entry *ks_proc; /* proc linkage */
+ kstat_update_t *ks_update; /* dynamic updates */
+ void *ks_private; /* private data */
+ kmutex_t ks_private_lock; /* kstat private data lock */
+ kmutex_t *ks_lock; /* kstat data lock */
+ struct list_head ks_list; /* kstat linkage */
+ kstat_module_t *ks_owner; /* kstat module linkage */
+ kstat_raw_ops_t ks_raw_ops; /* ops table for raw type */
+ char *ks_raw_buf; /* buf used for raw ops */
+ size_t ks_raw_bufsize; /* size of raw ops buffer */
+};
+
+typedef struct kstat_named_s {
+ char name[KSTAT_STRLEN]; /* name of counter */
+ uchar_t data_type; /* data type */
+ union {
+ char c[16]; /* 128-bit int */
+ int32_t i32; /* 32-bit signed int */
+ uint32_t ui32; /* 32-bit unsigned int */
+ int64_t i64; /* 64-bit signed int */
+ uint64_t ui64; /* 64-bit unsigned int */
+ long l; /* native signed long */
+ ulong_t ul; /* native unsigned long */
+ struct {
+ union {
+ char *ptr; /* NULL-term string */
+ char __pad[8]; /* 64-bit padding */
+ } addr;
+ uint32_t len; /* # bytes for strlen + '\0' */
+ } string;
+ } value;
+} kstat_named_t;
+
+#define KSTAT_NAMED_STR_PTR(knptr) ((knptr)->value.string.addr.ptr)
+#define KSTAT_NAMED_STR_BUFLEN(knptr) ((knptr)->value.string.len)
+
+typedef struct kstat_intr {
+ uint_t intrs[KSTAT_NUM_INTRS];
+} kstat_intr_t;
+
+typedef struct kstat_io {
+ u_longlong_t nread; /* number of bytes read */
+ u_longlong_t nwritten; /* number of bytes written */
+ uint_t reads; /* number of read operations */
+ uint_t writes; /* number of write operations */
+ hrtime_t wtime; /* cumulative wait (pre-service) time */
+ hrtime_t wlentime; /* cumulative wait len*time product */
+ hrtime_t wlastupdate; /* last time wait queue changed */
+ hrtime_t rtime; /* cumulative run (service) time */
+ hrtime_t rlentime; /* cumulative run length*time product */
+ hrtime_t rlastupdate; /* last time run queue changed */
+ uint_t wcnt; /* count of elements in wait state */
+ uint_t rcnt; /* count of elements in run state */
+} kstat_io_t;
+
+typedef struct kstat_timer {
+ char name[KSTAT_STRLEN+1]; /* event name */
+ u_longlong_t num_events; /* number of events */
+ hrtime_t elapsed_time; /* cumulative elapsed time */
+ hrtime_t min_time; /* shortest event duration */
+ hrtime_t max_time; /* longest event duration */
+ hrtime_t start_time; /* previous event start time */
+ hrtime_t stop_time; /* previous event stop time */
+} kstat_timer_t;
+
+int spl_kstat_init(void);
+void spl_kstat_fini(void);
+
+extern void __kstat_set_raw_ops(kstat_t *ksp,
+ int (*headers)(char *buf, size_t size),
+ int (*data)(char *buf, size_t size, void *data),
+ void* (*addr)(kstat_t *ksp, loff_t index));
+
+extern kstat_t *__kstat_create(const char *ks_module, int ks_instance,
+ const char *ks_name, const char *ks_class, uchar_t ks_type,
+ uint_t ks_ndata, uchar_t ks_flags);
+
+extern void __kstat_install(kstat_t *ksp);
+extern void __kstat_delete(kstat_t *ksp);
+extern void kstat_waitq_enter(kstat_io_t *);
+extern void kstat_waitq_exit(kstat_io_t *);
+extern void kstat_runq_enter(kstat_io_t *);
+extern void kstat_runq_exit(kstat_io_t *);
+
+#define kstat_set_raw_ops(k, h, d, a) \
+ __kstat_set_raw_ops(k, h, d, a)
+#define kstat_create(m, i, n, c, t, s, f) \
+ __kstat_create(m, i, n, c, t, s, f)
+
+#define kstat_install(k) __kstat_install(k)
+#define kstat_delete(k) __kstat_delete(k)
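+
+/*
+ * Usage sketch (hypothetical module/statistics names): create a named
+ * kstat, point it at caller-managed data (KSTAT_FLAG_VIRTUAL), publish
+ * it, and remove it on teardown.
+ *
+ *	kstat_t *ksp = kstat_create("mymod", 0, "stats", "misc",
+ *	    KSTAT_TYPE_NAMED, 1, KSTAT_FLAG_VIRTUAL);
+ *	if (ksp != NULL) {
+ *		ksp->ks_data = &my_named_stats;
+ *		kstat_install(ksp);
+ *	}
+ *	...
+ *	kstat_delete(ksp);
+ */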
+
+#endif /* _SPL_KSTAT_H */
diff --git a/include/spl/sys/list.h b/include/spl/sys/list.h
new file mode 100644
index 000000000..74b784e93
--- /dev/null
+++ b/include/spl/sys/list.h
@@ -0,0 +1,208 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _SPL_LIST_H
+#define _SPL_LIST_H
+
+#include <sys/types.h>
+#include <linux/list.h>
+
+/*
+ * NOTE: I have implemented the Solaris list API in terms of the native
+ * linux API. This has certain advantages in terms of leveraging the linux
+ * list debugging infrastructure, but it also means that the internals of
+ * a list differ slightly from those on Solaris. This is not a problem as
+ * long as all callers stick to the published API. The two major
+ * differences are:
+ *
+ * 1) A list_node_t is mapped to a linux list_head struct which changes
+ * the name of the list_next/list_prev pointers to next/prev respectively.
+ *
+ * 2) A list_node_t which is not attached to a list on Solaris is denoted
+ * by having its list_next/list_prev pointers set to NULL. Under linux
+ * the next/prev pointers are set to LIST_POISON1 and LIST_POISON2
+ * respectively. At this moment this only impacts the implementation
+ * of the list_link_init() and list_link_active() functions.
+ */
+
+typedef struct list_head list_node_t;
+
+typedef struct list {
+ size_t list_size;
+ size_t list_offset;
+ list_node_t list_head;
+} list_t;
+
+#define list_d2l(a, obj) ((list_node_t *)(((char *)obj) + (a)->list_offset))
+#define list_object(a, node) ((void *)(((char *)node) - (a)->list_offset))
+
+static inline int
+list_is_empty(list_t *list)
+{
+ return (list_empty(&list->list_head));
+}
+
+static inline void
+list_link_init(list_node_t *node)
+{
+ node->next = LIST_POISON1;
+ node->prev = LIST_POISON2;
+}
+
+static inline void
+list_create(list_t *list, size_t size, size_t offset)
+{
+ list->list_size = size;
+ list->list_offset = offset;
+ INIT_LIST_HEAD(&list->list_head);
+}
+
+static inline void
+list_destroy(list_t *list)
+{
+ list_del(&list->list_head);
+}
+
+static inline void
+list_insert_head(list_t *list, void *object)
+{
+ list_add(list_d2l(list, object), &list->list_head);
+}
+
+static inline void
+list_insert_tail(list_t *list, void *object)
+{
+ list_add_tail(list_d2l(list, object), &list->list_head);
+}
+
+static inline void
+list_insert_after(list_t *list, void *object, void *nobject)
+{
+ if (object == NULL)
+ list_insert_head(list, nobject);
+ else
+ list_add(list_d2l(list, nobject), list_d2l(list, object));
+}
+
+static inline void
+list_insert_before(list_t *list, void *object, void *nobject)
+{
+ if (object == NULL)
+ list_insert_tail(list, nobject);
+ else
+ list_add_tail(list_d2l(list, nobject), list_d2l(list, object));
+}
+
+static inline void
+list_remove(list_t *list, void *object)
+{
+ list_del(list_d2l(list, object));
+}
+
+static inline void *
+list_remove_head(list_t *list)
+{
+ list_node_t *head = list->list_head.next;
+ if (head == &list->list_head)
+ return (NULL);
+
+ list_del(head);
+ return (list_object(list, head));
+}
+
+static inline void *
+list_remove_tail(list_t *list)
+{
+ list_node_t *tail = list->list_head.prev;
+ if (tail == &list->list_head)
+ return (NULL);
+
+ list_del(tail);
+ return (list_object(list, tail));
+}
+
+static inline void *
+list_head(list_t *list)
+{
+ if (list_is_empty(list))
+ return (NULL);
+
+ return (list_object(list, list->list_head.next));
+}
+
+static inline void *
+list_tail(list_t *list)
+{
+ if (list_is_empty(list))
+ return (NULL);
+
+ return (list_object(list, list->list_head.prev));
+}
+
+static inline void *
+list_next(list_t *list, void *object)
+{
+ list_node_t *node = list_d2l(list, object);
+
+ if (node->next != &list->list_head)
+ return (list_object(list, node->next));
+
+ return (NULL);
+}
+
+static inline void *
+list_prev(list_t *list, void *object)
+{
+ list_node_t *node = list_d2l(list, object);
+
+ if (node->prev != &list->list_head)
+ return (list_object(list, node->prev));
+
+ return (NULL);
+}
+
+static inline int
+list_link_active(list_node_t *node)
+{
+	return ((node->next != LIST_POISON1) &&
+	    (node->prev != LIST_POISON2));
+}
+
+static inline void
+spl_list_move_tail(list_t *dst, list_t *src)
+{
+ list_splice_init(&src->list_head, dst->list_head.prev);
+}
+
+#define list_move_tail(dst, src) spl_list_move_tail(dst, src)
+
+static inline void
+list_link_replace(list_node_t *old_node, list_node_t *new_node)
+{
+ new_node->next = old_node->next;
+ new_node->prev = old_node->prev;
+ old_node->prev->next = new_node;
+ old_node->next->prev = new_node;
+ list_link_init(old_node);
+}
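+
+/*
+ * Usage sketch (hypothetical `item_t` embedding a list_node_t named
+ * `it_node`): the offset passed to list_create() lets the list code
+ * translate between objects and their embedded linkage nodes.
+ *
+ *	list_t lst;
+ *	item_t *i;
+ *
+ *	list_create(&lst, sizeof (item_t), offsetof(item_t, it_node));
+ *	list_insert_tail(&lst, item);
+ *	for (i = list_head(&lst); i != NULL; i = list_next(&lst, i))
+ *		...
+ *	list_destroy(&lst);
+ */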
+
+#endif /* _SPL_LIST_H */
diff --git a/include/spl/sys/mode.h b/include/spl/sys/mode.h
new file mode 100644
index 000000000..02802d0d4
--- /dev/null
+++ b/include/spl/sys/mode.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _SPL_MODE_H
+#define _SPL_MODE_H
+
+#define IFTOVT(mode) vn_mode_to_vtype(mode)
+#define VTTOIF(vtype) vn_vtype_to_mode(vtype)
+#define MAKEIMODE(T, M) (VTTOIF(T) | ((M) & ~S_IFMT))
+
+#endif /* _SPL_MODE_H */
diff --git a/include/spl/sys/mutex.h b/include/spl/sys/mutex.h
new file mode 100644
index 000000000..f906d49d4
--- /dev/null
+++ b/include/spl/sys/mutex.h
@@ -0,0 +1,184 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _SPL_MUTEX_H
+#define _SPL_MUTEX_H
+
+#include <sys/types.h>
+#include <linux/mutex.h>
+#include <linux/lockdep.h>
+
+typedef enum {
+ MUTEX_DEFAULT = 0,
+ MUTEX_SPIN = 1,
+ MUTEX_ADAPTIVE = 2,
+ MUTEX_NOLOCKDEP = 3
+} kmutex_type_t;
+
+typedef struct {
+ struct mutex m_mutex;
+ spinlock_t m_lock; /* used for serializing mutex_exit */
+ kthread_t *m_owner;
+#ifdef CONFIG_LOCKDEP
+ kmutex_type_t m_type;
+#endif /* CONFIG_LOCKDEP */
+} kmutex_t;
+
+#define MUTEX(mp) (&((mp)->m_mutex))
+
+static inline void
+spl_mutex_set_owner(kmutex_t *mp)
+{
+ mp->m_owner = current;
+}
+
+static inline void
+spl_mutex_clear_owner(kmutex_t *mp)
+{
+ mp->m_owner = NULL;
+}
+
+#define mutex_owner(mp) (ACCESS_ONCE((mp)->m_owner))
+#define mutex_owned(mp) (mutex_owner(mp) == current)
+#define MUTEX_HELD(mp) mutex_owned(mp)
+#define MUTEX_NOT_HELD(mp) (!MUTEX_HELD(mp))
+
+#ifdef CONFIG_LOCKDEP
+static inline void
+spl_mutex_set_type(kmutex_t *mp, kmutex_type_t type)
+{
+ mp->m_type = type;
+}
+static inline void
+spl_mutex_lockdep_off_maybe(kmutex_t *mp)
+{
+	if (mp && mp->m_type == MUTEX_NOLOCKDEP)
+		lockdep_off();
+}
+static inline void
+spl_mutex_lockdep_on_maybe(kmutex_t *mp)
+{
+	if (mp && mp->m_type == MUTEX_NOLOCKDEP)
+		lockdep_on();
+}
+#else /* CONFIG_LOCKDEP */
+#define spl_mutex_set_type(mp, type)
+#define spl_mutex_lockdep_off_maybe(mp)
+#define spl_mutex_lockdep_on_maybe(mp)
+#endif /* CONFIG_LOCKDEP */
+
+/*
+ * The following functions must be #define's and not static inline
+ * functions.  This ensures that the native Linux mutex functions
+ * (lock/unlock) are correctly attributed to the caller's code, which
+ * is important for the built-in kernel lock analysis tools.
+ */
+#undef mutex_init
+#define mutex_init(mp, name, type, ibc) \
+{ \
+ static struct lock_class_key __key; \
+ ASSERT(type == MUTEX_DEFAULT || type == MUTEX_NOLOCKDEP); \
+ \
+ __mutex_init(MUTEX(mp), (name) ? (#name) : (#mp), &__key); \
+ spin_lock_init(&(mp)->m_lock); \
+ spl_mutex_clear_owner(mp); \
+ spl_mutex_set_type(mp, type); \
+}
+
+#undef mutex_destroy
+#define mutex_destroy(mp) \
+{ \
+ VERIFY3P(mutex_owner(mp), ==, NULL); \
+}
+
+/* BEGIN CSTYLED */
+#define mutex_tryenter(mp) \
+({ \
+ int _rc_; \
+ \
+ spl_mutex_lockdep_off_maybe(mp); \
+ if ((_rc_ = mutex_trylock(MUTEX(mp))) == 1) \
+ spl_mutex_set_owner(mp); \
+ spl_mutex_lockdep_on_maybe(mp); \
+ \
+ _rc_; \
+})
+/* END CSTYLED */
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+#define mutex_enter_nested(mp, subclass) \
+{ \
+ ASSERT3P(mutex_owner(mp), !=, current); \
+ spl_mutex_lockdep_off_maybe(mp); \
+ mutex_lock_nested(MUTEX(mp), (subclass)); \
+ spl_mutex_lockdep_on_maybe(mp); \
+ spl_mutex_set_owner(mp); \
+}
+#else /* CONFIG_DEBUG_LOCK_ALLOC */
+#define mutex_enter_nested(mp, subclass) \
+{ \
+ ASSERT3P(mutex_owner(mp), !=, current); \
+ spl_mutex_lockdep_off_maybe(mp); \
+ mutex_lock(MUTEX(mp)); \
+ spl_mutex_lockdep_on_maybe(mp); \
+ spl_mutex_set_owner(mp); \
+}
+#endif /* CONFIG_DEBUG_LOCK_ALLOC */
+
+#define mutex_enter(mp) mutex_enter_nested((mp), 0)
+
+/*
+ * The reason for the spinlock:
+ *
+ * The Linux mutex is designed with a fast-path/slow-path split, and it
+ * does not guarantee that unlocks are serialized against one another; a
+ * later acquirer may finish mutex_unlock() before an earlier one does.
+ *
+ * This race makes a bare mutex unsafe for serializing the freeing of an
+ * object in which the mutex is embedded: the later acquirer could free
+ * the object while the earlier one is still inside mutex_unlock(),
+ * corrupting memory.
+ *
+ * However, many places in ZFS use the mutex to serialize object freeing,
+ * and that code is shared with other OSes which do not have this issue.
+ * The spinlock is therefore needed to force serialization on
+ * mutex_exit().
+ *
+ * See http://lwn.net/Articles/575477/ for details on the race.
+ */
+#define mutex_exit(mp) \
+{ \
+ spl_mutex_clear_owner(mp); \
+ spin_lock(&(mp)->m_lock); \
+ spl_mutex_lockdep_off_maybe(mp); \
+ mutex_unlock(MUTEX(mp)); \
+ spl_mutex_lockdep_on_maybe(mp); \
+ spin_unlock(&(mp)->m_lock); \
+ /* NOTE: do not dereference mp after this point */ \
+}
+
+int spl_mutex_init(void);
+void spl_mutex_fini(void);
+
+#endif /* _SPL_MUTEX_H */
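
A minimal usage sketch for the kmutex_t API above; foo_t is a hypothetical type, and the name and ibc arguments to mutex_init() exist only for Solaris compatibility.

typedef struct foo {
	kmutex_t	f_lock;
	int		f_count;
} foo_t;

static void
foo_example(foo_t *fp)
{
	mutex_init(&fp->f_lock, NULL, MUTEX_DEFAULT, NULL);

	mutex_enter(&fp->f_lock);
	ASSERT(MUTEX_HELD(&fp->f_lock));
	fp->f_count++;
	mutex_exit(&fp->f_lock);

	/* Non-blocking variant; returns non-zero when the lock was taken. */
	if (mutex_tryenter(&fp->f_lock)) {
		fp->f_count++;
		mutex_exit(&fp->f_lock);
	}

	mutex_destroy(&fp->f_lock);
}
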
diff --git a/include/spl/sys/param.h b/include/spl/sys/param.h
new file mode 100644
index 000000000..4ef929151
--- /dev/null
+++ b/include/spl/sys/param.h
@@ -0,0 +1,36 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _SPL_PARAM_H
+#define _SPL_PARAM_H
+
+#include <asm/page.h>
+
+/* Pages to bytes and back */
+#define ptob(pages) ((pages) << PAGE_SHIFT)
+#define btop(bytes) ((bytes) >> PAGE_SHIFT)
+
+#define MAXUID UINT32_MAX
+
+#endif /* _SPL_PARAM_H */
diff --git a/include/spl/sys/proc.h b/include/spl/sys/proc.h
new file mode 100644
index 000000000..287683920
--- /dev/null
+++ b/include/spl/sys/proc.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _SPL_PROC_H
+#define _SPL_PROC_H
+
+#include <linux/proc_fs.h>
+
+extern struct proc_dir_entry *proc_spl_kstat;
+
+int spl_proc_init(void);
+void spl_proc_fini(void);
+
+#endif /* _SPL_PROC_H */
diff --git a/include/spl/sys/processor.h b/include/spl/sys/processor.h
new file mode 100644
index 000000000..a70101fa2
--- /dev/null
+++ b/include/spl/sys/processor.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _SPL_PROCESSOR_H
+#define _SPL_PROCESSOR_H
+
+#define getcpuid() smp_processor_id()
+
+typedef int processorid_t;
+
+#endif /* _SPL_PROCESSOR_H */
diff --git a/include/spl/sys/random.h b/include/spl/sys/random.h
new file mode 100644
index 000000000..93e244f56
--- /dev/null
+++ b/include/spl/sys/random.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _SPL_RANDOM_H
+#define _SPL_RANDOM_H
+
+#include <linux/module.h>
+#include <linux/random.h>
+
+static __inline__ int
+random_get_bytes(uint8_t *ptr, size_t len)
+{
+ get_random_bytes((void *)ptr, (int)len);
+ return (0);
+}
+
+extern int random_get_pseudo_bytes(uint8_t *ptr, size_t len);
+
+#endif /* _SPL_RANDOM_H */
diff --git a/include/spl/sys/rwlock.h b/include/spl/sys/rwlock.h
new file mode 100644
index 000000000..b44ceab66
--- /dev/null
+++ b/include/spl/sys/rwlock.h
@@ -0,0 +1,273 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _SPL_RWLOCK_H
+#define _SPL_RWLOCK_H
+
+#include <sys/types.h>
+#include <linux/rwsem.h>
+
+/* Linux kernel compatibility */
+#if defined(CONFIG_PREEMPT_RT_FULL)
+#define SPL_RWSEM_SINGLE_READER_VALUE (1)
+#define SPL_RWSEM_SINGLE_WRITER_VALUE (0)
+#elif defined(CONFIG_RWSEM_GENERIC_SPINLOCK)
+#define SPL_RWSEM_SINGLE_READER_VALUE (1)
+#define SPL_RWSEM_SINGLE_WRITER_VALUE (-1)
+#else
+#define SPL_RWSEM_SINGLE_READER_VALUE (RWSEM_ACTIVE_READ_BIAS)
+#define SPL_RWSEM_SINGLE_WRITER_VALUE (RWSEM_ACTIVE_WRITE_BIAS)
+#endif
+
+/* Linux 3.16 changed activity to count for rwsem-spinlock */
+#if defined(CONFIG_PREEMPT_RT_FULL)
+#define RWSEM_COUNT(sem) sem->read_depth
+#elif defined(HAVE_RWSEM_ACTIVITY)
+#define RWSEM_COUNT(sem) sem->activity
+/* Linux 4.8 changed count to an atomic_long_t for !rwsem-spinlock */
+#elif defined(HAVE_RWSEM_ATOMIC_LONG_COUNT)
+#define RWSEM_COUNT(sem) atomic_long_read(&(sem)->count)
+#else
+#define RWSEM_COUNT(sem) sem->count
+#endif
+
+#if defined(RWSEM_SPINLOCK_IS_RAW)
+#define spl_rwsem_lock_irqsave(lk, fl) raw_spin_lock_irqsave(lk, fl)
+#define spl_rwsem_unlock_irqrestore(lk, fl) \
+ raw_spin_unlock_irqrestore(lk, fl)
+#define spl_rwsem_trylock_irqsave(lk, fl) raw_spin_trylock_irqsave(lk, fl)
+#else
+#define spl_rwsem_lock_irqsave(lk, fl) spin_lock_irqsave(lk, fl)
+#define spl_rwsem_unlock_irqrestore(lk, fl) spin_unlock_irqrestore(lk, fl)
+#define spl_rwsem_trylock_irqsave(lk, fl) spin_trylock_irqsave(lk, fl)
+#endif /* RWSEM_SPINLOCK_IS_RAW */
+
+#define spl_rwsem_is_locked(rwsem) rwsem_is_locked(rwsem)
+
+typedef enum {
+ RW_DRIVER = 2,
+ RW_DEFAULT = 4,
+ RW_NOLOCKDEP = 5
+} krw_type_t;
+
+typedef enum {
+ RW_NONE = 0,
+ RW_WRITER = 1,
+ RW_READER = 2
+} krw_t;
+
+/*
+ * If CONFIG_RWSEM_SPIN_ON_OWNER is defined, rw_semaphore will have an owner
+ * field, so we don't need our own.
+ */
+typedef struct {
+ struct rw_semaphore rw_rwlock;
+#ifndef CONFIG_RWSEM_SPIN_ON_OWNER
+ kthread_t *rw_owner;
+#endif
+#ifdef CONFIG_LOCKDEP
+ krw_type_t rw_type;
+#endif /* CONFIG_LOCKDEP */
+} krwlock_t;
+
+#define SEM(rwp) (&(rwp)->rw_rwlock)
+
+static inline void
+spl_rw_set_owner(krwlock_t *rwp)
+{
+/*
+ * If CONFIG_RWSEM_SPIN_ON_OWNER is defined, down_write, up_write,
+ * downgrade_write and __init_rwsem will set/clear owner for us.
+ */
+#ifndef CONFIG_RWSEM_SPIN_ON_OWNER
+ rwp->rw_owner = current;
+#endif
+}
+
+static inline void
+spl_rw_clear_owner(krwlock_t *rwp)
+{
+#ifndef CONFIG_RWSEM_SPIN_ON_OWNER
+ rwp->rw_owner = NULL;
+#endif
+}
+
+static inline kthread_t *
+rw_owner(krwlock_t *rwp)
+{
+#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
+ return (SEM(rwp)->owner);
+#else
+ return (rwp->rw_owner);
+#endif
+}
+
+#ifdef CONFIG_LOCKDEP
+static inline void
+spl_rw_set_type(krwlock_t *rwp, krw_type_t type)
+{
+ rwp->rw_type = type;
+}
+static inline void
+spl_rw_lockdep_off_maybe(krwlock_t *rwp)
+{
+	if (rwp && rwp->rw_type == RW_NOLOCKDEP)
+		lockdep_off();
+}
+static inline void
+spl_rw_lockdep_on_maybe(krwlock_t *rwp)
+{
+	if (rwp && rwp->rw_type == RW_NOLOCKDEP)
+		lockdep_on();
+}
+#else /* CONFIG_LOCKDEP */
+#define spl_rw_set_type(rwp, type)
+#define spl_rw_lockdep_off_maybe(rwp)
+#define spl_rw_lockdep_on_maybe(rwp)
+#endif /* CONFIG_LOCKDEP */
+
+static inline int
+RW_READ_HELD(krwlock_t *rwp)
+{
+	/*
+	 * Linux 4.8 sets the owner to 1 when the rwsem is read-held,
+	 * instead of leaving it NULL; accept any owner value <= 1.
+	 */
+ return (spl_rwsem_is_locked(SEM(rwp)) &&
+ (unsigned long)rw_owner(rwp) <= 1);
+}
+
+static inline int
+RW_WRITE_HELD(krwlock_t *rwp)
+{
+ return (rw_owner(rwp) == current);
+}
+
+static inline int
+RW_LOCK_HELD(krwlock_t *rwp)
+{
+ return (spl_rwsem_is_locked(SEM(rwp)));
+}
+
+/*
+ * The following functions must be #define's and not static inline
+ * functions.  This ensures that the native Linux semaphore functions
+ * (down/up) are correctly attributed to the caller's code, which is
+ * important for the built-in kernel lock analysis tools.
+ */
+/* BEGIN CSTYLED */
+#define rw_init(rwp, name, type, arg) \
+({ \
+ static struct lock_class_key __key; \
+ ASSERT(type == RW_DEFAULT || type == RW_NOLOCKDEP); \
+ \
+ __init_rwsem(SEM(rwp), #rwp, &__key); \
+ spl_rw_clear_owner(rwp); \
+ spl_rw_set_type(rwp, type); \
+})
+
+#define rw_destroy(rwp) \
+({ \
+ VERIFY(!RW_LOCK_HELD(rwp)); \
+})
+
+#define rw_tryenter(rwp, rw) \
+({ \
+ int _rc_ = 0; \
+ \
+ spl_rw_lockdep_off_maybe(rwp); \
+ switch (rw) { \
+ case RW_READER: \
+ _rc_ = down_read_trylock(SEM(rwp)); \
+ break; \
+ case RW_WRITER: \
+ if ((_rc_ = down_write_trylock(SEM(rwp)))) \
+ spl_rw_set_owner(rwp); \
+ break; \
+ default: \
+ VERIFY(0); \
+ } \
+ spl_rw_lockdep_on_maybe(rwp); \
+ _rc_; \
+})
+
+#define rw_enter(rwp, rw) \
+({ \
+ spl_rw_lockdep_off_maybe(rwp); \
+ switch (rw) { \
+ case RW_READER: \
+ down_read(SEM(rwp)); \
+ break; \
+ case RW_WRITER: \
+ down_write(SEM(rwp)); \
+ spl_rw_set_owner(rwp); \
+ break; \
+ default: \
+ VERIFY(0); \
+ } \
+ spl_rw_lockdep_on_maybe(rwp); \
+})
+
+#define rw_exit(rwp) \
+({ \
+ spl_rw_lockdep_off_maybe(rwp); \
+ if (RW_WRITE_HELD(rwp)) { \
+ spl_rw_clear_owner(rwp); \
+ up_write(SEM(rwp)); \
+ } else { \
+ ASSERT(RW_READ_HELD(rwp)); \
+ up_read(SEM(rwp)); \
+ } \
+ spl_rw_lockdep_on_maybe(rwp); \
+})
+
+#define rw_downgrade(rwp) \
+({ \
+ spl_rw_lockdep_off_maybe(rwp); \
+ spl_rw_clear_owner(rwp); \
+ downgrade_write(SEM(rwp)); \
+ spl_rw_lockdep_on_maybe(rwp); \
+})
+
+#define rw_tryupgrade(rwp) \
+({ \
+ int _rc_ = 0; \
+ \
+ if (RW_WRITE_HELD(rwp)) { \
+ _rc_ = 1; \
+ } else { \
+ spl_rw_lockdep_off_maybe(rwp); \
+ if ((_rc_ = rwsem_tryupgrade(SEM(rwp)))) \
+ spl_rw_set_owner(rwp); \
+ spl_rw_lockdep_on_maybe(rwp); \
+ } \
+ _rc_; \
+})
+/* END CSTYLED */
+
+int spl_rw_init(void);
+void spl_rw_fini(void);
+int rwsem_tryupgrade(struct rw_semaphore *rwsem);
+
+#endif /* _SPL_RWLOCK_H */
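
A usage sketch for the rwlock API above; bar_t is a hypothetical type. It shows the opportunistic reader-to-writer upgrade with a fallback to a full writer re-acquire when rwsem_tryupgrade() fails.

typedef struct bar {
	krwlock_t	b_lock;
	int		b_state;
} bar_t;

static void
bar_example(bar_t *bp)
{
	rw_init(&bp->b_lock, NULL, RW_DEFAULT, NULL);

	rw_enter(&bp->b_lock, RW_READER);
	if (bp->b_state == 0 && !rw_tryupgrade(&bp->b_lock)) {
		/* Upgrade failed; drop the read lock, take the write lock. */
		rw_exit(&bp->b_lock);
		rw_enter(&bp->b_lock, RW_WRITER);
	}

	if (RW_WRITE_HELD(&bp->b_lock))
		bp->b_state = 1;

	rw_exit(&bp->b_lock);
	rw_destroy(&bp->b_lock);
}
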
diff --git a/include/spl/sys/shrinker.h b/include/spl/sys/shrinker.h
new file mode 100644
index 000000000..28c1fa78c
--- /dev/null
+++ b/include/spl/sys/shrinker.h
@@ -0,0 +1,209 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _SPL_SHRINKER_H
+#define _SPL_SHRINKER_H
+
+#include <linux/mm.h>
+#include <linux/fs.h>
+
+#if !defined(HAVE_SHRINK_CONTROL_STRUCT)
+struct shrink_control {
+ gfp_t gfp_mask;
+ unsigned long nr_to_scan;
+};
+#endif /* HAVE_SHRINK_CONTROL_STRUCT */
+
+/*
+ * Due to frequent changes in the shrinker API, the following
+ * compatibility wrappers should be used.  They are as follows:
+ *
+ * SPL_SHRINKER_DECLARE is used to declare the shrinker which is
+ * passed to spl_register_shrinker()/spl_unregister_shrinker(). Use
+ * shrinker_name to set the shrinker variable name, shrinker_callback
+ * to set the callback function, and seek_cost to define the cost of
+ * reclaiming an object.
+ *
+ * SPL_SHRINKER_DECLARE(shrinker_name, shrinker_callback, seek_cost);
+ *
+ * SPL_SHRINKER_CALLBACK_FWD_DECLARE is used when a forward declaration
+ * of the shrinker callback function is required. Only the callback
+ * function needs to be passed.
+ *
+ * SPL_SHRINKER_CALLBACK_FWD_DECLARE(shrinker_callback);
+ *
+ * SPL_SHRINKER_CALLBACK_WRAPPER is used to declare the callback function
+ * which is registered with the shrinker.  This function will call your
+ * custom shrinker, which must use the following prototype.  Notice the
+ * leading __'s; they must be prepended to the callback_function name.
+ *
+ * int __shrinker_callback(struct shrinker *, struct shrink_control *)
+ * SPL_SHRINKER_CALLBACK_WRAPPER(shrinker_callback);
+ *
+ * Example:
+ *
+ * SPL_SHRINKER_CALLBACK_FWD_DECLARE(my_shrinker_fn);
+ * SPL_SHRINKER_DECLARE(my_shrinker, my_shrinker_fn, 1);
+ *
+ * static int
+ * __my_shrinker_fn(struct shrinker *shrink, struct shrink_control *sc)
+ * {
+ * if (sc->nr_to_scan) {
+ * ...scan objects in the cache and reclaim them...
+ * }
+ *
+ * ...calculate number of objects in the cache...
+ *
+ * return (number of objects in the cache);
+ * }
+ * SPL_SHRINKER_CALLBACK_WRAPPER(my_shrinker_fn);
+ */
+
+#define spl_register_shrinker(x) register_shrinker(x)
+#define spl_unregister_shrinker(x) unregister_shrinker(x)
+
+/*
+ * Linux 2.6.23 - 2.6.34 Shrinker API Compatibility.
+ */
+#if defined(HAVE_2ARGS_OLD_SHRINKER_CALLBACK)
+#define SPL_SHRINKER_DECLARE(s, x, y) \
+static struct shrinker s = { \
+ .shrink = x, \
+ .seeks = y \
+}
+
+#define SPL_SHRINKER_CALLBACK_FWD_DECLARE(fn) \
+static int fn(int nr_to_scan, unsigned int gfp_mask)
+
+#define SPL_SHRINKER_CALLBACK_WRAPPER(fn) \
+static int \
+fn(int nr_to_scan, unsigned int gfp_mask) \
+{ \
+ struct shrink_control sc; \
+ \
+ sc.nr_to_scan = nr_to_scan; \
+ sc.gfp_mask = gfp_mask; \
+ \
+ return (__ ## fn(NULL, &sc)); \
+}
+
+/*
+ * Linux 2.6.35 to 2.6.39 Shrinker API Compatibility.
+ */
+#elif defined(HAVE_3ARGS_SHRINKER_CALLBACK)
+#define SPL_SHRINKER_DECLARE(s, x, y) \
+static struct shrinker s = { \
+ .shrink = x, \
+ .seeks = y \
+}
+
+#define SPL_SHRINKER_CALLBACK_FWD_DECLARE(fn) \
+static int fn(struct shrinker *, int, unsigned int)
+
+#define SPL_SHRINKER_CALLBACK_WRAPPER(fn) \
+static int \
+fn(struct shrinker *shrink, int nr_to_scan, unsigned int gfp_mask) \
+{ \
+ struct shrink_control sc; \
+ \
+ sc.nr_to_scan = nr_to_scan; \
+ sc.gfp_mask = gfp_mask; \
+ \
+ return (__ ## fn(shrink, &sc)); \
+}
+
+/*
+ * Linux 3.0 to 3.11 Shrinker API Compatibility.
+ */
+#elif defined(HAVE_2ARGS_NEW_SHRINKER_CALLBACK)
+#define SPL_SHRINKER_DECLARE(s, x, y) \
+static struct shrinker s = { \
+ .shrink = x, \
+ .seeks = y \
+}
+
+#define SPL_SHRINKER_CALLBACK_FWD_DECLARE(fn) \
+static int fn(struct shrinker *, struct shrink_control *)
+
+#define SPL_SHRINKER_CALLBACK_WRAPPER(fn) \
+static int \
+fn(struct shrinker *shrink, struct shrink_control *sc) \
+{ \
+ return (__ ## fn(shrink, sc)); \
+}
+
+/*
+ * Linux 3.12 and later Shrinker API Compatibility.
+ */
+#elif defined(HAVE_SPLIT_SHRINKER_CALLBACK)
+#define SPL_SHRINKER_DECLARE(s, x, y) \
+static struct shrinker s = { \
+ .count_objects = x ## _count_objects, \
+ .scan_objects = x ## _scan_objects, \
+ .seeks = y \
+}
+
+#define SPL_SHRINKER_CALLBACK_FWD_DECLARE(fn) \
+static unsigned long fn ## _count_objects(struct shrinker *, \
+ struct shrink_control *); \
+static unsigned long fn ## _scan_objects(struct shrinker *, \
+ struct shrink_control *)
+
+#define SPL_SHRINKER_CALLBACK_WRAPPER(fn) \
+static unsigned long \
+fn ## _count_objects(struct shrinker *shrink, struct shrink_control *sc)\
+{ \
+ int __ret__; \
+ \
+ sc->nr_to_scan = 0; \
+ __ret__ = __ ## fn(NULL, sc); \
+ \
+ /* Errors may not be returned and must be converted to zeros */ \
+ return ((__ret__ < 0) ? 0 : __ret__); \
+} \
+ \
+static unsigned long \
+fn ## _scan_objects(struct shrinker *shrink, struct shrink_control *sc) \
+{ \
+ int __ret__; \
+ \
+ __ret__ = __ ## fn(NULL, sc); \
+ return ((__ret__ < 0) ? SHRINK_STOP : __ret__); \
+}
+#else
+/*
+ * The kernel is either older than 2.6.23, or a newer, unrecognized
+ * shrinker API has been introduced.
+ */
+#error "Unknown shrinker callback"
+#endif
+
+#if defined(HAVE_SPLIT_SHRINKER_CALLBACK)
+typedef unsigned long spl_shrinker_t;
+#else
+typedef int spl_shrinker_t;
+#define SHRINK_STOP (-1)
+#endif
+
+#endif /* _SPL_SHRINKER_H */
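
Tying the wrappers together, a registration might look like the sketch below. This is illustrative only: my_cache_fn, spl_n_cached(), and the reclaim logic are hypothetical; DEFAULT_SEEKS comes from linux/mm.h.

extern int spl_n_cached(void);	/* hypothetical cached-object counter */

SPL_SHRINKER_CALLBACK_FWD_DECLARE(my_cache_fn);
SPL_SHRINKER_DECLARE(my_cache_shrinker, my_cache_fn, DEFAULT_SEEKS);

static int
__my_cache_fn(struct shrinker *shrink, struct shrink_control *sc)
{
	if (sc->nr_to_scan) {
		/* ...reclaim up to sc->nr_to_scan cached objects... */
	}

	return (spl_n_cached());	/* objects remaining in the cache */
}
SPL_SHRINKER_CALLBACK_WRAPPER(my_cache_fn);

static int
my_init(void)
{
	spl_register_shrinker(&my_cache_shrinker);
	return (0);
}

static void
my_fini(void)
{
	spl_unregister_shrinker(&my_cache_shrinker);
}
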
diff --git a/include/spl/sys/sid.h b/include/spl/sys/sid.h
new file mode 100644
index 000000000..731b62c47
--- /dev/null
+++ b/include/spl/sys/sid.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _SPL_SID_H
+#define _SPL_SID_H
+
+typedef struct ksiddomain {
+ char *kd_name;
+} ksiddomain_t;
+
+typedef enum ksid_index {
+ KSID_USER,
+ KSID_GROUP,
+ KSID_OWNER,
+ KSID_COUNT
+} ksid_index_t;
+
+typedef int ksid_t;
+
+static inline ksiddomain_t *
+ksid_lookupdomain(const char *dom)
+{
+ ksiddomain_t *kd;
+ int len = strlen(dom);
+
+ kd = kmem_zalloc(sizeof (ksiddomain_t), KM_SLEEP);
+ kd->kd_name = kmem_zalloc(len + 1, KM_SLEEP);
+ memcpy(kd->kd_name, dom, len);
+
+ return (kd);
+}
+
+static inline void
+ksiddomain_rele(ksiddomain_t *ksid)
+{
+ kmem_free(ksid->kd_name, strlen(ksid->kd_name) + 1);
+ kmem_free(ksid, sizeof (ksiddomain_t));
+}
+
+#endif /* _SPL_SID_H */
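
The lookup/release pair above is symmetric; the caller owns the returned ksiddomain_t only between the two calls. A small sketch (the domain string is arbitrary):

static void
sid_example(void)
{
	ksiddomain_t *kd;

	kd = ksid_lookupdomain("example.org");
	/* ...use kd->kd_name while held... */
	ksiddomain_rele(kd);
}
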
diff --git a/include/spl/sys/signal.h b/include/spl/sys/signal.h
new file mode 100644
index 000000000..36b8b5d98
--- /dev/null
+++ b/include/spl/sys/signal.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _SPL_SIGNAL_H
+#define _SPL_SIGNAL_H
+
+#include <linux/sched.h>
+
+#ifdef HAVE_SCHED_SIGNAL_HEADER
+#include <linux/sched/signal.h>
+#endif
+
+#define FORREAL 0 /* Usual side-effects */
+#define JUSTLOOKING 1 /* Don't stop the process */
+
+/*
+ * The "why" argument indicates the allowable side-effects of the call:
+ *
+ * FORREAL: Extract the next pending signal from p_sig into p_cursig;
+ * stop the process if a stop has been requested or if a traced signal
+ * is pending.
+ *
+ * JUSTLOOKING: Don't stop the process, just indicate whether or not
+ * a signal might be pending (FORREAL is needed to tell for sure).
+ */
+static __inline__ int
+issig(int why)
+{
+ ASSERT(why == FORREAL || why == JUSTLOOKING);
+
+ return (signal_pending(current));
+}
+
+#endif /* _SPL_SIGNAL_H */
diff --git a/include/spl/sys/stat.h b/include/spl/sys/stat.h
new file mode 100644
index 000000000..83018e894
--- /dev/null
+++ b/include/spl/sys/stat.h
@@ -0,0 +1,30 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _SPL_STAT_H
+#define _SPL_STAT_H
+
+#include <linux/stat.h>
+
+#endif /* _SPL_STAT_H */
diff --git a/include/spl/sys/strings.h b/include/spl/sys/strings.h
new file mode 100644
index 000000000..4fb803206
--- /dev/null
+++ b/include/spl/sys/strings.h
@@ -0,0 +1,31 @@
+/*
+ * Copyright (C) 2018 Lawrence Livermore National Security, LLC.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+#ifndef _SPL_SYS_STRINGS_H
+#define _SPL_SYS_STRINGS_H
+
+#include <linux/string.h>
+
+#define bzero(ptr, size) memset(ptr, 0, size)
+#define bcopy(src, dest, size) memmove(dest, src, size)
+#define bcmp(src, dest, size) memcmp((src), (dest), (size_t)(size))
+
+#endif /* _SPL_SYS_STRINGS_H */
diff --git a/include/spl/sys/sunddi.h b/include/spl/sys/sunddi.h
new file mode 100644
index 000000000..29a6fe00d
--- /dev/null
+++ b/include/spl/sys/sunddi.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _SPL_SUNDDI_H
+#define _SPL_SUNDDI_H
+
+#include <sys/cred.h>
+#include <sys/uio.h>
+#include <sys/mutex.h>
+#include <sys/u8_textprep.h>
+#include <sys/vnode.h>
+
+typedef int ddi_devid_t;
+
+#define DDI_DEV_T_NONE ((dev_t)-1)
+#define DDI_DEV_T_ANY ((dev_t)-2)
+#define DI_MAJOR_T_UNKNOWN ((major_t)0)
+
+#define DDI_PROP_DONTPASS 0x0001
+#define DDI_PROP_CANSLEEP 0x0002
+
+#define DDI_SUCCESS 0
+#define DDI_FAILURE -1
+
+#define ddi_prop_lookup_string(x1, x2, x3, x4, x5) (*x5 = NULL)
+#define ddi_prop_free(x) (void)0
+#define ddi_root_node() (void)0
+
+extern int ddi_strtoul(const char *, char **, int, unsigned long *);
+extern int ddi_strtol(const char *, char **, int, long *);
+extern int ddi_strtoull(const char *, char **, int, unsigned long long *);
+extern int ddi_strtoll(const char *, char **, int, long long *);
+
+extern int ddi_copyin(const void *from, void *to, size_t len, int flags);
+extern int ddi_copyout(const void *from, void *to, size_t len, int flags);
+
+#endif /* _SPL_SUNDDI_H */
diff --git a/include/spl/sys/sysmacros.h b/include/spl/sys/sysmacros.h
new file mode 100644
index 000000000..839e7fd8c
--- /dev/null
+++ b/include/spl/sys/sysmacros.h
@@ -0,0 +1,228 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _SPL_SYSMACROS_H
+#define _SPL_SYSMACROS_H
+
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/cpumask.h>
+#include <sys/debug.h>
+#include <sys/zone.h>
+#include <sys/signal.h>
+#include <asm/page.h>
+
+#ifdef HAVE_SCHED_RT_HEADER
+#include <linux/sched/rt.h>
+#endif
+
+#ifndef _KERNEL
+#define _KERNEL __KERNEL__
+#endif
+
+#define FALSE 0
+#define TRUE 1
+
+#define INT8_MAX (127)
+#define INT8_MIN (-128)
+#define UINT8_MAX (255)
+#define UINT8_MIN (0)
+
+#define INT16_MAX (32767)
+#define INT16_MIN (-32768)
+#define UINT16_MAX (65535)
+#define UINT16_MIN (0)
+
+#define INT32_MAX INT_MAX
+#define INT32_MIN INT_MIN
+#define UINT32_MAX UINT_MAX
+#define UINT32_MIN UINT_MIN
+
+#define INT64_MAX LLONG_MAX
+#define INT64_MIN LLONG_MIN
+#define UINT64_MAX ULLONG_MAX
+#define UINT64_MIN ULLONG_MIN
+
+#define NBBY 8
+
+#define MAXMSGLEN 256
+#define MAXNAMELEN 256
+#define MAXPATHLEN 4096
+#define MAXOFFSET_T LLONG_MAX
+#define MAXBSIZE 8192
+#define DEV_BSIZE 512
+#define DEV_BSHIFT 9 /* log2(DEV_BSIZE) */
+
+#define proc_pageout NULL
+#define curproc current
+#define max_ncpus num_possible_cpus()
+#define boot_ncpus num_online_cpus()
+#define CPU_SEQID smp_processor_id()
+#define _NOTE(x)
+#define is_system_labeled() 0
+
+#ifndef RLIM64_INFINITY
+#define RLIM64_INFINITY (~0ULL)
+#endif
+
+/*
+ * 0..MAX_PRIO-1: Process priority
+ * 0..MAX_RT_PRIO-1: RT priority tasks
+ * MAX_RT_PRIO..MAX_PRIO-1: SCHED_NORMAL tasks
+ *
+ * Treat shim tasks as SCHED_NORMAL tasks
+ */
+#define minclsyspri (MAX_PRIO-1)
+#define maxclsyspri (MAX_RT_PRIO)
+#define defclsyspri (DEFAULT_PRIO)
+
+#ifndef NICE_TO_PRIO
+#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
+#endif
+#ifndef PRIO_TO_NICE
+#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)
+#endif
+
+/*
+ * Missing macros
+ */
+#ifndef PAGESIZE
+#define PAGESIZE PAGE_SIZE
+#endif
+
+#ifndef PAGESHIFT
+#define PAGESHIFT PAGE_SHIFT
+#endif
+
+/* Dtrace probes do not exist in the linux kernel */
+#ifdef DTRACE_PROBE
+#undef DTRACE_PROBE
+#endif /* DTRACE_PROBE */
+#define DTRACE_PROBE(a) ((void)0)
+
+#ifdef DTRACE_PROBE1
+#undef DTRACE_PROBE1
+#endif /* DTRACE_PROBE1 */
+#define DTRACE_PROBE1(a, b, c) ((void)0)
+
+#ifdef DTRACE_PROBE2
+#undef DTRACE_PROBE2
+#endif /* DTRACE_PROBE2 */
+#define DTRACE_PROBE2(a, b, c, d, e) ((void)0)
+
+#ifdef DTRACE_PROBE3
+#undef DTRACE_PROBE3
+#endif /* DTRACE_PROBE3 */
+#define DTRACE_PROBE3(a, b, c, d, e, f, g) ((void)0)
+
+#ifdef DTRACE_PROBE4
+#undef DTRACE_PROBE4
+#endif /* DTRACE_PROBE4 */
+#define DTRACE_PROBE4(a, b, c, d, e, f, g, h, i) ((void)0)
+
+/* Missing globals */
+extern char spl_version[32];
+extern unsigned long spl_hostid;
+
+/* Missing misc functions */
+extern uint32_t zone_get_hostid(void *zone);
+extern void spl_setup(void);
+extern void spl_cleanup(void);
+
+#define highbit(x) __fls(x)
+#define lowbit(x) __ffs(x)
+
+#define highbit64(x) fls64(x)
+#define makedevice(maj, min) makedev(maj, min)
+
+/* common macros */
+#ifndef MIN
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+#endif
+#ifndef MAX
+#define MAX(a, b) ((a) < (b) ? (b) : (a))
+#endif
+#ifndef ABS
+#define ABS(a) ((a) < 0 ? -(a) : (a))
+#endif
+#ifndef DIV_ROUND_UP
+#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))
+#endif
+#ifndef roundup
+#define roundup(x, y) ((((x) + ((y) - 1)) / (y)) * (y))
+#endif
+#ifndef howmany
+#define howmany(x, y) (((x) + ((y) - 1)) / (y))
+#endif
+
+/*
+ * Compatibility macros/typedefs needed for Solaris -> Linux port
+ */
+#define P2ALIGN(x, align) ((x) & -(align))
+#define P2CROSS(x, y, align) (((x) ^ (y)) > (align) - 1)
+#define P2ROUNDUP(x, align) ((((x) - 1) | ((align) - 1)) + 1)
+#define P2PHASE(x, align) ((x) & ((align) - 1))
+#define P2NPHASE(x, align) (-(x) & ((align) - 1))
+#define ISP2(x) (((x) & ((x) - 1)) == 0)
+#define IS_P2ALIGNED(v, a) ((((uintptr_t)(v)) & ((uintptr_t)(a) - 1)) == 0)
+#define P2BOUNDARY(off, len, align) \
+ (((off) ^ ((off) + (len) - 1)) > (align) - 1)
+
+/*
+ * Typed version of the P2* macros. These macros should be used to ensure
+ * that the result is correctly calculated based on the data type of (x),
+ * which is passed in as the last argument, regardless of the data
+ * type of the alignment. For example, if (x) is of type uint64_t,
+ * and we want to round it up to a page boundary using "PAGESIZE" as
+ * the alignment, we can do either
+ *
+ * P2ROUNDUP(x, (uint64_t)PAGESIZE)
+ * or
+ * P2ROUNDUP_TYPED(x, PAGESIZE, uint64_t)
+ */
+#define P2ALIGN_TYPED(x, align, type) \
+ ((type)(x) & -(type)(align))
+#define P2PHASE_TYPED(x, align, type) \
+ ((type)(x) & ((type)(align) - 1))
+#define P2NPHASE_TYPED(x, align, type) \
+ (-(type)(x) & ((type)(align) - 1))
+#define P2ROUNDUP_TYPED(x, align, type) \
+ ((((type)(x) - 1) | ((type)(align) - 1)) + 1)
+#define P2END_TYPED(x, align, type) \
+ (-(~(type)(x) & -(type)(align)))
+#define P2PHASEUP_TYPED(x, align, phase, type) \
+ ((type)(phase) - (((type)(phase) - (type)(x)) & -(type)(align)))
+#define P2CROSS_TYPED(x, y, align, type) \
+ (((type)(x) ^ (type)(y)) > (type)(align) - 1)
+#define P2SAMEHIGHBIT_TYPED(x, y, type) \
+ (((type)(x) ^ (type)(y)) < ((type)(x) & (type)(y)))
+
+#if defined(_KERNEL) && !defined(_KMEMUSER) && !defined(offsetof)
+
+/* avoid any possibility of clashing with <stddef.h> version */
+
+#define offsetof(s, m) ((size_t)(&(((s *)0)->m)))
+#endif
+
+#endif /* _SPL_SYSMACROS_H */
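
As a quick sanity check on the power-of-two helpers above, a few worked values; the 4096-byte alignment is illustrative.

static void
p2_examples(void)
{
	ASSERT3U(P2ALIGN(5000, 4096), ==, 4096);	/* round down */
	ASSERT3U(P2ROUNDUP(5000, 4096), ==, 8192);	/* round up */
	ASSERT3U(P2PHASE(5000, 4096), ==, 904);		/* offset into unit */
	ASSERT3U(P2NPHASE(5000, 4096), ==, 3192);	/* bytes to next unit */
	ASSERT(ISP2(4096));
	ASSERT(!ISP2(5000));
	ASSERT3U(P2ROUNDUP_TYPED(5000ULL, 4096, uint64_t), ==, 8192);
}
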
diff --git a/include/spl/sys/systeminfo.h b/include/spl/sys/systeminfo.h
new file mode 100644
index 000000000..225569158
--- /dev/null
+++ b/include/spl/sys/systeminfo.h
@@ -0,0 +1,36 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _SPL_SYSTEMINFO_H
+#define _SPL_SYSTEMINFO_H
+
+#define HW_HOSTID_LEN 11 /* minimum buffer size needed */
+ /* to hold a decimal or hex */
+ /* hostid string */
+
+/* Supplemental definitions for Linux. */
+#define HW_HOSTID_PATH "/etc/hostid" /* binary configuration file */
+#define HW_HOSTID_MASK 0xFFFFFFFF /* significant hostid bits */
+
+#endif /* _SPL_SYSTEMINFO_H */
diff --git a/include/spl/sys/taskq.h b/include/spl/sys/taskq.h
new file mode 100644
index 000000000..7353367a2
--- /dev/null
+++ b/include/spl/sys/taskq.h
@@ -0,0 +1,163 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _SPL_TASKQ_H
+#define _SPL_TASKQ_H
+
+#include <linux/module.h>
+#include <linux/gfp.h>
+#include <linux/slab.h>
+#include <linux/interrupt.h>
+#include <linux/kthread.h>
+#include <sys/types.h>
+#include <sys/thread.h>
+#include <sys/rwlock.h>
+#include <sys/wait.h>
+
+#define TASKQ_NAMELEN 31
+
+#define TASKQ_PREPOPULATE 0x00000001
+#define TASKQ_CPR_SAFE 0x00000002
+#define TASKQ_DYNAMIC 0x00000004
+#define TASKQ_THREADS_CPU_PCT 0x00000008
+#define TASKQ_DC_BATCH 0x00000010
+#define TASKQ_ACTIVE 0x80000000
+
+/*
+ * Flags for taskq_dispatch. TQ_SLEEP/TQ_NOSLEEP should be same as
+ * KM_SLEEP/KM_NOSLEEP. TQ_NOQUEUE/TQ_NOALLOC are set particularly
+ * large so as not to conflict with already used GFP_* defines.
+ */
+#define TQ_SLEEP 0x00000000
+#define TQ_NOSLEEP 0x00000001
+#define TQ_PUSHPAGE 0x00000002
+#define TQ_NOQUEUE 0x01000000
+#define TQ_NOALLOC 0x02000000
+#define TQ_NEW 0x04000000
+#define TQ_FRONT 0x08000000
+
+/*
+ * Reserved taskqid values.
+ */
+#define TASKQID_INVALID ((taskqid_t)0)
+#define TASKQID_INITIAL ((taskqid_t)1)
+
+/*
+ * spin_lock(lock) and spin_lock_nested(lock,0) are equivalent,
+ * so TQ_LOCK_DYNAMIC must not evaluate to 0
+ */
+typedef enum tq_lock_role {
+ TQ_LOCK_GENERAL = 0,
+ TQ_LOCK_DYNAMIC = 1,
+} tq_lock_role_t;
+
+typedef unsigned long taskqid_t;
+typedef void (task_func_t)(void *);
+
+typedef struct taskq {
+ spinlock_t tq_lock; /* protects taskq_t */
+ char *tq_name; /* taskq name */
+ int tq_instance; /* instance of tq_name */
+ struct list_head tq_thread_list; /* list of all threads */
+ struct list_head tq_active_list; /* list of active threads */
+ int tq_nactive; /* # of active threads */
+ int tq_nthreads; /* # of existing threads */
+ int tq_nspawn; /* # of threads being spawned */
+ int tq_maxthreads; /* # of threads maximum */
+ int tq_pri; /* priority */
+ int tq_minalloc; /* min taskq_ent_t pool size */
+ int tq_maxalloc; /* max taskq_ent_t pool size */
+ int tq_nalloc; /* cur taskq_ent_t pool size */
+ uint_t tq_flags; /* flags */
+ taskqid_t tq_next_id; /* next pend/work id */
+ taskqid_t tq_lowest_id; /* lowest pend/work id */
+ struct list_head tq_free_list; /* free taskq_ent_t's */
+ struct list_head tq_pend_list; /* pending taskq_ent_t's */
+ struct list_head tq_prio_list; /* priority taskq_ent_t's */
+ struct list_head tq_delay_list; /* delayed taskq_ent_t's */
+ struct list_head tq_taskqs; /* all taskq_t's */
+ spl_wait_queue_head_t tq_work_waitq; /* new work waitq */
+ spl_wait_queue_head_t tq_wait_waitq; /* wait waitq */
+ tq_lock_role_t tq_lock_class; /* class when taking tq_lock */
+} taskq_t;
+
+typedef struct taskq_ent {
+ spinlock_t tqent_lock;
+ spl_wait_queue_head_t tqent_waitq;
+ struct timer_list tqent_timer;
+ struct list_head tqent_list;
+ taskqid_t tqent_id;
+ task_func_t *tqent_func;
+ void *tqent_arg;
+ taskq_t *tqent_taskq;
+ uintptr_t tqent_flags;
+ unsigned long tqent_birth;
+} taskq_ent_t;
+
+#define TQENT_FLAG_PREALLOC 0x1
+#define TQENT_FLAG_CANCEL 0x2
+
+typedef struct taskq_thread {
+ struct list_head tqt_thread_list;
+ struct list_head tqt_active_list;
+ struct task_struct *tqt_thread;
+ taskq_t *tqt_tq;
+ taskqid_t tqt_id;
+ taskq_ent_t *tqt_task;
+ uintptr_t tqt_flags;
+} taskq_thread_t;
+
+/* Global system-wide dynamic task queue available for all consumers */
+extern taskq_t *system_taskq;
+/* Global dynamic task queue for long delay */
+extern taskq_t *system_delay_taskq;
+
+/* List of all taskqs */
+extern struct list_head tq_list;
+extern struct rw_semaphore tq_list_sem;
+
+extern taskqid_t taskq_dispatch(taskq_t *, task_func_t, void *, uint_t);
+extern taskqid_t taskq_dispatch_delay(taskq_t *, task_func_t, void *,
+ uint_t, clock_t);
+extern void taskq_dispatch_ent(taskq_t *, task_func_t, void *, uint_t,
+ taskq_ent_t *);
+extern int taskq_empty_ent(taskq_ent_t *);
+extern void taskq_init_ent(taskq_ent_t *);
+extern taskq_t *taskq_create(const char *, int, pri_t, int, int, uint_t);
+extern void taskq_destroy(taskq_t *);
+extern void taskq_wait_id(taskq_t *, taskqid_t);
+extern void taskq_wait_outstanding(taskq_t *, taskqid_t);
+extern void taskq_wait(taskq_t *);
+extern int taskq_cancel_id(taskq_t *, taskqid_t);
+extern int taskq_member(taskq_t *, kthread_t *);
+
+#define taskq_create_proc(name, nthreads, pri, min, max, proc, flags) \
+ taskq_create(name, nthreads, pri, min, max, flags)
+#define taskq_create_sysdc(name, nthreads, min, max, proc, dc, flags) \
+ taskq_create(name, nthreads, maxclsyspri, min, max, flags)
+
+int spl_taskq_init(void);
+void spl_taskq_fini(void);
+
+#endif /* _SPL_TASKQ_H */
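
A dispatch sketch against the interfaces declared above; my_work() and its argument are hypothetical. taskq_dispatch() returns TASKQID_INVALID when dispatch fails (e.g. a TQ_NOSLEEP allocation failure), so callers should check for it.

static void
my_work(void *arg)
{
	int *done = arg;

	*done = 1;
}

static void
taskq_example(void)
{
	taskq_t *tq;
	taskqid_t id;
	int done = 0;

	tq = taskq_create("example_tq", 4, defclsyspri, 4, INT_MAX,
	    TASKQ_PREPOPULATE);

	id = taskq_dispatch(tq, my_work, &done, TQ_SLEEP);
	if (id == TASKQID_INVALID)
		my_work(&done);		/* dispatch failed; run inline */

	taskq_wait(tq);			/* drain all outstanding work */
	taskq_destroy(tq);
}
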
diff --git a/include/spl/sys/thread.h b/include/spl/sys/thread.h
new file mode 100644
index 000000000..3762717da
--- /dev/null
+++ b/include/spl/sys/thread.h
@@ -0,0 +1,69 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _SPL_THREAD_H
+#define _SPL_THREAD_H
+
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/spinlock.h>
+#include <linux/kthread.h>
+#include <sys/types.h>
+#include <sys/sysmacros.h>
+#include <sys/tsd.h>
+
+/*
+ * Thread interfaces
+ */
+#define TP_MAGIC 0x53535353
+
+#define TS_SLEEP TASK_INTERRUPTIBLE
+#define TS_RUN TASK_RUNNING
+#define TS_ZOMB EXIT_ZOMBIE
+#define TS_STOPPED TASK_STOPPED
+
+typedef void (*thread_func_t)(void *);
+
+/* BEGIN CSTYLED */
+#define thread_create(stk, stksize, func, arg, len, pp, state, pri) \
+ __thread_create(stk, stksize, (thread_func_t)func, \
+ #func, arg, len, pp, state, pri)
+/* END CSTYLED */
+
+#define thread_exit() __thread_exit()
+#define thread_join(t) VERIFY(0)
+#define curthread current
+#define getcomm() current->comm
+#define getpid() current->pid
+
+extern kthread_t *__thread_create(caddr_t stk, size_t stksize,
+ thread_func_t func, const char *name, void *args, size_t len, proc_t *pp,
+ int state, pri_t pri);
+extern void __thread_exit(void);
+extern struct task_struct *spl_kthread_create(int (*func)(void *),
+ void *data, const char namefmt[], ...);
+
+extern proc_t p0;
+
+#endif /* _SPL_THREAD_H */
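
A spawn sketch for the thread interfaces above; my_thread() is hypothetical. Threads created through thread_create() must terminate via thread_exit().

static void
my_thread(void *arg)
{
	/* ...do work with arg... */
	thread_exit();
}

static void
thread_example(void *arg)
{
	kthread_t *t;

	t = thread_create(NULL, 0, my_thread, arg, 0, &p0,
	    TS_RUN, defclsyspri);
	ASSERT3P(t, !=, NULL);
}
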
diff --git a/include/spl/sys/time.h b/include/spl/sys/time.h
new file mode 100644
index 000000000..d6aaca913
--- /dev/null
+++ b/include/spl/sys/time.h
@@ -0,0 +1,82 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _SPL_TIME_H
+#define _SPL_TIME_H
+
+#include <linux/module.h>
+#include <linux/time.h>
+#include <sys/types.h>
+#include <sys/timer.h>
+
+#if defined(CONFIG_64BIT)
+#define TIME_MAX INT64_MAX
+#define TIME_MIN INT64_MIN
+#else
+#define TIME_MAX INT32_MAX
+#define TIME_MIN INT32_MIN
+#endif
+
+#define SEC 1
+#define MILLISEC 1000
+#define MICROSEC 1000000
+#define NANOSEC 1000000000
+
+#define MSEC2NSEC(m) ((hrtime_t)(m) * (NANOSEC / MILLISEC))
+#define NSEC2MSEC(n) ((n) / (NANOSEC / MILLISEC))
+
+#define USEC2NSEC(m) ((hrtime_t)(m) * (NANOSEC / MICROSEC))
+#define NSEC2USEC(n) ((n) / (NANOSEC / MICROSEC))
+
+#define NSEC2SEC(n) ((n) / (NANOSEC / SEC))
+#define SEC2NSEC(m) ((hrtime_t)(m) * (NANOSEC / SEC))
+
+static const int hz = HZ;
+
+#define TIMESPEC_OVERFLOW(ts) \
+ ((ts)->tv_sec < TIME_MIN || (ts)->tv_sec > TIME_MAX)
+
+static inline void
+gethrestime(timestruc_t *now)
+{
+ *now = current_kernel_time();
+}
+
+static inline time_t
+gethrestime_sec(void)
+{
+ struct timespec ts;
+ ts = current_kernel_time();
+ return (ts.tv_sec);
+}
+
+static inline hrtime_t
+gethrtime(void)
+{
+ struct timespec now;
+ getrawmonotonic(&now);
+ return (((hrtime_t)now.tv_sec * NSEC_PER_SEC) + now.tv_nsec);
+}
+
+#endif /* _SPL_TIME_H */
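
gethrtime() pairs naturally with the conversion macros above for elapsed-time measurement, as in this small sketch:

static void
elapsed_example(void)
{
	hrtime_t start, elapsed_ms;

	start = gethrtime();
	/* ...work being timed... */
	elapsed_ms = NSEC2MSEC(gethrtime() - start);
	(void) elapsed_ms;
}
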
diff --git a/include/spl/sys/timer.h b/include/spl/sys/timer.h
new file mode 100644
index 000000000..a6b134570
--- /dev/null
+++ b/include/spl/sys/timer.h
@@ -0,0 +1,75 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _SPL_TIMER_H
+#define _SPL_TIMER_H
+
+#include <linux/module.h>
+#include <linux/delay.h>
+#include <linux/sched.h>
+#include <linux/time.h>
+#include <linux/timer.h>
+
+#define lbolt ((clock_t)jiffies)
+#define lbolt64 ((int64_t)get_jiffies_64())
+
+#define ddi_get_lbolt() ((clock_t)jiffies)
+#define ddi_get_lbolt64() ((int64_t)get_jiffies_64())
+
+#define ddi_time_before(a, b) (typecheck(clock_t, a) && \
+ typecheck(clock_t, b) && \
+ ((a) - (b) < 0))
+#define ddi_time_after(a, b) ddi_time_before(b, a)
+#define ddi_time_before_eq(a, b) (!ddi_time_after(a, b))
+#define ddi_time_after_eq(a, b) ddi_time_before_eq(b, a)
+
+#define ddi_time_before64(a, b) (typecheck(int64_t, a) && \
+ typecheck(int64_t, b) && \
+ ((a) - (b) < 0))
+#define ddi_time_after64(a, b) ddi_time_before64(b, a)
+#define ddi_time_before_eq64(a, b) (!ddi_time_after64(a, b))
+#define ddi_time_after_eq64(a, b) ddi_time_before_eq64(b, a)
+
+#define delay(ticks) schedule_timeout_uninterruptible(ticks)
+
+/* usleep_range() introduced in 2.6.36 */
+#ifndef HAVE_USLEEP_RANGE
+static inline void
+usleep_range(unsigned long min, unsigned long max)
+{
+ unsigned int min_ms = min / USEC_PER_MSEC;
+
+	if (min_ms >= MAX_UDELAY_MS)
+ msleep(min_ms);
+ else
+ udelay(min);
+}
+#endif /* HAVE_USLEEP_RANGE */
+
+#define SEC_TO_TICK(sec) ((sec) * HZ)
+#define MSEC_TO_TICK(ms) msecs_to_jiffies(ms)
+#define USEC_TO_TICK(us) usecs_to_jiffies(us)
+#define	NSEC_TO_TICK(ns)	usecs_to_jiffies((ns) / NSEC_PER_USEC)
+
+#endif /* _SPL_TIMER_H */
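
A polling-with-deadline sketch built from the tick helpers above; the five-second budget and 100ms poll interval are arbitrary. Note that ddi_time_after() type-checks both operands as clock_t.

static void
deadline_example(void)
{
	clock_t deadline = ddi_get_lbolt() + SEC_TO_TICK(5);

	while (!ddi_time_after(ddi_get_lbolt(), deadline))
		delay(MSEC_TO_TICK(100));	/* sleep ~100ms per poll */
}
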
diff --git a/include/spl/sys/tsd.h b/include/spl/sys/tsd.h
new file mode 100644
index 000000000..39a291bf3
--- /dev/null
+++ b/include/spl/sys/tsd.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright (C) 2010 Lawrence Livermore National Security, LLC.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _SPL_TSD_H
+#define _SPL_TSD_H
+
+#include <sys/types.h>
+
+#define TSD_HASH_TABLE_BITS_DEFAULT 9
+#define TSD_KEYS_MAX 32768
+#define DTOR_PID (PID_MAX_LIMIT+1)
+#define PID_KEY (TSD_KEYS_MAX+1)
+
+typedef void (*dtor_func_t)(void *);
+
+extern int tsd_set(uint_t, void *);
+extern void *tsd_get(uint_t);
+extern void *tsd_get_by_thread(uint_t, kthread_t *);
+extern void tsd_create(uint_t *, dtor_func_t);
+extern void tsd_destroy(uint_t *);
+extern void tsd_exit(void);
+
+int spl_tsd_init(void);
+void spl_tsd_fini(void);
+
+#endif /* _SPL_TSD_H */
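Editorial note: a minimal usage sketch of the TSD interface above, assuming sys/kmem.h for kmem_zalloc()/kmem_free(); the 4K per-thread scratch buffer is purely illustrative:

    static uint_t scratch_key;

    static void
    scratch_dtor(void *buf)
    {
            kmem_free(buf, 4096);
    }

    static void *
    get_scratch(void)
    {
            void *buf = tsd_get(scratch_key);

            if (buf == NULL) {
                    buf = kmem_zalloc(4096, KM_SLEEP);
                    (void) tsd_set(scratch_key, buf);
            }
            return (buf);
    }

    /* tsd_create(&scratch_key, scratch_dtor) at module init;
     * tsd_destroy(&scratch_key) at fini runs the dtor for all threads. */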
diff --git a/include/spl/sys/types.h b/include/spl/sys/types.h
new file mode 100644
index 000000000..a5b478127
--- /dev/null
+++ b/include/spl/sys/types.h
@@ -0,0 +1,70 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _SPL_TYPES_H
+#define _SPL_TYPES_H
+
+#include <linux/types.h>
+
+#ifndef ULLONG_MAX
+#define ULLONG_MAX (~0ULL)
+#endif
+
+#ifndef LLONG_MAX
+#define LLONG_MAX ((long long)(~0ULL>>1))
+#endif
+
+typedef enum {
+ B_FALSE = 0,
+ B_TRUE = 1
+} boolean_t;
+
+typedef unsigned char uchar_t;
+typedef unsigned short ushort_t;
+typedef unsigned int uint_t;
+typedef unsigned long ulong_t;
+typedef unsigned long long u_longlong_t;
+typedef long long longlong_t;
+
+typedef unsigned long intptr_t;
+typedef unsigned long long rlim64_t;
+
+typedef struct task_struct kthread_t;
+typedef struct task_struct proc_t;
+
+typedef struct timespec timestruc_t;
+typedef struct timespec timespec_t;
+typedef longlong_t hrtime_t;
+
+typedef int id_t;
+typedef short pri_t;
+typedef short index_t;
+typedef longlong_t offset_t;
+typedef u_longlong_t u_offset_t;
+typedef ulong_t pgcnt_t;
+
+typedef int major_t;
+typedef int minor_t;
+
+#endif /* _SPL_TYPES_H */
diff --git a/include/spl/sys/types32.h b/include/spl/sys/types32.h
new file mode 100644
index 000000000..c60ba8c97
--- /dev/null
+++ b/include/spl/sys/types32.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _SPL_TYPES32_H
+#define _SPL_TYPES32_H
+
+#include <sys/types.h>
+
+typedef uint32_t caddr32_t;
+typedef int32_t daddr32_t;
+typedef int32_t time32_t;
+typedef uint32_t size32_t;
+
+#endif /* _SPL_TYPES32_H */
diff --git a/include/spl/sys/uio.h b/include/spl/sys/uio.h
new file mode 100644
index 000000000..64c452b8d
--- /dev/null
+++ b/include/spl/sys/uio.h
@@ -0,0 +1,106 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Copyright (c) 2015 by Chunwei Chen. All rights reserved.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _SPL_UIO_H
+#define _SPL_UIO_H
+
+#include <linux/uio.h>
+#include <linux/blkdev.h>
+#include <asm/uaccess.h>
+#include <sys/types.h>
+
+typedef struct iovec iovec_t;
+
+typedef enum uio_rw {
+ UIO_READ = 0,
+ UIO_WRITE = 1,
+} uio_rw_t;
+
+typedef enum uio_seg {
+ UIO_USERSPACE = 0,
+ UIO_SYSSPACE = 1,
+ UIO_USERISPACE = 2,
+ UIO_BVEC = 3,
+} uio_seg_t;
+
+typedef struct uio {
+ union {
+ const struct iovec *uio_iov;
+ const struct bio_vec *uio_bvec;
+ };
+ int uio_iovcnt;
+ offset_t uio_loffset;
+ uio_seg_t uio_segflg;
+ uint16_t uio_fmode;
+ uint16_t uio_extflg;
+ offset_t uio_limit;
+ ssize_t uio_resid;
+ size_t uio_skip;
+} uio_t;
+
+typedef struct aio_req {
+ uio_t *aio_uio;
+ void *aio_private;
+} aio_req_t;
+
+typedef enum xuio_type {
+ UIOTYPE_ASYNCIO,
+ UIOTYPE_ZEROCOPY,
+} xuio_type_t;
+
+
+#define UIOA_IOV_MAX 16
+
+typedef struct uioa_page_s {
+ int uioa_pfncnt;
+ void **uioa_ppp;
+ caddr_t uioa_base;
+ size_t uioa_len;
+} uioa_page_t;
+
+typedef struct xuio {
+ uio_t xu_uio;
+ enum xuio_type xu_type;
+ union {
+ struct {
+ uint32_t xu_a_state;
+ ssize_t xu_a_mbytes;
+ uioa_page_t *xu_a_lcur;
+ void **xu_a_lppp;
+ void *xu_a_hwst[4];
+ uioa_page_t xu_a_locked[UIOA_IOV_MAX];
+ } xu_aio;
+
+ struct {
+ int xu_zc_rw;
+ void *xu_zc_priv;
+ } xu_zc;
+ } xu_ext;
+} xuio_t;
+
+#define XUIO_XUZC_PRIV(xuio) xuio->xu_ext.xu_zc.xu_zc_priv
+#define XUIO_XUZC_RW(xuio) xuio->xu_ext.xu_zc.xu_zc_rw
+
+#endif /* _SPL_UIO_H */
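Editorial note: the uio_t above carries a scatter/gather list plus a partial-iovec offset in uio_skip. An illustrative walk over the iovec-backed case only (uio_resid normally caches this count; this just shows how the fields fit together):

    static size_t
    uio_bytes_remaining(const uio_t *uio)
    {
            size_t skip = uio->uio_skip;    /* offset into first iovec */
            size_t total = 0;
            int i;

            for (i = 0; i < uio->uio_iovcnt; i++) {
                    total += uio->uio_iov[i].iov_len - skip;
                    skip = 0;       /* only the first iovec is skipped */
            }
            return (total);
    }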
diff --git a/include/spl/sys/user.h b/include/spl/sys/user.h
new file mode 100644
index 000000000..b12cb240e
--- /dev/null
+++ b/include/spl/sys/user.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (C) 2015 ClusterHQ Inc.
+ * Produced at ClusterHQ Inc (cf, DISCLAIMER).
+ * Written by Richard Yao <richard.yao@clusterhq.com>.
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _SPL_USER_H
+#define _SPL_USER_H
+
+/*
+ * We have uf_info_t for areleasef(). We implement areleasef() using a global
+ * linked list of all open file descriptors with the task structs referenced,
+ * so accessing the correct descriptor from areleasef() only requires knowing
+ * about the Linux task_struct. Since this is internal to our compatibility
+ * layer, we make it an opaque type.
+ *
+ * XXX: If the descriptor changes under us and we do not do a getf() between
+ * the change and using it, we would get an incorrect reference.
+ */
+
+struct uf_info;
+typedef struct uf_info uf_info_t;
+
+#define P_FINFO(x) ((uf_info_t *)x)
+
+#endif /* _SPL_USER_H */
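Editorial note: a sketch of the intended getf()/areleasef() pairing; both map to the vn_* implementations declared in vnode.h below, and current is the usual Linux task pointer:

    static void
    fd_example(int fd)
    {
            file_t *fp = getf(fd);  /* takes a reference */

            if (fp != NULL) {
                    /* ... use fp->f_vnode ... */
                    areleasef(fd, P_FINFO(current));
            }
    }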
diff --git a/include/spl/sys/vfs.h b/include/spl/sys/vfs.h
new file mode 100644
index 000000000..0d5e1d51d
--- /dev/null
+++ b/include/spl/sys/vfs.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _SPL_ZFS_H
+#define _SPL_ZFS_H
+
+#include <linux/mount.h>
+#include <linux/fs.h>
+#include <linux/dcache.h>
+#include <linux/statfs.h>
+#include <linux/xattr.h>
+#include <linux/security.h>
+#include <linux/seq_file.h>
+
+#define MAXFIDSZ 64
+
+typedef struct spl_fid {
+ union {
+ long fid_pad;
+ struct {
+ ushort_t len; /* length of data in bytes */
+ char data[MAXFIDSZ]; /* data (variable len) */
+ } _fid;
+ } un;
+} fid_t;
+
+#define fid_len un._fid.len
+#define fid_data un._fid.data
+
+#endif /* _SPL_ZFS_H */
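Editorial note: illustrative only, packing a 64-bit object number into the fid_t defined above (memcpy per linux/string.h; the layout is an assumption, not part of this header):

    static void
    fid_pack(fid_t *fidp, uint64_t obj)
    {
            fidp->fid_len = sizeof (obj);
            memcpy(fidp->fid_data, &obj, sizeof (obj));
    }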
diff --git a/include/spl/sys/vmem.h b/include/spl/sys/vmem.h
new file mode 100644
index 000000000..a9b12eeb9
--- /dev/null
+++ b/include/spl/sys/vmem.h
@@ -0,0 +1,109 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _SPL_VMEM_H
+#define _SPL_VMEM_H
+
+#include <sys/kmem.h>
+#include <linux/sched.h>
+#include <linux/vmalloc.h>
+
+typedef struct vmem { } vmem_t;
+
+extern vmem_t *heap_arena;
+extern vmem_t *zio_alloc_arena;
+extern vmem_t *zio_arena;
+
+extern size_t vmem_size(vmem_t *vmp, int typemask);
+
+/*
+ * Memory allocation interfaces
+ */
+#define VMEM_ALLOC 0x01
+#define VMEM_FREE 0x02
+
+#ifndef VMALLOC_TOTAL
+#define VMALLOC_TOTAL (VMALLOC_END - VMALLOC_START)
+#endif
+
+/*
+ * vmem_* is an interface to a low level arena-based memory allocator on
+ * Illumos that is used to allocate virtual address space. The kmem SLAB
+ * allocator allocates slabs from it. Then the generic allocation functions
+ * kmem_{alloc,zalloc,free}() are layered on top of SLAB allocators.
+ *
+ * On Linux, the primary means of doing allocations is via kmalloc(), which
+ * is similarly layered on top of something called the buddy allocator. The
+ * buddy allocator is not available to kernel modules; it uses physical
+ * memory addresses rather than virtual memory addresses and is prone to
+ * fragmentation.
+ *
+ * Linux sets aside a relatively small address space for in-kernel virtual
+ * memory from which allocations can be done using vmalloc(). It might seem
+ * like a good idea to use vmalloc() to implement something similar to
+ * Illumos' allocator. However, this has the following problems:
+ *
+ * 1. Page directory table allocations are hard coded to use GFP_KERNEL.
+ * Consequently, any KM_PUSHPAGE or KM_NOSLEEP allocations done using
+ * vmalloc() will not have proper semantics.
+ *
+ * 2. Address space exhaustion is a real issue on 32-bit platforms where
+ *    only a few hundred MB are available. The kernel will handle it by spinning
+ * when it runs out of address space.
+ *
+ * 3. All vmalloc() allocations and frees are protected by a single global
+ * lock which serializes all allocations.
+ *
+ * 4. Accessing /proc/meminfo and /proc/vmallocinfo will iterate the entire
+ * list. The former will sum the allocations while the latter will print
+ * them to user space in a way that user space can keep the lock held
+ * indefinitely. When the total number of mapped allocations is large
+ *    (several hundred thousand), much time will be spent waiting on locks.
+ *
+ * 5. Linux has a wait_on_bit() locking primitive that assumes physical
+ *    memory is used; it simply does not work on virtual memory. Certain
+ * Linux structures (e.g. the superblock) use them and might be embedded
+ * into a structure from Illumos. This makes using Linux virtual memory
+ * unsafe in certain situations.
+ *
+ * It follows that we cannot obtain identical semantics to those on Illumos.
+ * Consequently, we implement the kmem_{alloc,zalloc,free}() functions in
+ * such a way that they can be used as drop-in replacements for small vmem_*
+ * allocations (8MB in size or smaller) and map vmem_{alloc,zalloc,free}()
+ * to them.
+ */
+
+#define vmem_alloc(sz, fl) spl_vmem_alloc((sz), (fl), __func__, __LINE__)
+#define vmem_zalloc(sz, fl) spl_vmem_zalloc((sz), (fl), __func__, __LINE__)
+#define vmem_free(ptr, sz) spl_vmem_free((ptr), (sz))
+#define vmem_qcache_reap(ptr) ((void)0)
+
+extern void *spl_vmem_alloc(size_t sz, int fl, const char *func, int line);
+extern void *spl_vmem_zalloc(size_t sz, int fl, const char *func, int line);
+extern void spl_vmem_free(const void *ptr, size_t sz);
+
+int spl_vmem_init(void);
+void spl_vmem_fini(void);
+
+#endif /* _SPL_VMEM_H */
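Editorial note: per the mapping described in the comment above, the vmem_* calls are drop-in kmem replacements for allocations of 8MB or less. A minimal sketch (the 1MB size is illustrative):

    static void
    vmem_example(void)
    {
            size_t size = 1024 * 1024;      /* too large for kmem_alloc() */
            void *buf = vmem_zalloc(size, KM_SLEEP);

            /* ... use buf ... */
            vmem_free(buf, size);
    }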
diff --git a/include/spl/sys/vmsystm.h b/include/spl/sys/vmsystm.h
new file mode 100644
index 000000000..2b48fe0e3
--- /dev/null
+++ b/include/spl/sys/vmsystm.h
@@ -0,0 +1,84 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _SPL_VMSYSTM_H
+#define _SPL_VMSYSTM_H
+
+#include <linux/mmzone.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/highmem.h>
+#include <linux/vmalloc.h>
+#include <sys/types.h>
+#include <asm/uaccess.h>
+
+#define membar_producer() smp_wmb()
+#define physmem totalram_pages
+#define freemem (nr_free_pages() + \
+ global_page_state(NR_INACTIVE_FILE) + \
+ global_page_state(NR_INACTIVE_ANON) + \
+ global_page_state(NR_SLAB_RECLAIMABLE))
+
+#define xcopyin(from, to, size) copy_from_user(to, from, size)
+#define xcopyout(from, to, size) copy_to_user(to, from, size)
+
+static __inline__ int
+copyin(const void *from, void *to, size_t len)
+{
+ /* On error copyin routine returns -1 */
+ if (xcopyin(from, to, len))
+ return (-1);
+
+ return (0);
+}
+
+static __inline__ int
+copyout(const void *from, void *to, size_t len)
+{
+ /* On error copyout routine returns -1 */
+ if (xcopyout(from, to, len))
+ return (-1);
+
+ return (0);
+}
+
+static __inline__ int
+copyinstr(const void *from, void *to, size_t len, size_t *done)
+{
+ size_t rc;
+
+ if (len == 0)
+ return (-ENAMETOOLONG);
+
+ /* XXX: Should return ENAMETOOLONG if 'strlen(from) > len' */
+
+ memset(to, 0, len);
+ rc = copyin(from, to, len - 1);
+ if (done != NULL)
+ *done = rc;
+
+ return (0);
+}
+
+#endif /* _SPL_VMSYSTM_H */
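Editorial note: a sketch of the Solaris-style error convention above, where copyin() collapses the Linux "bytes not copied" return into 0 or -1; struct my_args and fetch_args() are hypothetical:

    struct my_args { int op; int flags; };  /* hypothetical */

    static int
    fetch_args(const void *uarg, struct my_args *kargs)
    {
            /* copyin() returns 0 on success, -1 on fault (see above) */
            if (copyin(uarg, kargs, sizeof (*kargs)) != 0)
                    return (EFAULT);

            return (0);
    }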
diff --git a/include/spl/sys/vnode.h b/include/spl/sys/vnode.h
new file mode 100644
index 000000000..a3f7828e7
--- /dev/null
+++ b/include/spl/sys/vnode.h
@@ -0,0 +1,204 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _SPL_VNODE_H
+#define _SPL_VNODE_H
+
+#include <linux/module.h>
+#include <linux/syscalls.h>
+#include <linux/fcntl.h>
+#include <linux/buffer_head.h>
+#include <linux/dcache.h>
+#include <linux/namei.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/fs_struct.h>
+#include <linux/mount.h>
+#include <sys/kmem.h>
+#include <sys/mutex.h>
+#include <sys/types.h>
+#include <sys/time.h>
+#include <sys/uio.h>
+#include <sys/user.h>
+
+/*
+ * Prior to linux-2.6.33 only O_DSYNC semantics were implemented and
+ * they used the O_SYNC flag. As of linux-2.6.33 this behavior
+ * was properly split into O_SYNC and O_DSYNC respectively.
+ */
+#ifndef O_DSYNC
+#define O_DSYNC O_SYNC
+#endif
+
+#define FREAD 1
+#define FWRITE 2
+#define FCREAT O_CREAT
+#define FTRUNC O_TRUNC
+#define FOFFMAX O_LARGEFILE
+#define FSYNC O_SYNC
+#define FDSYNC O_DSYNC
+#define FRSYNC O_SYNC
+#define FEXCL O_EXCL
+#define FDIRECT O_DIRECT
+#define FAPPEND O_APPEND
+
+#define FNODSYNC 0x10000 /* fsync pseudo flag */
+#define FNOFOLLOW 0x20000 /* don't follow symlinks */
+
+#define F_FREESP 11 /* Free file space */
+
+
+/*
+ * The vnode AT_ flags are mapped to the Linux ATTR_* flags.
+ * This allows them to be used safely with an iattr structure.
+ * The AT_XVATTR flag has been added and mapped to the upper
+ * bit range to avoid conflicting with the standard Linux set.
+ */
+#undef AT_UID
+#undef AT_GID
+
+#define AT_MODE ATTR_MODE
+#define AT_UID ATTR_UID
+#define AT_GID ATTR_GID
+#define AT_SIZE ATTR_SIZE
+#define AT_ATIME ATTR_ATIME
+#define AT_MTIME ATTR_MTIME
+#define AT_CTIME ATTR_CTIME
+
+#define ATTR_XVATTR (1 << 31)
+#define AT_XVATTR ATTR_XVATTR
+
+#define ATTR_IATTR_MASK (ATTR_MODE | ATTR_UID | ATTR_GID | ATTR_SIZE | \
+ ATTR_ATIME | ATTR_MTIME | ATTR_CTIME | ATTR_FILE)
+
+#define CRCREAT 0x01
+#define RMFILE 0x02
+
+#define B_INVAL 0x01
+#define B_TRUNC 0x02
+
+#define LOOKUP_DIR 0x01
+#define LOOKUP_XATTR 0x02
+#define CREATE_XATTR_DIR 0x04
+#define ATTR_NOACLCHECK 0x20
+
+typedef enum vtype {
+ VNON = 0,
+ VREG = 1,
+ VDIR = 2,
+ VBLK = 3,
+ VCHR = 4,
+ VLNK = 5,
+ VFIFO = 6,
+ VDOOR = 7,
+ VPROC = 8,
+ VSOCK = 9,
+ VPORT = 10,
+ VBAD = 11
+} vtype_t;
+
+typedef struct vattr {
+ enum vtype va_type; /* vnode type */
+ uint_t va_mask; /* attribute bit-mask */
+ ushort_t va_mode; /* acc mode */
+ uid_t va_uid; /* owner uid */
+ gid_t va_gid; /* owner gid */
+ long va_fsid; /* fs id */
+ long va_nodeid; /* node # */
+ uint32_t va_nlink; /* # links */
+ uint64_t va_size; /* file size */
+ struct timespec va_atime; /* last acc */
+ struct timespec va_mtime; /* last mod */
+ struct timespec va_ctime; /* last chg */
+ dev_t va_rdev; /* dev */
+ uint64_t va_nblocks; /* space used */
+ uint32_t va_blksize; /* block size */
+ uint32_t va_seq; /* sequence */
+ struct dentry *va_dentry; /* dentry to wire */
+} vattr_t;
+
+typedef struct vnode {
+ struct file *v_file;
+ kmutex_t v_lock; /* protects vnode fields */
+ uint_t v_flag; /* vnode flags (see below) */
+ uint_t v_count; /* reference count */
+ void *v_data; /* private data for fs */
+ struct vfs *v_vfsp; /* ptr to containing VFS */
+ struct stdata *v_stream; /* associated stream */
+ enum vtype v_type; /* vnode type */
+ dev_t v_rdev; /* device (VCHR, VBLK) */
+ gfp_t v_gfp_mask; /* original mapping gfp mask */
+} vnode_t;
+
+typedef struct vn_file {
+ int f_fd; /* linux fd for lookup */
+ struct task_struct *f_task; /* linux task this fd belongs to */
+ struct file *f_file; /* linux file struct */
+ atomic_t f_ref; /* ref count */
+ kmutex_t f_lock; /* struct lock */
+ loff_t f_offset; /* offset */
+ vnode_t *f_vnode; /* vnode */
+ struct list_head f_list; /* list referenced file_t's */
+} file_t;
+
+extern vnode_t *vn_alloc(int flag);
+void vn_free(vnode_t *vp);
+extern vtype_t vn_mode_to_vtype(mode_t);
+extern mode_t vn_vtype_to_mode(vtype_t);
+extern int vn_open(const char *path, uio_seg_t seg, int flags, int mode,
+ vnode_t **vpp, int x1, void *x2);
+extern int vn_openat(const char *path, uio_seg_t seg, int flags, int mode,
+ vnode_t **vpp, int x1, void *x2, vnode_t *vp, int fd);
+extern int vn_rdwr(uio_rw_t uio, vnode_t *vp, void *addr, ssize_t len,
+ offset_t off, uio_seg_t seg, int x1, rlim64_t x2,
+ void *x3, ssize_t *residp);
+extern int vn_close(vnode_t *vp, int flags, int x1, int x2, void *x3, void *x4);
+extern int vn_seek(vnode_t *vp, offset_t o, offset_t *op, void *ct);
+
+extern int vn_getattr(vnode_t *vp, vattr_t *vap, int flags, void *x3, void *x4);
+extern int vn_fsync(vnode_t *vp, int flags, void *x3, void *x4);
+extern int vn_space(vnode_t *vp, int cmd, struct flock *bfp, int flag,
+ offset_t offset, void *x6, void *x7);
+extern file_t *vn_getf(int fd);
+extern void vn_releasef(int fd);
+extern void vn_areleasef(int fd, uf_info_t *fip);
+extern int vn_set_pwd(const char *filename);
+
+int spl_vn_init(void);
+void spl_vn_fini(void);
+
+#define VOP_CLOSE vn_close
+#define VOP_SEEK vn_seek
+#define VOP_GETATTR vn_getattr
+#define VOP_FSYNC vn_fsync
+#define VOP_SPACE vn_space
+#define VOP_PUTPAGE(vp, o, s, f, x1, x2) ((void)0)
+#define vn_is_readonly(vp) 0
+#define getf vn_getf
+#define releasef vn_releasef
+#define areleasef vn_areleasef
+
+extern vnode_t *rootdir;
+
+#endif /* _SPL_VNODE_H */
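Editorial note: a minimal sketch of the vnode file API declared above, reading the first 512 bytes of a file; error handling is trimmed and the path and size are illustrative:

    static int
    read_header(const char *path, char *buf)
    {
            vnode_t *vp;
            ssize_t resid;
            int rc;

            rc = vn_open(path, UIO_SYSSPACE, FREAD, 0, &vp, 0, NULL);
            if (rc)
                    return (rc);

            rc = vn_rdwr(UIO_READ, vp, buf, 512, 0, UIO_SYSSPACE,
                0, 0, NULL, &resid);

            (void) vn_close(vp, FREAD, 0, 0, NULL, NULL);
            return (rc);
    }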
diff --git a/include/spl/sys/wait.h b/include/spl/sys/wait.h
new file mode 100644
index 000000000..5311ff8b9
--- /dev/null
+++ b/include/spl/sys/wait.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright (C) 2007-2014 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _SPL_WAIT_H
+#define _SPL_WAIT_H
+
+#include <linux/sched.h>
+#include <linux/wait.h>
+
+#ifndef HAVE_WAIT_ON_BIT_ACTION
+#define spl_wait_on_bit(word, bit, mode) wait_on_bit(word, bit, mode)
+#else
+
+static inline int
+spl_bit_wait(void *word)
+{
+ schedule();
+ return (0);
+}
+
+#define spl_wait_on_bit(word, bit, mode) \
+ wait_on_bit(word, bit, spl_bit_wait, mode)
+
+#endif /* HAVE_WAIT_ON_BIT_ACTION */
+
+#ifdef HAVE_WAIT_QUEUE_ENTRY_T
+typedef wait_queue_head_t spl_wait_queue_head_t;
+typedef wait_queue_entry_t spl_wait_queue_entry_t;
+#else
+typedef wait_queue_head_t spl_wait_queue_head_t;
+typedef wait_queue_t spl_wait_queue_entry_t;
+#endif
+
+#endif /* _SPL_WAIT_H */
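Editorial note: a sketch showing spl_wait_on_bit() used uniformly across kernels with and without the wait_on_bit() action callback; the flags word and bit are hypothetical:

    static unsigned long flags;     /* bit 0 set while busy */

    static void
    wait_until_idle(void)
    {
            (void) spl_wait_on_bit(&flags, 0, TASK_UNINTERRUPTIBLE);
    }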
diff --git a/include/spl/sys/zmod.h b/include/spl/sys/zmod.h
new file mode 100644
index 000000000..95c1a3ed7
--- /dev/null
+++ b/include/spl/sys/zmod.h
@@ -0,0 +1,78 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ *
+ *
+ * z_compress_level/z_uncompress are nearly identical copies of the
+ * compress2/uncompress functions provided by the official zlib package
+ * available at http://zlib.net/. The only changes made were to slightly
+ * adapt the functions called to match the Linux kernel implementation
+ * of zlib. The full zlib license follows:
+ *
+ * zlib.h -- interface of the 'zlib' general purpose compression library
+ * version 1.2.5, April 19th, 2010
+ *
+ * Copyright (C) 1995-2010 Jean-loup Gailly and Mark Adler
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ *
+ * Jean-loup Gailly
+ * Mark Adler
+ */
+
+#ifndef _SPL_ZMOD_H
+#define _SPL_ZMOD_H
+
+#include <sys/types.h>
+#include <linux/zlib.h>
+
+#ifdef HAVE_2ARGS_ZLIB_DEFLATE_WORKSPACESIZE
+#define spl_zlib_deflate_workspacesize(wb, ml) \
+ zlib_deflate_workspacesize(wb, ml)
+#else
+#define spl_zlib_deflate_workspacesize(wb, ml) \
+ zlib_deflate_workspacesize()
+#endif /* HAVE_2ARGS_ZLIB_DEFLATE_WORKSPACESIZE */
+
+extern int z_compress_level(void *dest, size_t *destLen, const void *source,
+ size_t sourceLen, int level);
+extern int z_uncompress(void *dest, size_t *destLen, const void *source,
+ size_t sourceLen);
+
+int spl_zlib_init(void);
+void spl_zlib_fini(void);
+
+#endif /* _SPL_ZMOD_H */
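Editorial note: an illustrative round trip through the wrappers above. It assumes tmp and out are at least srclen bytes; z_compress_level() can fail (e.g. Z_BUF_ERROR) on incompressible input, so the return codes must be checked:

    static int
    zmod_roundtrip(void *src, size_t srclen, void *tmp, void *out)
    {
            size_t clen = srclen;
            size_t dlen = srclen;
            int rc;

            rc = z_compress_level(tmp, &clen, src, srclen, 6);
            if (rc != 0)
                    return (rc);

            return (z_uncompress(out, &dlen, tmp, clen));
    }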
diff --git a/include/spl/sys/zone.h b/include/spl/sys/zone.h
new file mode 100644
index 000000000..b2efd13b8
--- /dev/null
+++ b/include/spl/sys/zone.h
@@ -0,0 +1,36 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _SPL_ZONE_H
+#define _SPL_ZONE_H
+
+#include <sys/byteorder.h>
+
+#define GLOBAL_ZONEID 0
+
+#define zone_dataset_visible(x, y) (1)
+#define crgetzoneid(x) (GLOBAL_ZONEID)
+#define INGLOBALZONE(z) (1)
+
+#endif /* _SPL_ZONE_H */
diff --git a/man/man5/spl-module-parameters.5 b/man/man5/spl-module-parameters.5
new file mode 100644
index 000000000..30d9fc754
--- /dev/null
+++ b/man/man5/spl-module-parameters.5
@@ -0,0 +1,357 @@
+'\" te
+.\"
+.\" Copyright 2013 Turbo Fredriksson <turbo@bayour.com>. All rights reserved.
+.\"
+.TH SPL-MODULE-PARAMETERS 5 "Oct 28, 2017"
+.SH NAME
+spl\-module\-parameters \- SPL module parameters
+.SH DESCRIPTION
+.sp
+.LP
+Description of the different parameters to the SPL module.
+
+.SS "Module parameters"
+.sp
+.LP
+
+.sp
+.ne 2
+.na
+\fBspl_kmem_cache_expire\fR (uint)
+.ad
+.RS 12n
+Cache expiration is part of the default Illumos cache behavior. The idea is
+that objects in magazines which have not been recently accessed should be
+returned to the slabs periodically. This is known as cache aging; when
+enabled, objects will typically be returned after 15 seconds.
+.sp
+On the other hand Linux slabs are designed to never move objects back to
+the slabs unless there is memory pressure. This is possible because under
+Linux the cache will be notified when memory is low and objects can be
+released.
+.sp
+By default only the Linux method is enabled. It has been shown to improve
+responsiveness on low memory systems and not negatively impact the performance
+of systems with more memory. This policy may be changed by setting the
+\fBspl_kmem_cache_expire\fR bit mask as follows; both policies may be enabled
+concurrently.
+.sp
+0x01 - Aging (Illumos), 0x02 - Low memory (Linux)
+.sp
+Default value: \fB0x02\fR
+.RE
+
+.sp
+.ne 2
+.na
+\fBspl_kmem_cache_kmem_threads\fR (uint)
+.ad
+.RS 12n
+The number of threads created for the spl_kmem_cache task queue. This task
+queue is responsible for allocating new slabs for use by the kmem caches.
+For the majority of systems and workloads only a small number of threads are
+required.
+.sp
+Default value: \fB4\fR
+.RE
+
+.sp
+.ne 2
+.na
+\fBspl_kmem_cache_reclaim\fR (uint)
+.ad
+.RS 12n
+When this is set it prevents Linux from being able to rapidly reclaim all the
+memory held by the kmem caches. This may be useful in circumstances where
+it's preferable that Linux reclaim memory from some other subsystem first.
+Setting this will increase the likelihood of out-of-memory events on a
+memory-constrained system.
+.sp
+Default value: \fB0\fR
+.RE
+
+.sp
+.ne 2
+.na
+\fBspl_kmem_cache_obj_per_slab\fR (uint)
+.ad
+.RS 12n
+The preferred number of objects per slab in the cache. In general, a larger
+value will increase the cache's memory footprint while decreasing the time
+required to perform an allocation. Conversely, a smaller value will minimize
+the footprint and improve cache reclaim time but individual allocations may
+take longer.
+.sp
+Default value: \fB8\fR
+.RE
+
+.sp
+.ne 2
+.na
+\fBspl_kmem_cache_obj_per_slab_min\fR (uint)
+.ad
+.RS 12n
+The minimum number of objects allowed per slab. Normally slabs will contain
+\fBspl_kmem_cache_obj_per_slab\fR objects but for caches that contain very
+large objects it's desirable to only have a few, or even just one, object per
+slab.
+.sp
+Default value: \fB1\fR
+.RE
+
+.sp
+.ne 2
+.na
+\fBspl_kmem_cache_max_size\fR (uint)
+.ad
+.RS 12n
+The maximum size of a kmem cache slab in MiB. This effectively limits
+the maximum cache object size to \fBspl_kmem_cache_max_size\fR /
+\fBspl_kmem_cache_obj_per_slab\fR. Caches may not be created with
+objects sized larger than this limit.
+.sp
+Default value: \fB32 (64-bit) or 4 (32-bit)\fR
+.RE
+
+.sp
+.ne 2
+.na
+\fBspl_kmem_cache_slab_limit\fR (uint)
+.ad
+.RS 12n
+For small objects the Linux slab allocator should be used to make the most
+efficient use of the memory. However, large objects are not supported by
+the Linux slab and therefore the SPL implementation is preferred. This
+value is used to determine the cutoff between a small and large object.
+.sp
+Objects of \fBspl_kmem_cache_slab_limit\fR or smaller will be allocated
+using the Linux slab allocator; larger objects use the SPL allocator. A
+cutoff of 16K was determined to be optimal for architectures using 4K pages.
+.sp
+Default value: \fB16,384\fR
+.RE
+
+.sp
+.ne 2
+.na
+\fBspl_kmem_cache_kmem_limit\fR (uint)
+.ad
+.RS 12n
+Depending on the size of a cache object it may be backed by kmalloc()'d
+or vmalloc()'d memory. This is because the size of the required allocation
+greatly impacts the best way to allocate the memory.
+.sp
+When objects are small and only a small number of memory pages need to be
+allocated, ideally just one, then kmalloc() is very efficient. However,
+when allocating multiple pages with kmalloc() it gets increasingly expensive
+because the pages must be physically contiguous.
+.sp
+For this reason we shift to vmalloc() for slabs of large objects, which
+removes the need for contiguous pages. We cannot use vmalloc() in
+all cases because there is significant locking overhead involved. That
+function takes a single global lock over the entire virtual address range,
+which serializes all allocations. Using slightly different allocation
+functions for small and large objects allows us to handle a wide range of
+object sizes.
+.sp
+The \fBspl_kmem_cache_kmem_limit\fR value is used to determine this cutoff
+size. One quarter of PAGE_SIZE is used as the default value because
+\fBspl_kmem_cache_obj_per_slab\fR defaults to 8. This means that at
+most we will need to allocate two contiguous pages.
+.sp
+Default value: \fBPAGE_SIZE/4\fR
+.RE
+
+.sp
+.ne 2
+.na
+\fBspl_kmem_alloc_warn\fR (uint)
+.ad
+.RS 12n
+As a general rule kmem_alloc() allocations should be small, preferably
+just a few pages since they must be physically contiguous. Therefore, a
+rate limited warning will be printed to the console for any kmem_alloc()
+which exceeds a reasonable threshold.
+.sp
+The default warning threshold is set to eight pages but capped at 32K to
+accommodate systems using large pages. This value was selected to be small
+enough to ensure the largest allocations are quickly noticed and fixed,
+but large enough to avoid logging any warnings when an allocation size is
+larger than optimal but not a serious concern. Since this value is tunable,
+developers are encouraged to set it lower when testing so any new largish
+allocations are quickly caught. These warnings may be disabled by setting
+the threshold to zero.
+.sp
+Default value: \fB32,768\fR
+.RE
+
+.sp
+.ne 2
+.na
+\fBspl_kmem_alloc_max\fR (uint)
+.ad
+.RS 12n
+Large kmem_alloc() allocations will fail if they exceed KMALLOC_MAX_SIZE.
+Allocations which are marginally smaller than this limit may succeed but
+should still be avoided due to the expense of locating a contiguous range
+of free pages. Therefore, a maximum kmem size with a reasonable safety
+margin of 4x is set. kmem_alloc() allocations larger than this maximum
+will quickly fail. vmem_alloc() allocations less than or equal to this
+value will use kmalloc(), but shift to vmalloc() when exceeding this value.
+.sp
+Default value: \fBKMALLOC_MAX_SIZE/4\fR
+.RE
+
+.sp
+.ne 2
+.na
+\fBspl_kmem_cache_magazine_size\fR (uint)
+.ad
+.RS 12n
+Cache magazines are an optimization designed to minimize the cost of
+allocating memory. They do this by keeping a per-cpu cache of recently
+freed objects, which can then be reallocated without taking a lock. This
+can improve performance on highly contended caches. However, because
+objects in magazines will prevent otherwise empty slabs from being
+immediately released this may not be ideal for low memory machines.
+.sp
+For this reason \fBspl_kmem_cache_magazine_size\fR can be used to set a
+maximum magazine size. When this value is set to 0 the magazine size will
+be automatically determined based on the object size. Otherwise magazines
+will be limited to 2-256 objects per magazine (i.e., per CPU). Magazines
+may never be entirely disabled in this implementation.
+.sp
+Default value: \fB0\fR
+.RE
+
+.sp
+.ne 2
+.na
+\fBspl_hostid\fR (ulong)
+.ad
+.RS 12n
+The system hostid; when set this can be used to uniquely identify a system.
+By default this value is set to zero, which indicates the hostid is disabled.
+It can be explicitly enabled by placing a unique non-zero value in
+\fB/etc/hostid\fR.
+.sp
+Default value: \fB0\fR
+.RE
+
+.sp
+.ne 2
+.na
+\fBspl_hostid_path\fR (charp)
+.ad
+.RS 12n
+The expected path to locate the system hostid when specified. This value
+may be overridden for non-standard configurations.
+.sp
+Default value: \fB/etc/hostid\fR
+.RE
+
+.sp
+.ne 2
+.na
+\fBspl_panic_halt\fR (uint)
+.ad
+.RS 12n
+Cause a kernel panic on assertion failures. When not enabled, the thread is
+halted to facilitate further debugging.
+.sp
+Set to a non-zero value to enable.
+.sp
+Default value: \fB0\fR
+.RE
+
+.sp
+.ne 2
+.na
+\fBspl_taskq_kick\fR (uint)
+.ad
+.RS 12n
+Kick stuck taskqs to spawn threads. Writing a non-zero value will scan all
+the taskqs; any taskq with a pending task more than 5 seconds old is kicked
+to spawn more threads. This can be used if a rare deadlock occurs because
+one or more taskqs failed to spawn a thread when it should have.
+.sp
+Default value: \fB0\fR
+.RE
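Editorial note: as a usage sketch, this parameter can be poked through the standard module parameter interface (path assumed):

    # echo 1 > /sys/module/spl/parameters/spl_taskq_kick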
+
+.sp
+.ne 2
+.na
+\fBspl_taskq_thread_bind\fR (int)
+.ad
+.RS 12n
+Bind taskq threads to specific CPUs. When enabled all taskq threads will
+be distributed evenly over the available CPUs. By default, this behavior
+is disabled to allow the Linux scheduler the maximum flexibility to determine
+where a thread should run.
+.sp
+Default value: \fB0\fR
+.RE
+
+.sp
+.ne 2
+.na
+\fBspl_taskq_thread_dynamic\fR (int)
+.ad
+.RS 12n
+Allow dynamic taskqs. When enabled taskqs which set the TASKQ_DYNAMIC flag
+will by default create only a single thread. New threads will be created on
+demand up to a maximum allowed number to facilitate the completion of
+outstanding tasks. Threads which are no longer needed will be promptly
+destroyed. By default this behavior is enabled but it can be disabled to
+aid performance analysis or troubleshooting.
+.sp
+Default value: \fB1\fR
+.RE
+
+.sp
+.ne 2
+.na
+\fBspl_taskq_thread_priority\fR (int)
+.ad
+.RS 12n
+Allow newly created taskq threads to set a non-default scheduler priority.
+When enabled the priority specified when a taskq is created will be applied
+to all threads created by that taskq. When disabled all threads will use
+the default Linux kernel thread priority. By default, this behavior is
+enabled.
+.sp
+Default value: \fB1\fR
+.RE
+
+.sp
+.ne 2
+.na
+\fBspl_taskq_thread_sequential\fR (int)
+.ad
+.RS 12n
+The number of items a taskq worker thread must handle without interruption
+before requesting a new worker thread be spawned. This is used to control
+how quickly taskqs ramp up the number of threads processing the queue.
+Because Linux thread creation and destruction are relatively inexpensive, a
+small default value has been selected. This means that normally threads will
+be created aggressively which is desirable. Increasing this value will
+result in a slower thread creation rate which may be preferable for some
+configurations.
+.sp
+Default value: \fB4\fR
+.RE
+
+.sp
+.ne 2
+.na
+\fBspl_max_show_tasks\fR (uint)
+.ad
+.RS 12n
+The maximum number of tasks per pending list in each taskq shown in
+/proc/spl/{taskq,taskq-all}. Write 0 to turn off the limit. The proc file
+walks the lists with the lock held, so reading it could cause a lockup if a
+list grows too large without a limit on the output. "(truncated)" will be
+shown if the list is larger than the limit.
+.sp
+Default value: \fB512\fR
+.RE
diff --git a/module/spl/THIRDPARTYLICENSE.gplv2 b/module/spl/THIRDPARTYLICENSE.gplv2
new file mode 100644
index 000000000..d159169d1
--- /dev/null
+++ b/module/spl/THIRDPARTYLICENSE.gplv2
@@ -0,0 +1,339 @@
+ GNU GENERAL PUBLIC LICENSE
+ Version 2, June 1991
+
+ Copyright (C) 1989, 1991 Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+ Preamble
+
+ The licenses for most software are designed to take away your
+freedom to share and change it. By contrast, the GNU General Public
+License is intended to guarantee your freedom to share and change free
+software--to make sure the software is free for all its users. This
+General Public License applies to most of the Free Software
+Foundation's software and to any other program whose authors commit to
+using it. (Some other Free Software Foundation software is covered by
+the GNU Lesser General Public License instead.) You can apply it to
+your programs, too.
+
+ When we speak of free software, we are referring to freedom, not
+price. Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+this service if you wish), that you receive source code or can get it
+if you want it, that you can change the software or use pieces of it
+in new free programs; and that you know you can do these things.
+
+ To protect your rights, we need to make restrictions that forbid
+anyone to deny you these rights or to ask you to surrender the rights.
+These restrictions translate to certain responsibilities for you if you
+distribute copies of the software, or if you modify it.
+
+ For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must give the recipients all the rights that
+you have. You must make sure that they, too, receive or can get the
+source code. And you must show them these terms so they know their
+rights.
+
+ We protect your rights with two steps: (1) copyright the software, and
+(2) offer you this license which gives you legal permission to copy,
+distribute and/or modify the software.
+
+ Also, for each author's protection and ours, we want to make certain
+that everyone understands that there is no warranty for this free
+software. If the software is modified by someone else and passed on, we
+want its recipients to know that what they have is not the original, so
+that any problems introduced by others will not reflect on the original
+authors' reputations.
+
+ Finally, any free program is threatened constantly by software
+patents. We wish to avoid the danger that redistributors of a free
+program will individually obtain patent licenses, in effect making the
+program proprietary. To prevent this, we have made it clear that any
+patent must be licensed for everyone's free use or not licensed at all.
+
+ The precise terms and conditions for copying, distribution and
+modification follow.
+
+ GNU GENERAL PUBLIC LICENSE
+ TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+ 0. This License applies to any program or other work which contains
+a notice placed by the copyright holder saying it may be distributed
+under the terms of this General Public License. The "Program", below,
+refers to any such program or work, and a "work based on the Program"
+means either the Program or any derivative work under copyright law:
+that is to say, a work containing the Program or a portion of it,
+either verbatim or with modifications and/or translated into another
+language. (Hereinafter, translation is included without limitation in
+the term "modification".) Each licensee is addressed as "you".
+
+Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope. The act of
+running the Program is not restricted, and the output from the Program
+is covered only if its contents constitute a work based on the
+Program (independent of having been made by running the Program).
+Whether that is true depends on what the Program does.
+
+ 1. You may copy and distribute verbatim copies of the Program's
+source code as you receive it, in any medium, provided that you
+conspicuously and appropriately publish on each copy an appropriate
+copyright notice and disclaimer of warranty; keep intact all the
+notices that refer to this License and to the absence of any warranty;
+and give any other recipients of the Program a copy of this License
+along with the Program.
+
+You may charge a fee for the physical act of transferring a copy, and
+you may at your option offer warranty protection in exchange for a fee.
+
+ 2. You may modify your copy or copies of the Program or any portion
+of it, thus forming a work based on the Program, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+ a) You must cause the modified files to carry prominent notices
+ stating that you changed the files and the date of any change.
+
+ b) You must cause any work that you distribute or publish, that in
+ whole or in part contains or is derived from the Program or any
+ part thereof, to be licensed as a whole at no charge to all third
+ parties under the terms of this License.
+
+ c) If the modified program normally reads commands interactively
+ when run, you must cause it, when started running for such
+ interactive use in the most ordinary way, to print or display an
+ announcement including an appropriate copyright notice and a
+ notice that there is no warranty (or else, saying that you provide
+ a warranty) and that users may redistribute the program under
+ these conditions, and telling the user how to view a copy of this
+ License. (Exception: if the Program itself is interactive but
+ does not normally print such an announcement, your work based on
+ the Program is not required to print an announcement.)
+
+These requirements apply to the modified work as a whole. If
+identifiable sections of that work are not derived from the Program,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works. But when you
+distribute the same sections as part of a whole which is a work based
+on the Program, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Program.
+
+In addition, mere aggregation of another work not based on the Program
+with the Program (or with a work based on the Program) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+ 3. You may copy and distribute the Program (or a work based on it,
+under Section 2) in object code or executable form under the terms of
+Sections 1 and 2 above provided that you also do one of the following:
+
+ a) Accompany it with the complete corresponding machine-readable
+ source code, which must be distributed under the terms of Sections
+ 1 and 2 above on a medium customarily used for software interchange; or,
+
+ b) Accompany it with a written offer, valid for at least three
+ years, to give any third party, for a charge no more than your
+ cost of physically performing source distribution, a complete
+ machine-readable copy of the corresponding source code, to be
+ distributed under the terms of Sections 1 and 2 above on a medium
+ customarily used for software interchange; or,
+
+ c) Accompany it with the information you received as to the offer
+ to distribute corresponding source code. (This alternative is
+ allowed only for noncommercial distribution and only if you
+ received the program in object code or executable form with such
+ an offer, in accord with Subsection b above.)
+
+The source code for a work means the preferred form of the work for
+making modifications to it. For an executable work, complete source
+code means all the source code for all modules it contains, plus any
+associated interface definition files, plus the scripts used to
+control compilation and installation of the executable. However, as a
+special exception, the source code distributed need not include
+anything that is normally distributed (in either source or binary
+form) with the major components (compiler, kernel, and so on) of the
+operating system on which the executable runs, unless that component
+itself accompanies the executable.
+
+If distribution of executable or object code is made by offering
+access to copy from a designated place, then offering equivalent
+access to copy the source code from the same place counts as
+distribution of the source code, even though third parties are not
+compelled to copy the source along with the object code.
+
+ 4. You may not copy, modify, sublicense, or distribute the Program
+except as expressly provided under this License. Any attempt
+otherwise to copy, modify, sublicense or distribute the Program is
+void, and will automatically terminate your rights under this License.
+However, parties who have received copies, or rights, from you under
+this License will not have their licenses terminated so long as such
+parties remain in full compliance.
+
+ 5. You are not required to accept this License, since you have not
+signed it. However, nothing else grants you permission to modify or
+distribute the Program or its derivative works. These actions are
+prohibited by law if you do not accept this License. Therefore, by
+modifying or distributing the Program (or any work based on the
+Program), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Program or works based on it.
+
+ 6. Each time you redistribute the Program (or any work based on the
+Program), the recipient automatically receives a license from the
+original licensor to copy, distribute or modify the Program subject to
+these terms and conditions. You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties to
+this License.
+
+ 7. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License. If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Program at all. For example, if a patent
+license would not permit royalty-free redistribution of the Program by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Program.
+
+If any portion of this section is held invalid or unenforceable under
+any particular circumstance, the balance of the section is intended to
+apply and the section as a whole is intended to apply in other
+circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system, which is
+implemented by public license practices. Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+
+ 8. If the distribution and/or use of the Program is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Program under this License
+may add an explicit geographical distribution limitation excluding
+those countries, so that distribution is permitted only in or among
+countries not thus excluded. In such case, this License incorporates
+the limitation as if written in the body of this License.
+
+ 9. The Free Software Foundation may publish revised and/or new versions
+of the General Public License from time to time. Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+Each version is given a distinguishing version number. If the Program
+specifies a version number of this License which applies to it and "any
+later version", you have the option of following the terms and conditions
+either of that version or of any later version published by the Free
+Software Foundation. If the Program does not specify a version number of
+this License, you may choose any version ever published by the Free Software
+Foundation.
+
+ 10. If you wish to incorporate parts of the Program into other free
+programs whose distribution conditions are different, write to the author
+to ask for permission. For software which is copyrighted by the Free
+Software Foundation, write to the Free Software Foundation; we sometimes
+make exceptions for this. Our decision will be guided by the two goals
+of preserving the free status of all derivatives of our free software and
+of promoting the sharing and reuse of software generally.
+
+ NO WARRANTY
+
+ 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
+FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
+OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
+PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
+OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS
+TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE
+PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
+REPAIR OR CORRECTION.
+
+ 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
+REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
+INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
+OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
+TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
+YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
+PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGES.
+
+ END OF TERMS AND CONDITIONS
+
+ How to Apply These Terms to Your New Programs
+
+ If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+ To do so, attach the following notices to the program. It is safest
+to attach them to the start of each source file to most effectively
+convey the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+ <one line to give the program's name and a brief idea of what it does.>
+ Copyright (C) <year> <name of author>
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License along
+ with this program; if not, write to the Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+Also add information on how to contact you by electronic and paper mail.
+
+If the program is interactive, make it output a short notice like this
+when it starts in an interactive mode:
+
+ Gnomovision version 69, Copyright (C) year name of author
+ Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+ This is free software, and you are welcome to redistribute it
+ under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License. Of course, the commands you use may
+be called something other than `show w' and `show c'; they could even be
+mouse-clicks or menu items--whatever suits your program.
+
+You should also get your employer (if you work as a programmer) or your
+school, if any, to sign a "copyright disclaimer" for the program, if
+necessary. Here is a sample; alter the names:
+
+ Yoyodyne, Inc., hereby disclaims all copyright interest in the program
+ `Gnomovision' (which makes passes at compilers) written by James Hacker.
+
+ <signature of Ty Coon>, 1 April 1989
+ Ty Coon, President of Vice
+
+This General Public License does not permit incorporating your program into
+proprietary programs. If your program is a subroutine library, you may
+consider it more useful to permit linking proprietary applications with the
+library. If this is what you want to do, use the GNU Lesser General
+Public License instead of this License.
diff --git a/module/spl/THIRDPARTYLICENSE.gplv2.descrip b/module/spl/THIRDPARTYLICENSE.gplv2.descrip
new file mode 100644
index 000000000..78535a8ee
--- /dev/null
+++ b/module/spl/THIRDPARTYLICENSE.gplv2.descrip
@@ -0,0 +1 @@
+COMPATIBILITY LAYER FOR OPENZFS ON LINUX
diff --git a/module/spl/spl-atomic.c b/module/spl/spl-atomic.c
new file mode 100644
index 000000000..47ed1886e
--- /dev/null
+++ b/module/spl/spl-atomic.c
@@ -0,0 +1,36 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Solaris Porting Layer (SPL) Atomic Implementation.
+ */
+
+#include <sys/atomic.h>
+
+#ifdef ATOMIC_SPINLOCK
+/* Global atomic lock declarations */
+DEFINE_SPINLOCK(atomic32_lock);
+DEFINE_SPINLOCK(atomic64_lock);
+
+EXPORT_SYMBOL(atomic32_lock);
+EXPORT_SYMBOL(atomic64_lock);
+#endif /* ATOMIC_SPINLOCK */
diff --git a/module/spl/spl-condvar.c b/module/spl/spl-condvar.c
new file mode 100644
index 000000000..f0060bbdc
--- /dev/null
+++ b/module/spl/spl-condvar.c
@@ -0,0 +1,410 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Solaris Porting Layer (SPL) Condition Variables Implementation.
+ */
+
+#include <sys/condvar.h>
+#include <sys/time.h>
+#include <linux/hrtimer.h>
+
+void
+__cv_init(kcondvar_t *cvp, char *name, kcv_type_t type, void *arg)
+{
+ ASSERT(cvp);
+ ASSERT(name == NULL);
+ ASSERT(type == CV_DEFAULT);
+ ASSERT(arg == NULL);
+
+ cvp->cv_magic = CV_MAGIC;
+ init_waitqueue_head(&cvp->cv_event);
+ init_waitqueue_head(&cvp->cv_destroy);
+ atomic_set(&cvp->cv_waiters, 0);
+ atomic_set(&cvp->cv_refs, 1);
+ cvp->cv_mutex = NULL;
+}
+EXPORT_SYMBOL(__cv_init);
+
+static int
+cv_destroy_wakeup(kcondvar_t *cvp)
+{
+ if (!atomic_read(&cvp->cv_waiters) && !atomic_read(&cvp->cv_refs)) {
+ ASSERT(cvp->cv_mutex == NULL);
+ ASSERT(!waitqueue_active(&cvp->cv_event));
+ return (1);
+ }
+
+ return (0);
+}
+
+void
+__cv_destroy(kcondvar_t *cvp)
+{
+ ASSERT(cvp);
+ ASSERT(cvp->cv_magic == CV_MAGIC);
+
+ cvp->cv_magic = CV_DESTROY;
+ atomic_dec(&cvp->cv_refs);
+
+ /* Block until all waiters are woken and references dropped. */
+ while (cv_destroy_wakeup(cvp) == 0)
+ wait_event_timeout(cvp->cv_destroy, cv_destroy_wakeup(cvp), 1);
+
+ ASSERT3P(cvp->cv_mutex, ==, NULL);
+ ASSERT3S(atomic_read(&cvp->cv_refs), ==, 0);
+ ASSERT3S(atomic_read(&cvp->cv_waiters), ==, 0);
+ ASSERT3S(waitqueue_active(&cvp->cv_event), ==, 0);
+}
+EXPORT_SYMBOL(__cv_destroy);
+
+static void
+cv_wait_common(kcondvar_t *cvp, kmutex_t *mp, int state, int io)
+{
+ DEFINE_WAIT(wait);
+ kmutex_t *m;
+
+ ASSERT(cvp);
+ ASSERT(mp);
+ ASSERT(cvp->cv_magic == CV_MAGIC);
+ ASSERT(mutex_owned(mp));
+ atomic_inc(&cvp->cv_refs);
+
+ m = ACCESS_ONCE(cvp->cv_mutex);
+ if (!m)
+ m = xchg(&cvp->cv_mutex, mp);
+ /* Ensure the same mutex is used by all callers */
+ ASSERT(m == NULL || m == mp);
+
+ prepare_to_wait_exclusive(&cvp->cv_event, &wait, state);
+ atomic_inc(&cvp->cv_waiters);
+
+ /*
+	 * The mutex must be dropped after prepare_to_wait(); this
+	 * ensures we're linked into the waiters list and avoids the
+ * race where 'cvp->cv_waiters > 0' but the list is empty.
+ */
+ mutex_exit(mp);
+ if (io)
+ io_schedule();
+ else
+ schedule();
+
+	/* No more waiters; a different mutex could now be used */
+ if (atomic_dec_and_test(&cvp->cv_waiters)) {
+ /*
+ * This is set without any lock, so it's racy. But this is
+ * just for debug anyway, so make it best-effort
+ */
+ cvp->cv_mutex = NULL;
+ wake_up(&cvp->cv_destroy);
+ }
+
+ finish_wait(&cvp->cv_event, &wait);
+ atomic_dec(&cvp->cv_refs);
+
+ /*
+	 * Reacquire the mutex after releasing the cvp; otherwise we could
+	 * deadlock with a thread holding the mutex and calling cv_destroy().
+ */
+ mutex_enter(mp);
+}
+
+void
+__cv_wait(kcondvar_t *cvp, kmutex_t *mp)
+{
+ cv_wait_common(cvp, mp, TASK_UNINTERRUPTIBLE, 0);
+}
+EXPORT_SYMBOL(__cv_wait);
+
+void
+__cv_wait_io(kcondvar_t *cvp, kmutex_t *mp)
+{
+ cv_wait_common(cvp, mp, TASK_UNINTERRUPTIBLE, 1);
+}
+EXPORT_SYMBOL(__cv_wait_io);
+
+void
+__cv_wait_sig(kcondvar_t *cvp, kmutex_t *mp)
+{
+ cv_wait_common(cvp, mp, TASK_INTERRUPTIBLE, 0);
+}
+EXPORT_SYMBOL(__cv_wait_sig);
+
+#if defined(HAVE_IO_SCHEDULE_TIMEOUT)
+#define spl_io_schedule_timeout(t) io_schedule_timeout(t)
+#else
+static void
+__cv_wakeup(unsigned long data)
+{
+ wake_up_process((struct task_struct *)data);
+}
+
+static long
+spl_io_schedule_timeout(long time_left)
+{
+ long expire_time = jiffies + time_left;
+ struct timer_list timer;
+
+ init_timer(&timer);
+ setup_timer(&timer, __cv_wakeup, (unsigned long)current);
+ timer.expires = expire_time;
+ add_timer(&timer);
+
+ io_schedule();
+
+ del_timer_sync(&timer);
+ time_left = expire_time - jiffies;
+
+ return (time_left < 0 ? 0 : time_left);
+}
+#endif
+
+/*
+ * 'expire_time' argument is an absolute wall clock time in jiffies.
+ * Return value is time left (expire_time - now) or -1 if timeout occurred.
+ */
+static clock_t
+__cv_timedwait_common(kcondvar_t *cvp, kmutex_t *mp, clock_t expire_time,
+ int state, int io)
+{
+ DEFINE_WAIT(wait);
+ kmutex_t *m;
+ clock_t time_left;
+
+ ASSERT(cvp);
+ ASSERT(mp);
+ ASSERT(cvp->cv_magic == CV_MAGIC);
+ ASSERT(mutex_owned(mp));
+
+	/* XXX - Does not handle jiffies wrap properly */
+ time_left = expire_time - jiffies;
+ if (time_left <= 0)
+ return (-1);
+
+ atomic_inc(&cvp->cv_refs);
+ m = ACCESS_ONCE(cvp->cv_mutex);
+ if (!m)
+ m = xchg(&cvp->cv_mutex, mp);
+ /* Ensure the same mutex is used by all callers */
+ ASSERT(m == NULL || m == mp);
+
+ prepare_to_wait_exclusive(&cvp->cv_event, &wait, state);
+ atomic_inc(&cvp->cv_waiters);
+
+ /*
+	 * The mutex must be dropped after prepare_to_wait(); this
+	 * ensures we're linked into the waiters list and avoids the
+ * race where 'cvp->cv_waiters > 0' but the list is empty.
+ */
+ mutex_exit(mp);
+ if (io)
+ time_left = spl_io_schedule_timeout(time_left);
+ else
+ time_left = schedule_timeout(time_left);
+
+	/* No more waiters; a different mutex could now be used */
+ if (atomic_dec_and_test(&cvp->cv_waiters)) {
+ /*
+ * This is set without any lock, so it's racy. But this is
+ * just for debug anyway, so make it best-effort
+ */
+ cvp->cv_mutex = NULL;
+ wake_up(&cvp->cv_destroy);
+ }
+
+ finish_wait(&cvp->cv_event, &wait);
+ atomic_dec(&cvp->cv_refs);
+
+ /*
+	 * Reacquire the mutex after releasing the cvp; otherwise we could
+	 * deadlock with a thread holding the mutex and calling cv_destroy().
+ */
+ mutex_enter(mp);
+ return (time_left > 0 ? time_left : -1);
+}
+
+clock_t
+__cv_timedwait(kcondvar_t *cvp, kmutex_t *mp, clock_t exp_time)
+{
+ return (__cv_timedwait_common(cvp, mp, exp_time,
+ TASK_UNINTERRUPTIBLE, 0));
+}
+EXPORT_SYMBOL(__cv_timedwait);
+
+clock_t
+__cv_timedwait_io(kcondvar_t *cvp, kmutex_t *mp, clock_t exp_time)
+{
+ return (__cv_timedwait_common(cvp, mp, exp_time,
+ TASK_UNINTERRUPTIBLE, 1));
+}
+EXPORT_SYMBOL(__cv_timedwait_io);
+
+clock_t
+__cv_timedwait_sig(kcondvar_t *cvp, kmutex_t *mp, clock_t exp_time)
+{
+ return (__cv_timedwait_common(cvp, mp, exp_time,
+ TASK_INTERRUPTIBLE, 0));
+}
+EXPORT_SYMBOL(__cv_timedwait_sig);
+
+/*
+ * 'expire_time' argument is an absolute clock time in nanoseconds.
+ * Return value is time left (expire_time - now) or -1 if timeout occurred.
+ */
+static clock_t
+__cv_timedwait_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t expire_time,
+ int state)
+{
+ DEFINE_WAIT(wait);
+ kmutex_t *m;
+ hrtime_t time_left;
+ ktime_t ktime_left;
+
+ ASSERT(cvp);
+ ASSERT(mp);
+ ASSERT(cvp->cv_magic == CV_MAGIC);
+ ASSERT(mutex_owned(mp));
+
+ time_left = expire_time - gethrtime();
+ if (time_left <= 0)
+ return (-1);
+
+ atomic_inc(&cvp->cv_refs);
+ m = ACCESS_ONCE(cvp->cv_mutex);
+ if (!m)
+ m = xchg(&cvp->cv_mutex, mp);
+ /* Ensure the same mutex is used by all callers */
+ ASSERT(m == NULL || m == mp);
+
+ prepare_to_wait_exclusive(&cvp->cv_event, &wait, state);
+ atomic_inc(&cvp->cv_waiters);
+
+ /*
+	 * The mutex must be dropped after prepare_to_wait(); this
+	 * ensures we're linked into the waiters list and avoids the
+ * race where 'cvp->cv_waiters > 0' but the list is empty.
+ */
+ mutex_exit(mp);
+ /*
+	 * Allow a 100 us slack range to give the kernel an opportunity
+	 * to coalesce timer interrupts.
+ */
+ ktime_left = ktime_set(0, time_left);
+ schedule_hrtimeout_range(&ktime_left, 100 * NSEC_PER_USEC,
+ HRTIMER_MODE_REL);
+
+	/* No more waiters; a different mutex could now be used */
+ if (atomic_dec_and_test(&cvp->cv_waiters)) {
+ /*
+ * This is set without any lock, so it's racy. But this is
+ * just for debug anyway, so make it best-effort
+ */
+ cvp->cv_mutex = NULL;
+ wake_up(&cvp->cv_destroy);
+ }
+
+ finish_wait(&cvp->cv_event, &wait);
+ atomic_dec(&cvp->cv_refs);
+
+ mutex_enter(mp);
+ time_left = expire_time - gethrtime();
+ return (time_left > 0 ? NSEC_TO_TICK(time_left) : -1);
+}
+
+/*
+ * Compatibility wrapper for the cv_timedwait_hires() Illumos interface.
+ */
+static clock_t
+cv_timedwait_hires_common(kcondvar_t *cvp, kmutex_t *mp, hrtime_t tim,
+ hrtime_t res, int flag, int state)
+{
+ if (res > 1) {
+ /*
+ * Align expiration to the specified resolution.
+ */
+ if (flag & CALLOUT_FLAG_ROUNDUP)
+ tim += res - 1;
+ tim = (tim / res) * res;
+ }
+
+ if (!(flag & CALLOUT_FLAG_ABSOLUTE))
+ tim += gethrtime();
+
+ return (__cv_timedwait_hires(cvp, mp, tim, state));
+}
+
+clock_t
+cv_timedwait_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t tim, hrtime_t res,
+ int flag)
+{
+ return (cv_timedwait_hires_common(cvp, mp, tim, res, flag,
+ TASK_UNINTERRUPTIBLE));
+}
+EXPORT_SYMBOL(cv_timedwait_hires);
+
+clock_t
+cv_timedwait_sig_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t tim,
+ hrtime_t res, int flag)
+{
+ return (cv_timedwait_hires_common(cvp, mp, tim, res, flag,
+ TASK_INTERRUPTIBLE));
+}
+EXPORT_SYMBOL(cv_timedwait_sig_hires);
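As a usage sketch of the hires interface (the lock, cv, and values are
hypothetical; MSEC2NSEC() is assumed to be the usual sys/time.h convenience
macro), a relative 500 ms wait whose expiration is rounded up to whole
milliseconds looks like:

	mutex_enter(&lock);
	/* Relative 500 ms timeout, rounded up to 1 ms resolution. */
	(void) cv_timedwait_hires(&cv, &lock, MSEC2NSEC(500),
	    MSEC2NSEC(1), CALLOUT_FLAG_ROUNDUP);
	mutex_exit(&lock);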
+
+void
+__cv_signal(kcondvar_t *cvp)
+{
+ ASSERT(cvp);
+ ASSERT(cvp->cv_magic == CV_MAGIC);
+ atomic_inc(&cvp->cv_refs);
+
+ /*
+ * All waiters are added with WQ_FLAG_EXCLUSIVE so only one
+	 * waiter will be set runnable with each call to wake_up().
+	 * Additionally, wake_up() holds a spinlock associated with
+ * the wait queue to ensure we don't race waking up processes.
+ */
+ if (atomic_read(&cvp->cv_waiters) > 0)
+ wake_up(&cvp->cv_event);
+
+ atomic_dec(&cvp->cv_refs);
+}
+EXPORT_SYMBOL(__cv_signal);
+
+void
+__cv_broadcast(kcondvar_t *cvp)
+{
+ ASSERT(cvp);
+ ASSERT(cvp->cv_magic == CV_MAGIC);
+ atomic_inc(&cvp->cv_refs);
+
+ /*
+	 * wake_up_all() will wake up all waiters, even those which
+	 * have the WQ_FLAG_EXCLUSIVE flag set.
+ */
+ if (atomic_read(&cvp->cv_waiters) > 0)
+ wake_up_all(&cvp->cv_event);
+
+ atomic_dec(&cvp->cv_refs);
+}
+EXPORT_SYMBOL(__cv_broadcast);
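Taken together, a minimal consumer/producer sketch of this API (all names are
hypothetical and initialization is omitted; cv_wait()/cv_signal() are the
usual wrapper macros over the __cv_* functions above):

	static kmutex_t data_lock;
	static kcondvar_t data_cv;
	static boolean_t data_ready;

	static void
	consumer_wait(void)
	{
		mutex_enter(&data_lock);
		/* Always re-check the predicate; wakeups can be spurious. */
		while (!data_ready)
			cv_wait(&data_cv, &data_lock);
		data_ready = B_FALSE;
		mutex_exit(&data_lock);
	}

	static void
	producer_post(void)
	{
		mutex_enter(&data_lock);
		data_ready = B_TRUE;
		cv_signal(&data_cv);	/* wakes at most one exclusive waiter */
		mutex_exit(&data_lock);
	}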
diff --git a/module/spl/spl-cred.c b/module/spl/spl-cred.c
new file mode 100644
index 000000000..ea3e903f9
--- /dev/null
+++ b/module/spl/spl-cred.c
@@ -0,0 +1,200 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Solaris Porting Layer (SPL) Credential Implementation.
+ */
+
+#include <sys/cred.h>
+
+static int
+#ifdef HAVE_KUIDGID_T
+cr_groups_search(const struct group_info *group_info, kgid_t grp)
+#else
+cr_groups_search(const struct group_info *group_info, gid_t grp)
+#endif
+{
+ unsigned int left, right, mid;
+ int cmp;
+
+ if (!group_info)
+ return (0);
+
+ left = 0;
+ right = group_info->ngroups;
+ while (left < right) {
+ mid = (left + right) / 2;
+ cmp = KGID_TO_SGID(grp) -
+ KGID_TO_SGID(GROUP_AT(group_info, mid));
+
+ if (cmp > 0)
+ left = mid + 1;
+ else if (cmp < 0)
+ right = mid;
+ else
+ return (1);
+ }
+ return (0);
+}
+
+/* Hold a reference on the credential */
+void
+crhold(cred_t *cr)
+{
+ (void) get_cred((const cred_t *)cr);
+}
+
+/* Free a reference on the credential */
+void
+crfree(cred_t *cr)
+{
+ put_cred((const cred_t *)cr);
+}
+
+/* Return the number of supplemental groups */
+int
+crgetngroups(const cred_t *cr)
+{
+ struct group_info *gi;
+ int rc;
+
+ gi = cr->group_info;
+ rc = gi->ngroups;
+#ifndef HAVE_GROUP_INFO_GID
+ /*
+	 * For Linux <= 4.8, crgetgroups() will only return gi->blocks[0],
+	 * which contains only the first NGROUPS_PER_BLOCK groups.
+ */
+ if (rc > NGROUPS_PER_BLOCK) {
+ WARN_ON_ONCE(1);
+ rc = NGROUPS_PER_BLOCK;
+ }
+#endif
+ return (rc);
+}
+
+/*
+ * Return an array of supplemental gids. The returned address is safe
+ * to use as long as the caller has taken a reference with crhold().
+ *
+ * Linux 4.9 API change, group_info changed from 2d array via ->blocks to 1d
+ * array via ->gid.
+ */
+gid_t *
+crgetgroups(const cred_t *cr)
+{
+ struct group_info *gi;
+ gid_t *gids = NULL;
+
+ gi = cr->group_info;
+#ifdef HAVE_GROUP_INFO_GID
+ gids = KGIDP_TO_SGIDP(gi->gid);
+#else
+ if (gi->nblocks > 0)
+ gids = KGIDP_TO_SGIDP(gi->blocks[0]);
+#endif
+ return (gids);
+}
+
+/* Check if the passed gid is available in the supplied credential. */
+int
+groupmember(gid_t gid, const cred_t *cr)
+{
+ struct group_info *gi;
+ int rc;
+
+ gi = cr->group_info;
+ rc = cr_groups_search(gi, SGID_TO_KGID(gid));
+
+ return (rc);
+}
+
+/* Return the effective user id */
+uid_t
+crgetuid(const cred_t *cr)
+{
+ return (KUID_TO_SUID(cr->euid));
+}
+
+/* Return the real user id */
+uid_t
+crgetruid(const cred_t *cr)
+{
+ return (KUID_TO_SUID(cr->uid));
+}
+
+/* Return the saved user id */
+uid_t
+crgetsuid(const cred_t *cr)
+{
+ return (KUID_TO_SUID(cr->suid));
+}
+
+/* Return the filesystem user id */
+uid_t
+crgetfsuid(const cred_t *cr)
+{
+ return (KUID_TO_SUID(cr->fsuid));
+}
+
+/* Return the effective group id */
+gid_t
+crgetgid(const cred_t *cr)
+{
+ return (KGID_TO_SGID(cr->egid));
+}
+
+/* Return the real group id */
+gid_t
+crgetrgid(const cred_t *cr)
+{
+ return (KGID_TO_SGID(cr->gid));
+}
+
+/* Return the saved group id */
+gid_t
+crgetsgid(const cred_t *cr)
+{
+ return (KGID_TO_SGID(cr->sgid));
+}
+
+/* Return the filesystem group id */
+gid_t
+crgetfsgid(const cred_t *cr)
+{
+ return (KGID_TO_SGID(cr->fsgid));
+}
+
+EXPORT_SYMBOL(crhold);
+EXPORT_SYMBOL(crfree);
+EXPORT_SYMBOL(crgetuid);
+EXPORT_SYMBOL(crgetruid);
+EXPORT_SYMBOL(crgetsuid);
+EXPORT_SYMBOL(crgetfsuid);
+EXPORT_SYMBOL(crgetgid);
+EXPORT_SYMBOL(crgetrgid);
+EXPORT_SYMBOL(crgetsgid);
+EXPORT_SYMBOL(crgetfsgid);
+EXPORT_SYMBOL(crgetngroups);
+EXPORT_SYMBOL(crgetgroups);
+EXPORT_SYMBOL(groupmember);
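A brief illustrative use of these accessors (the function and gid are made
up; CRED() is assumed to resolve to the current task's credential as it does
on Illumos):

	static int
	may_do_action(gid_t admin_gid)
	{
		const cred_t *cr = CRED();

		/* Permit root outright, otherwise require group membership. */
		if (crgetuid(cr) == 0)
			return (1);

		return (groupmember(admin_gid, cr));
	}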
diff --git a/module/spl/spl-err.c b/module/spl/spl-err.c
new file mode 100644
index 000000000..6b71296e8
--- /dev/null
+++ b/module/spl/spl-err.c
@@ -0,0 +1,133 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Solaris Porting Layer (SPL) Error Implementation.
+ */
+
+#include <sys/sysmacros.h>
+#include <sys/cmn_err.h>
+#include <linux/ratelimit.h>
+
+/*
+ * It is often useful to have a panic actually crash the node so you
+ * are notified of the event and can collect a crash dump for later
+ * analysis and other such goodies.
+ * By default, however, this behavior remains disabled.
+ */
+/* BEGIN CSTYLED */
+unsigned int spl_panic_halt;
+module_param(spl_panic_halt, uint, 0644);
+MODULE_PARM_DESC(spl_panic_halt, "Cause kernel panic on assertion failures");
+/* END CSTYLED */
+
+/*
+ * Limit the number of stack traces dumped to not more than 5 every
+ * 60 seconds to prevent denial-of-service attacks from debug code.
+ */
+DEFINE_RATELIMIT_STATE(dumpstack_ratelimit_state, 60 * HZ, 5);
+
+void
+spl_dumpstack(void)
+{
+ if (__ratelimit(&dumpstack_ratelimit_state)) {
+ printk("Showing stack for process %d\n", current->pid);
+ dump_stack();
+ }
+}
+EXPORT_SYMBOL(spl_dumpstack);
+
+int
+spl_panic(const char *file, const char *func, int line, const char *fmt, ...)
+{
+ const char *newfile;
+ char msg[MAXMSGLEN];
+ va_list ap;
+
+ newfile = strrchr(file, '/');
+ if (newfile != NULL)
+ newfile = newfile + 1;
+ else
+ newfile = file;
+
+ va_start(ap, fmt);
+ (void) vsnprintf(msg, sizeof (msg), fmt, ap);
+ va_end(ap);
+
+ printk(KERN_EMERG "%s", msg);
+ printk(KERN_EMERG "PANIC at %s:%d:%s()\n", newfile, line, func);
+ if (spl_panic_halt)
+ panic("%s", msg);
+
+ spl_dumpstack();
+
+ /* Halt the thread to facilitate further debugging */
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ while (1)
+ schedule();
+
+ /* Unreachable */
+ return (1);
+}
+EXPORT_SYMBOL(spl_panic);
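For context, a sketch of how spl_panic() is normally reached; the SPL's
VERIFY()-style macros are assumed to expand to a call of this shape (the
check itself is made up):

	if (size == 0)
		spl_panic(__FILE__, __FUNCTION__, __LINE__,
		    "unexpected zero-length buffer");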
+
+void
+vcmn_err(int ce, const char *fmt, va_list ap)
+{
+ char msg[MAXMSGLEN];
+
+ vsnprintf(msg, MAXMSGLEN - 1, fmt, ap);
+
+ switch (ce) {
+ case CE_IGNORE:
+ break;
+ case CE_CONT:
+ printk("%s", msg);
+ break;
+ case CE_NOTE:
+ printk(KERN_NOTICE "NOTICE: %s\n", msg);
+ break;
+ case CE_WARN:
+ printk(KERN_WARNING "WARNING: %s\n", msg);
+ break;
+ case CE_PANIC:
+ printk(KERN_EMERG "PANIC: %s\n", msg);
+ spl_dumpstack();
+
+ /* Halt the thread to facilitate further debugging */
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ while (1)
+ schedule();
+ }
+} /* vcmn_err() */
+EXPORT_SYMBOL(vcmn_err);
+
+void
+cmn_err(int ce, const char *fmt, ...)
+{
+ va_list ap;
+
+ va_start(ap, fmt);
+ vcmn_err(ce, fmt, ap);
+ va_end(ap);
+} /* cmn_err() */
+EXPORT_SYMBOL(cmn_err);
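Illustrative calls (the messages are made up); the 'ce' argument selects the
printk severity handled in vcmn_err() above:

	cmn_err(CE_NOTE, "import of pool '%s' complete", "tank");
	cmn_err(CE_WARN, "device %d has failed %d times", 3, 2);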
diff --git a/module/spl/spl-generic.c b/module/spl/spl-generic.c
new file mode 100644
index 000000000..b38fe254c
--- /dev/null
+++ b/module/spl/spl-generic.c
@@ -0,0 +1,775 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Solaris Porting Layer (SPL) Generic Implementation.
+ */
+
+#include <sys/sysmacros.h>
+#include <sys/systeminfo.h>
+#include <sys/vmsystm.h>
+#include <sys/kobj.h>
+#include <sys/kmem.h>
+#include <sys/kmem_cache.h>
+#include <sys/vmem.h>
+#include <sys/mutex.h>
+#include <sys/rwlock.h>
+#include <sys/taskq.h>
+#include <sys/tsd.h>
+#include <sys/zmod.h>
+#include <sys/debug.h>
+#include <sys/proc.h>
+#include <sys/kstat.h>
+#include <sys/file.h>
+#include <linux/ctype.h>
+#include <sys/disp.h>
+#include <sys/random.h>
+#include <sys/strings.h>
+#include <linux/kmod.h>
+
+char spl_version[32] = "SPL v" SPL_META_VERSION "-" SPL_META_RELEASE;
+EXPORT_SYMBOL(spl_version);
+
+/* BEGIN CSTYLED */
+unsigned long spl_hostid = 0;
+EXPORT_SYMBOL(spl_hostid);
+module_param(spl_hostid, ulong, 0644);
+MODULE_PARM_DESC(spl_hostid, "The system hostid.");
+/* END CSTYLED */
+
+proc_t p0;
+EXPORT_SYMBOL(p0);
+
+/*
+ * Xorshift Pseudo Random Number Generator based on work by Sebastiano Vigna
+ *
+ * "Further scramblings of Marsaglia's xorshift generators"
+ * http://vigna.di.unimi.it/ftp/papers/xorshiftplus.pdf
+ *
+ * random_get_pseudo_bytes() is an API function on Illumos whose sole purpose
+ * is to provide bytes containing random numbers. It is mapped to /dev/urandom
+ * on Illumos, which uses a "FIPS 186-2 algorithm". No user of the SPL's
+ * random_get_pseudo_bytes() needs bytes that are of cryptographic quality, so
+ * we can implement it using a fast PRNG that we seed using Linux' actual
+ * equivalent to random_get_pseudo_bytes(). We do this by providing each CPU
+ * with an independent seed so that all calls to random_get_pseudo_bytes() are
+ * free of atomic instructions.
+ *
+ * A consequence of using a fast PRNG is that using random_get_pseudo_bytes()
+ * to generate words larger than 128 bits will paradoxically be limited to
+ * `2^128 - 1` possibilities. This is because we have a sequence of `2^128 - 1`
+ * 128-bit words and selecting the first will implicitly select the second. If
+ * a caller finds this behavior undesirable, random_get_bytes() should be used
+ * instead.
+ *
+ * XXX: Linux interrupt handlers that trigger within the critical section
+ * formed by `s[1] = xp[1];` and `xp[0] = s[0];` and call this function will
+ * see the same numbers. Nothing in the code currently calls this in an
+ * interrupt handler, so this is considered to be okay. If that becomes a
+ * problem, we could create a set of per-cpu variables for interrupt handlers
+ * and use them when in_interrupt() from linux/preempt_mask.h evaluates to
+ * true.
+ */
+static DEFINE_PER_CPU(uint64_t[2], spl_pseudo_entropy);
+
+/*
+ * spl_rand_next()/spl_rand_jump() are copied from the following CC-0 licensed
+ * file:
+ *
+ * http://xorshift.di.unimi.it/xorshift128plus.c
+ */
+
+static inline uint64_t
+spl_rand_next(uint64_t *s)
+{
+ uint64_t s1 = s[0];
+ const uint64_t s0 = s[1];
+ s[0] = s0;
+ s1 ^= s1 << 23; // a
+ s[1] = s1 ^ s0 ^ (s1 >> 18) ^ (s0 >> 5); // b, c
+ return (s[1] + s0);
+}
+
+static inline void
+spl_rand_jump(uint64_t *s)
+{
+ static const uint64_t JUMP[] =
+ { 0x8a5cd789635d2dff, 0x121fd2155c472f96 };
+
+ uint64_t s0 = 0;
+ uint64_t s1 = 0;
+ int i, b;
+ for (i = 0; i < sizeof (JUMP) / sizeof (*JUMP); i++)
+ for (b = 0; b < 64; b++) {
+ if (JUMP[i] & 1ULL << b) {
+ s0 ^= s[0];
+ s1 ^= s[1];
+ }
+ (void) spl_rand_next(s);
+ }
+
+ s[0] = s0;
+ s[1] = s1;
+}
+
+int
+random_get_pseudo_bytes(uint8_t *ptr, size_t len)
+{
+ uint64_t *xp, s[2];
+
+ ASSERT(ptr);
+
+ xp = get_cpu_var(spl_pseudo_entropy);
+
+ s[0] = xp[0];
+ s[1] = xp[1];
+
+ while (len) {
+ union {
+ uint64_t ui64;
+ uint8_t byte[sizeof (uint64_t)];
+		} entropy;
+ int i = MIN(len, sizeof (uint64_t));
+
+ len -= i;
+ entropy.ui64 = spl_rand_next(s);
+
+ while (i--)
+ *ptr++ = entropy.byte[i];
+ }
+
+ xp[0] = s[0];
+ xp[1] = s[1];
+
+ put_cpu_var(spl_pseudo_entropy);
+
+ return (0);
+}
+
+EXPORT_SYMBOL(random_get_pseudo_bytes);
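A minimal caller sketch (the buffer is hypothetical); note that these bytes
are deliberately non-cryptographic:

	uint8_t nonce[16];

	/* Fill the buffer from the per-cpu xorshift128+ stream. */
	(void) random_get_pseudo_bytes(nonce, sizeof (nonce));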
+
+#if BITS_PER_LONG == 32
+/*
+ * Support 64/64 => 64 division on a 32-bit platform. While the kernel
+ * provides a div64_u64() function for this we do not use it because the
+ * implementation is flawed. There are cases which return incorrect
+ * results as late as linux-2.6.35. Until this is fixed upstream the
+ * spl must provide its own implementation.
+ *
+ * This implementation is a slightly modified version of the algorithm
+ * proposed by the book 'Hacker's Delight'. The original source can be
+ * found here and is available for use without restriction.
+ *
+ * http://www.hackersdelight.org/HDcode/newCode/divDouble.c
+ */
+
+/*
+ * Calculate the number of leading zeros in a 64-bit value.
+ */
+static int
+nlz64(uint64_t x)
+{
+ register int n = 0;
+
+ if (x == 0)
+ return (64);
+
+ if (x <= 0x00000000FFFFFFFFULL) { n = n + 32; x = x << 32; }
+ if (x <= 0x0000FFFFFFFFFFFFULL) { n = n + 16; x = x << 16; }
+ if (x <= 0x00FFFFFFFFFFFFFFULL) { n = n + 8; x = x << 8; }
+ if (x <= 0x0FFFFFFFFFFFFFFFULL) { n = n + 4; x = x << 4; }
+ if (x <= 0x3FFFFFFFFFFFFFFFULL) { n = n + 2; x = x << 2; }
+ if (x <= 0x7FFFFFFFFFFFFFFFULL) { n = n + 1; }
+
+ return (n);
+}
+
+/*
+ * Newer kernels have a div_u64() function but we define our own
+ * to simplify portability between kernel versions.
+ */
+static inline uint64_t
+__div_u64(uint64_t u, uint32_t v)
+{
+ (void) do_div(u, v);
+ return (u);
+}
+
+/*
+ * Implementation of 64-bit unsigned division for 32-bit machines.
+ *
+ * First the procedure takes care of the case in which the divisor is a
+ * 32-bit quantity. There are two subcases: (1) If the left half of the
+ * dividend is less than the divisor, one execution of do_div() is all that
+ * is required (overflow is not possible). (2) Otherwise it does two
+ * divisions, using the grade school method.
+ */
+uint64_t
+__udivdi3(uint64_t u, uint64_t v)
+{
+ uint64_t u0, u1, v1, q0, q1, k;
+ int n;
+
+ if (v >> 32 == 0) { // If v < 2**32:
+ if (u >> 32 < v) { // If u/v cannot overflow,
+ return (__div_u64(u, v)); // just do one division.
+ } else { // If u/v would overflow:
+ u1 = u >> 32; // Break u into two halves.
+ u0 = u & 0xFFFFFFFF;
+ q1 = __div_u64(u1, v); // First quotient digit.
+ k = u1 - q1 * v; // First remainder, < v.
+ u0 += (k << 32);
+			q0 = __div_u64(u0, v);	// Second quotient digit.
+ return ((q1 << 32) + q0);
+ }
+ } else { // If v >= 2**32:
+ n = nlz64(v); // 0 <= n <= 31.
+ v1 = (v << n) >> 32; // Normalize divisor, MSB is 1.
+ u1 = u >> 1; // To ensure no overflow.
+ q1 = __div_u64(u1, v1); // Get quotient from
+ q0 = (q1 << n) >> 31; // Undo normalization and
+ // division of u by 2.
+ if (q0 != 0) // Make q0 correct or
+ q0 = q0 - 1; // too small by 1.
+ if ((u - q0 * v) >= v)
+ q0 = q0 + 1; // Now q0 is correct.
+
+ return (q0);
+ }
+}
+EXPORT_SYMBOL(__udivdi3);
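For orientation, __udivdi3() is not called by hand; on 32-bit targets the
compiler lowers the 64-bit '/' operator to this libgcc-style call, which is
why the module must export the symbol (illustrative expression):

	uint64_t bytes = 42949672966ULL;
	uint64_t blocks = bytes / 4096ULL;	/* emitted as __udivdi3() */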
+
+/* BEGIN CSTYLED */
+#ifndef abs64
+#define abs64(x) ({ uint64_t t = (x) >> 63; ((x) ^ t) - t; })
+#endif
+/* END CSTYLED */
+
+/*
+ * Implementation of 64-bit signed division for 32-bit machines.
+ */
+int64_t
+__divdi3(int64_t u, int64_t v)
+{
+ int64_t q, t;
+ q = __udivdi3(abs64(u), abs64(v));
+ t = (u ^ v) >> 63; // If u, v have different
+ return ((q ^ t) - t); // signs, negate q.
+}
+EXPORT_SYMBOL(__divdi3);
+
+/*
+ * Implementation of 64-bit unsigned modulo for 32-bit machines.
+ */
+uint64_t
+__umoddi3(uint64_t dividend, uint64_t divisor)
+{
+ return (dividend - (divisor * __udivdi3(dividend, divisor)));
+}
+EXPORT_SYMBOL(__umoddi3);
+
+/*
+ * Implementation of 64-bit unsigned division/modulo for 32-bit machines.
+ */
+uint64_t
+__udivmoddi4(uint64_t n, uint64_t d, uint64_t *r)
+{
+ uint64_t q = __udivdi3(n, d);
+ if (r)
+ *r = n - d * q;
+ return (q);
+}
+EXPORT_SYMBOL(__udivmoddi4);
+
+/*
+ * Implementation of 64-bit signed division/modulo for 32-bit machines.
+ */
+int64_t
+__divmoddi4(int64_t n, int64_t d, int64_t *r)
+{
+ int64_t q, rr;
+ boolean_t nn = B_FALSE;
+ boolean_t nd = B_FALSE;
+ if (n < 0) {
+ nn = B_TRUE;
+ n = -n;
+ }
+ if (d < 0) {
+ nd = B_TRUE;
+ d = -d;
+ }
+
+ q = __udivmoddi4(n, d, (uint64_t *)&rr);
+
+ if (nn != nd)
+ q = -q;
+ if (nn)
+ rr = -rr;
+ if (r)
+ *r = rr;
+ return (q);
+}
+EXPORT_SYMBOL(__divmoddi4);
+
+#if defined(__arm) || defined(__arm__)
+/*
+ * Implementation of 64-bit (un)signed division for 32-bit arm machines.
+ *
+ * Run-time ABI for the ARM Architecture (page 20). A pair of (unsigned)
+ * long longs is returned in {{r0, r1}, {r2,r3}}, the quotient in {r0, r1},
+ * and the remainder in {r2, r3}. The return type is specifically left
+ * set to 'void' to ensure the compiler does not overwrite these registers
+ * during the return. All results are in registers as per ABI
+ */
+void
+__aeabi_uldivmod(uint64_t u, uint64_t v)
+{
+ uint64_t res;
+ uint64_t mod;
+
+ res = __udivdi3(u, v);
+ mod = __umoddi3(u, v);
+ {
+ register uint32_t r0 asm("r0") = (res & 0xFFFFFFFF);
+ register uint32_t r1 asm("r1") = (res >> 32);
+ register uint32_t r2 asm("r2") = (mod & 0xFFFFFFFF);
+ register uint32_t r3 asm("r3") = (mod >> 32);
+
+ /* BEGIN CSTYLED */
+ asm volatile(""
+ : "+r"(r0), "+r"(r1), "+r"(r2),"+r"(r3) /* output */
+ : "r"(r0), "r"(r1), "r"(r2), "r"(r3)); /* input */
+ /* END CSTYLED */
+
+ return; /* r0; */
+ }
+}
+EXPORT_SYMBOL(__aeabi_uldivmod);
+
+void
+__aeabi_ldivmod(int64_t u, int64_t v)
+{
+ int64_t res;
+ uint64_t mod;
+
+ res = __divdi3(u, v);
+ mod = __umoddi3(u, v);
+ {
+ register uint32_t r0 asm("r0") = (res & 0xFFFFFFFF);
+ register uint32_t r1 asm("r1") = (res >> 32);
+ register uint32_t r2 asm("r2") = (mod & 0xFFFFFFFF);
+ register uint32_t r3 asm("r3") = (mod >> 32);
+
+ /* BEGIN CSTYLED */
+ asm volatile(""
+ : "+r"(r0), "+r"(r1), "+r"(r2),"+r"(r3) /* output */
+ : "r"(r0), "r"(r1), "r"(r2), "r"(r3)); /* input */
+ /* END CSTYLED */
+
+ return; /* r0; */
+ }
+}
+EXPORT_SYMBOL(__aeabi_ldivmod);
+#endif /* __arm || __arm__ */
+#endif /* BITS_PER_LONG */
+
+/*
+ * NOTE: The strtoxx behavior is solely based on my reading of the Solaris
+ * ddi_strtol(9F) man page. I have not verified the behavior of these
+ * functions against their Solaris counterparts. It is possible that I
+ * may have misinterpreted the man page or the man page is incorrect.
+ */
+int ddi_strtoul(const char *, char **, int, unsigned long *);
+int ddi_strtol(const char *, char **, int, long *);
+int ddi_strtoull(const char *, char **, int, unsigned long long *);
+int ddi_strtoll(const char *, char **, int, long long *);
+
+#define define_ddi_strtoux(type, valtype) \
+int ddi_strtou##type(const char *str, char **endptr, \
+ int base, valtype *result) \
+{ \
+ valtype last_value, value = 0; \
+ char *ptr = (char *)str; \
+ int flag = 1, digit; \
+ \
+ if (strlen(ptr) == 0) \
+ return (EINVAL); \
+ \
+ /* Auto-detect base based on prefix */ \
+ if (!base) { \
+ if (str[0] == '0') { \
+ if (tolower(str[1]) == 'x' && isxdigit(str[2])) { \
+ base = 16; /* hex */ \
+ ptr += 2; \
+ } else if (str[1] >= '0' && str[1] < 8) { \
+ base = 8; /* octal */ \
+ ptr += 1; \
+ } else { \
+ return (EINVAL); \
+ } \
+ } else { \
+ base = 10; /* decimal */ \
+ } \
+ } \
+ \
+ while (1) { \
+ if (isdigit(*ptr)) \
+ digit = *ptr - '0'; \
+ else if (isalpha(*ptr)) \
+ digit = tolower(*ptr) - 'a' + 10; \
+ else \
+ break; \
+ \
+ if (digit >= base) \
+ break; \
+ \
+ last_value = value; \
+ value = value * base + digit; \
+ if (last_value > value) /* Overflow */ \
+ return (ERANGE); \
+ \
+ flag = 1; \
+ ptr++; \
+ } \
+ \
+ if (flag) \
+ *result = value; \
+ \
+ if (endptr) \
+ *endptr = (char *)(flag ? ptr : str); \
+ \
+ return (0); \
+} \
+
+#define define_ddi_strtox(type, valtype) \
+int ddi_strto##type(const char *str, char **endptr, \
+ int base, valtype *result) \
+{ \
+ int rc; \
+ \
+ if (*str == '-') { \
+ rc = ddi_strtou##type(str + 1, endptr, base, result); \
+ if (!rc) { \
+ if (*endptr == str + 1) \
+ *endptr = (char *)str; \
+ else \
+ *result = -*result; \
+ } \
+ } else { \
+ rc = ddi_strtou##type(str, endptr, base, result); \
+ } \
+ \
+ return (rc); \
+}
+
+define_ddi_strtoux(l, unsigned long)
+define_ddi_strtox(l, long)
+define_ddi_strtoux(ll, unsigned long long)
+define_ddi_strtox(ll, long long)
+
+EXPORT_SYMBOL(ddi_strtoul);
+EXPORT_SYMBOL(ddi_strtol);
+EXPORT_SYMBOL(ddi_strtoll);
+EXPORT_SYMBOL(ddi_strtoull);
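A hedged usage sketch of the generated parsers (the function and names are
hypothetical); passing base 0 engages the prefix auto-detection above:

	static int
	parse_limit(const char *arg, unsigned long *limit)
	{
		char *end;
		int rc;

		/* Accepts "123", "0x7b", and "0173" via auto-detection. */
		rc = ddi_strtoul(arg, &end, 0, limit);
		if (rc != 0 || *end != '\0')
			return (EINVAL);

		return (0);
	}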
+
+int
+ddi_copyin(const void *from, void *to, size_t len, int flags)
+{
+ /* Fake ioctl() issued by kernel, 'from' is a kernel address */
+ if (flags & FKIOCTL) {
+ memcpy(to, from, len);
+ return (0);
+ }
+
+ return (copyin(from, to, len));
+}
+EXPORT_SYMBOL(ddi_copyin);
+
+int
+ddi_copyout(const void *from, void *to, size_t len, int flags)
+{
+ /* Fake ioctl() issued by kernel, 'from' is a kernel address */
+ if (flags & FKIOCTL) {
+ memcpy(to, from, len);
+ return (0);
+ }
+
+ return (copyout(from, to, len));
+}
+EXPORT_SYMBOL(ddi_copyout);
+
+/*
+ * Read the unique system identifier from the /etc/hostid file.
+ *
+ * The behavior of /usr/bin/hostid on Linux systems with the
+ * regular eglibc and coreutils is:
+ *
+ * 1. Generate the value if the /etc/hostid file does not exist
+ * or if the /etc/hostid file is less than four bytes in size.
+ *
+ * 2. If the /etc/hostid file is at least 4 bytes, then return
+ * the first four bytes [0..3] in native endian order.
+ *
+ * 3. Always ignore bytes [4..] if they exist in the file.
+ *
+ * Only the first four bytes are significant, even on systems that
+ * have a 64-bit word size.
+ *
+ * See:
+ *
+ * eglibc: sysdeps/unix/sysv/linux/gethostid.c
+ * coreutils: src/hostid.c
+ *
+ * Notes:
+ *
+ * The /etc/hostid file on Solaris is a text file that often reads:
+ *
+ * # DO NOT EDIT
+ * "0123456789"
+ *
+ * Directly copying this file to Linux results in a constant
+ * hostid of 4f442023 because the default comment constitutes
+ * the first four bytes of the file.
+ *
+ */
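A quick illustrative check of that note: on a little-endian machine the first
four bytes of such a file, "# DO", decode to exactly that constant:

	const char buf[4] = { '#', ' ', 'D', 'O' };
	uint32_t hostid;

	/* Bytes 0x23, 0x20, 0x44, 0x4f read little-endian... */
	memcpy(&hostid, buf, sizeof (hostid));
	/* ...yield hostid == 0x4f442023. */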
+
+char *spl_hostid_path = HW_HOSTID_PATH;
+module_param(spl_hostid_path, charp, 0444);
+MODULE_PARM_DESC(spl_hostid_path, "The system hostid file (/etc/hostid)");
+
+static int
+hostid_read(uint32_t *hostid)
+{
+ uint64_t size;
+ struct _buf *file;
+ uint32_t value = 0;
+ int error;
+
+ file = kobj_open_file(spl_hostid_path);
+ if (file == (struct _buf *)-1)
+ return (ENOENT);
+
+ error = kobj_get_filesize(file, &size);
+ if (error) {
+ kobj_close_file(file);
+ return (error);
+ }
+
+ if (size < sizeof (HW_HOSTID_MASK)) {
+ kobj_close_file(file);
+ return (EINVAL);
+ }
+
+ /*
+ * Read directly into the variable like eglibc does.
+ * Short reads are okay; native behavior is preserved.
+ */
+ error = kobj_read_file(file, (char *)&value, sizeof (value), 0);
+ if (error < 0) {
+ kobj_close_file(file);
+ return (EIO);
+ }
+
+ /* Mask down to 32 bits like coreutils does. */
+ *hostid = (value & HW_HOSTID_MASK);
+ kobj_close_file(file);
+
+ return (0);
+}
+
+/*
+ * Return the system hostid. Preferentially use the spl_hostid module option
+ * when set, otherwise use the value in the /etc/hostid file.
+ */
+uint32_t
+zone_get_hostid(void *zone)
+{
+ uint32_t hostid;
+
+ ASSERT3P(zone, ==, NULL);
+
+ if (spl_hostid != 0)
+ return ((uint32_t)(spl_hostid & HW_HOSTID_MASK));
+
+ if (hostid_read(&hostid) == 0)
+ return (hostid);
+
+ return (0);
+}
+EXPORT_SYMBOL(zone_get_hostid);
+
+static int
+spl_kvmem_init(void)
+{
+ int rc = 0;
+
+ rc = spl_kmem_init();
+ if (rc)
+ return (rc);
+
+ rc = spl_vmem_init();
+ if (rc) {
+ spl_kmem_fini();
+ return (rc);
+ }
+
+ return (rc);
+}
+
+/*
+ * We initialize the random number generator with 128 bits of entropy from the
+ * system random number generator. In the improbable case that we have a zero
+ * seed, we fall back to the system jiffies, unless it is also zero, in which
+ * case we use a preprogrammed seed. We step forward by 2^64 iterations to
+ * initialize each of the per-cpu seeds so that the sequences generated on each
+ * CPU are guaranteed to never overlap in practice.
+ */
+static void __init
+spl_random_init(void)
+{
+ uint64_t s[2];
+ int i;
+
+ get_random_bytes(s, sizeof (s));
+
+ if (s[0] == 0 && s[1] == 0) {
+ if (jiffies != 0) {
+ s[0] = jiffies;
+ s[1] = ~0 - jiffies;
+ } else {
+ (void) memcpy(s, "improbable seed", sizeof (s));
+ }
+ printk("SPL: get_random_bytes() returned 0 "
+ "when generating random seed. Setting initial seed to "
+ "0x%016llx%016llx.", cpu_to_be64(s[0]), cpu_to_be64(s[1]));
+ }
+
+ for_each_possible_cpu(i) {
+ uint64_t *wordp = per_cpu(spl_pseudo_entropy, i);
+
+ spl_rand_jump(s);
+
+ wordp[0] = s[0];
+ wordp[1] = s[1];
+ }
+}
+
+static void
+spl_kvmem_fini(void)
+{
+ spl_vmem_fini();
+ spl_kmem_fini();
+}
+
+static int __init
+spl_init(void)
+{
+ int rc = 0;
+
+ bzero(&p0, sizeof (proc_t));
+ spl_random_init();
+
+ if ((rc = spl_kvmem_init()))
+ goto out1;
+
+ if ((rc = spl_mutex_init()))
+ goto out2;
+
+ if ((rc = spl_rw_init()))
+ goto out3;
+
+ if ((rc = spl_tsd_init()))
+ goto out4;
+
+ if ((rc = spl_taskq_init()))
+ goto out5;
+
+ if ((rc = spl_kmem_cache_init()))
+ goto out6;
+
+ if ((rc = spl_vn_init()))
+ goto out7;
+
+ if ((rc = spl_proc_init()))
+ goto out8;
+
+ if ((rc = spl_kstat_init()))
+ goto out9;
+
+ if ((rc = spl_zlib_init()))
+ goto out10;
+
+ printk(KERN_NOTICE "SPL: Loaded module v%s-%s%s\n", SPL_META_VERSION,
+ SPL_META_RELEASE, SPL_DEBUG_STR);
+ return (rc);
+
+out10:
+ spl_kstat_fini();
+out9:
+ spl_proc_fini();
+out8:
+ spl_vn_fini();
+out7:
+ spl_kmem_cache_fini();
+out6:
+ spl_taskq_fini();
+out5:
+ spl_tsd_fini();
+out4:
+ spl_rw_fini();
+out3:
+ spl_mutex_fini();
+out2:
+ spl_kvmem_fini();
+out1:
+ printk(KERN_NOTICE "SPL: Failed to Load Solaris Porting Layer "
+ "v%s-%s%s, rc = %d\n", SPL_META_VERSION, SPL_META_RELEASE,
+ SPL_DEBUG_STR, rc);
+
+ return (rc);
+}
+
+static void __exit
+spl_fini(void)
+{
+ printk(KERN_NOTICE "SPL: Unloaded module v%s-%s%s\n",
+ SPL_META_VERSION, SPL_META_RELEASE, SPL_DEBUG_STR);
+ spl_zlib_fini();
+ spl_kstat_fini();
+ spl_proc_fini();
+ spl_vn_fini();
+ spl_kmem_cache_fini();
+ spl_taskq_fini();
+ spl_tsd_fini();
+ spl_rw_fini();
+ spl_mutex_fini();
+ spl_kvmem_fini();
+}
+
+module_init(spl_init);
+module_exit(spl_fini);
+
+MODULE_DESCRIPTION("Solaris Porting Layer");
+MODULE_AUTHOR(SPL_META_AUTHOR);
+MODULE_LICENSE(SPL_META_LICENSE);
+MODULE_VERSION(SPL_META_VERSION "-" SPL_META_RELEASE);
diff --git a/module/spl/spl-kmem-cache.c b/module/spl/spl-kmem-cache.c
new file mode 100644
index 000000000..5492c6a46
--- /dev/null
+++ b/module/spl/spl-kmem-cache.c
@@ -0,0 +1,1769 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <sys/kmem.h>
+#include <sys/kmem_cache.h>
+#include <sys/shrinker.h>
+#include <sys/taskq.h>
+#include <sys/timer.h>
+#include <sys/vmem.h>
+#include <sys/wait.h>
+#include <linux/slab.h>
+#include <linux/swap.h>
+#include <linux/prefetch.h>
+
+/*
+ * Within the scope of the spl-kmem-cache.c file, the kmem_cache_*
+ * definitions are removed to allow access to the real Linux slab allocator.
+ */
+#undef kmem_cache_destroy
+#undef kmem_cache_create
+#undef kmem_cache_alloc
+#undef kmem_cache_free
+
+
+/*
+ * Linux 3.16 replaced smp_mb__{before,after}_{atomic,clear}_{dec,inc,bit}()
+ * with smp_mb__{before,after}_atomic() because they were redundant. This is
+ * only used inside our SLAB allocator, so we implement an internal wrapper
+ * here to give us smp_mb__{before,after}_atomic() on older kernels.
+ */
+#ifndef smp_mb__before_atomic
+#define smp_mb__before_atomic(x) smp_mb__before_clear_bit(x)
+#endif
+
+#ifndef smp_mb__after_atomic
+#define smp_mb__after_atomic(x) smp_mb__after_clear_bit(x)
+#endif
+
+/*
+ * Cache expiration was implemented because it was part of the default Solaris
+ * kmem_cache behavior. The idea is that per-cpu objects which haven't been
+ * accessed in several seconds should be returned to the cache. On the other
+ * hand Linux slabs never move objects back to the slabs unless there is
+ * memory pressure on the system. By default the Linux method is enabled
+ * because it has been shown to improve responsiveness on low memory systems.
+ * This policy may be changed by setting KMC_EXPIRE_AGE or KMC_EXPIRE_MEM.
+ */
+/* BEGIN CSTYLED */
+unsigned int spl_kmem_cache_expire = KMC_EXPIRE_MEM;
+EXPORT_SYMBOL(spl_kmem_cache_expire);
+module_param(spl_kmem_cache_expire, uint, 0644);
+MODULE_PARM_DESC(spl_kmem_cache_expire, "By age (0x1) or low memory (0x2)");
+
+/*
+ * Cache magazines are an optimization designed to minimize the cost of
+ * allocating memory. They do this by keeping a per-cpu cache of recently
+ * freed objects, which can then be reallocated without taking a lock. This
+ * can improve performance on highly contended caches. However, because
+ * objects in magazines will prevent otherwise empty slabs from being
+ * immediately released this may not be ideal for low memory machines.
+ *
+ * For this reason spl_kmem_cache_magazine_size can be used to set a maximum
+ * magazine size. When this value is set to 0 the magazine size will be
+ * automatically determined based on the object size. Otherwise magazines
+ * will be limited to 2-256 objects per magazine (i.e. per CPU). Magazines
+ * may never be entirely disabled in this implementation.
+ */
+unsigned int spl_kmem_cache_magazine_size = 0;
+module_param(spl_kmem_cache_magazine_size, uint, 0444);
+MODULE_PARM_DESC(spl_kmem_cache_magazine_size,
+ "Default magazine size (2-256), set automatically (0)");
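Because spl_kmem_cache_magazine_size is registered with mode 0444 it is
read-only at runtime and must be set at module load time, for example with
"modprobe spl spl_kmem_cache_magazine_size=64" (an illustrative value).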
+
+/*
+ * The default behavior is to report the number of objects remaining in the
+ * cache. This allows the Linux VM to repeatedly reclaim objects from the
+ * cache when memory is low to satisfy other memory allocations. Alternately,
+ * setting this value to KMC_RECLAIM_ONCE limits how aggressively the cache
+ * is reclaimed. This may increase the likelihood of out of memory events.
+ */
+unsigned int spl_kmem_cache_reclaim = 0 /* KMC_RECLAIM_ONCE */;
+module_param(spl_kmem_cache_reclaim, uint, 0644);
+MODULE_PARM_DESC(spl_kmem_cache_reclaim, "Single reclaim pass (0x1)");
+
+unsigned int spl_kmem_cache_obj_per_slab = SPL_KMEM_CACHE_OBJ_PER_SLAB;
+module_param(spl_kmem_cache_obj_per_slab, uint, 0644);
+MODULE_PARM_DESC(spl_kmem_cache_obj_per_slab, "Number of objects per slab");
+
+unsigned int spl_kmem_cache_obj_per_slab_min = SPL_KMEM_CACHE_OBJ_PER_SLAB_MIN;
+module_param(spl_kmem_cache_obj_per_slab_min, uint, 0644);
+MODULE_PARM_DESC(spl_kmem_cache_obj_per_slab_min,
+ "Minimal number of objects per slab");
+
+unsigned int spl_kmem_cache_max_size = SPL_KMEM_CACHE_MAX_SIZE;
+module_param(spl_kmem_cache_max_size, uint, 0644);
+MODULE_PARM_DESC(spl_kmem_cache_max_size, "Maximum size of slab in MB");
+
+/*
+ * For small objects the Linux slab allocator should be used to make the most
+ * efficient use of the memory. However, large objects are not supported by
+ * the Linux slab and therefore the SPL implementation is preferred. A cutoff
+ * of 16K was determined to be optimal for architectures using 4K pages.
+ */
+#if PAGE_SIZE == 4096
+unsigned int spl_kmem_cache_slab_limit = 16384;
+#else
+unsigned int spl_kmem_cache_slab_limit = 0;
+#endif
+module_param(spl_kmem_cache_slab_limit, uint, 0644);
+MODULE_PARM_DESC(spl_kmem_cache_slab_limit,
+ "Objects less than N bytes use the Linux slab");
+
+/*
+ * This value defaults to a threshold designed to avoid allocations which
+ * have been deemed costly by the kernel.
+ */
+unsigned int spl_kmem_cache_kmem_limit =
+ ((1 << (PAGE_ALLOC_COSTLY_ORDER - 1)) * PAGE_SIZE) /
+ SPL_KMEM_CACHE_OBJ_PER_SLAB;
+module_param(spl_kmem_cache_kmem_limit, uint, 0644);
+MODULE_PARM_DESC(spl_kmem_cache_kmem_limit,
+ "Objects less than N bytes use the kmalloc");
+
+/*
+ * The number of threads available to allocate new slabs for caches. This
+ * should not need to be tuned but it is available for performance analysis.
+ */
+unsigned int spl_kmem_cache_kmem_threads = 4;
+module_param(spl_kmem_cache_kmem_threads, uint, 0444);
+MODULE_PARM_DESC(spl_kmem_cache_kmem_threads,
+ "Number of spl_kmem_cache threads");
+/* END CSTYLED */
+
+/*
+ * Slab allocation interfaces
+ *
+ * While the Linux slab implementation was inspired by the Solaris
+ * implementation I cannot use it to emulate the Solaris APIs. I
+ * require two features which are not provided by the Linux slab.
+ *
+ * 1) Constructors AND destructors. Recent versions of the Linux
+ * kernel have removed support for destructors. This is a deal
+ * breaker for the SPL which contains particularly expensive
+ *      initializers for mutexes, condition variables, etc. We also
+ *      require a minimal level of cleanup for these data types unlike
+ *      many Linux data types which do not need to be explicitly destroyed.
+ *
+ * 2) Virtual address space backed slab. Callers of the Solaris slab
+ *      expect it to work well for both small and very large allocations.
+ * Because of memory fragmentation the Linux slab which is backed
+ * by kmalloc'ed memory performs very badly when confronted with
+ * large numbers of large allocations. Basing the slab on the
+ * virtual address space removes the need for contiguous pages
+ *      and greatly improves performance for large allocations.
+ *
+ * For these reasons, the SPL has its own slab implementation with
+ * the needed features. It is not as highly optimized as either the
+ * Solaris or Linux slabs, but it should get me most of what is
+ * needed until it can be optimized or obsoleted by another approach.
+ *
+ * One serious concern I do have about this method is the relatively
+ * small virtual address space on 32-bit arches. This will seriously
+ * constrain the size of the slab caches and their performance.
+ */
+
+struct list_head spl_kmem_cache_list; /* List of caches */
+struct rw_semaphore spl_kmem_cache_sem; /* Cache list lock */
+taskq_t *spl_kmem_cache_taskq; /* Task queue for ageing / reclaim */
+
+static void spl_cache_shrink(spl_kmem_cache_t *skc, void *obj);
+
+SPL_SHRINKER_CALLBACK_FWD_DECLARE(spl_kmem_cache_generic_shrinker);
+SPL_SHRINKER_DECLARE(spl_kmem_cache_shrinker,
+ spl_kmem_cache_generic_shrinker, KMC_DEFAULT_SEEKS);
+
+static void *
+kv_alloc(spl_kmem_cache_t *skc, int size, int flags)
+{
+ gfp_t lflags = kmem_flags_convert(flags);
+ void *ptr;
+
+ if (skc->skc_flags & KMC_KMEM) {
+ ASSERT(ISP2(size));
+ ptr = (void *)__get_free_pages(lflags, get_order(size));
+ } else {
+ ptr = __vmalloc(size, lflags | __GFP_HIGHMEM, PAGE_KERNEL);
+ }
+
+ /* Resulting allocated memory will be page aligned */
+ ASSERT(IS_P2ALIGNED(ptr, PAGE_SIZE));
+
+ return (ptr);
+}
+
+static void
+kv_free(spl_kmem_cache_t *skc, void *ptr, int size)
+{
+ ASSERT(IS_P2ALIGNED(ptr, PAGE_SIZE));
+
+ /*
+ * The Linux direct reclaim path uses this out of band value to
+ * determine if forward progress is being made. Normally this is
+ * incremented by kmem_freepages() which is part of the various
+ * Linux slab implementations. However, since we are using none
+ * of that infrastructure we are responsible for incrementing it.
+ */
+ if (current->reclaim_state)
+ current->reclaim_state->reclaimed_slab += size >> PAGE_SHIFT;
+
+ if (skc->skc_flags & KMC_KMEM) {
+ ASSERT(ISP2(size));
+ free_pages((unsigned long)ptr, get_order(size));
+ } else {
+ vfree(ptr);
+ }
+}
+
+/*
+ * Required space for each aligned sks.
+ */
+static inline uint32_t
+spl_sks_size(spl_kmem_cache_t *skc)
+{
+ return (P2ROUNDUP_TYPED(sizeof (spl_kmem_slab_t),
+ skc->skc_obj_align, uint32_t));
+}
+
+/*
+ * Required space for each aligned object.
+ */
+static inline uint32_t
+spl_obj_size(spl_kmem_cache_t *skc)
+{
+ uint32_t align = skc->skc_obj_align;
+
+ return (P2ROUNDUP_TYPED(skc->skc_obj_size, align, uint32_t) +
+ P2ROUNDUP_TYPED(sizeof (spl_kmem_obj_t), align, uint32_t));
+}
+
+/*
+ * Look up the spl_kmem_obj_t for an object, given a pointer to that object.
+ */
+static inline spl_kmem_obj_t *
+spl_sko_from_obj(spl_kmem_cache_t *skc, void *obj)
+{
+ return (obj + P2ROUNDUP_TYPED(skc->skc_obj_size,
+ skc->skc_obj_align, uint32_t));
+}
+
+/*
+ * Required space for each offslab object, taking into account alignment
+ * restrictions and the power-of-two requirement of kv_alloc().
+ */
+static inline uint32_t
+spl_offslab_size(spl_kmem_cache_t *skc)
+{
+ return (1UL << (fls64(spl_obj_size(skc)) + 1));
+}
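A worked instance of the rounding above (numbers are illustrative): for an
aligned object size of 2112 bytes, fls64(2112) is 12, so each offslab object
is backed by a 1 << 13 = 8192 byte allocation, satisfying the power-of-two
requirement kv_alloc() imposes on kmem-backed caches.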
+
+/*
+ * It's important that we pack the spl_kmem_obj_t structure and the
+ * actual objects into one large address space to minimize the number
+ * of calls to the allocator. It is far better to do a few large
+ * allocations and then subdivide them ourselves. Which allocator
+ * we use requires balancing a few trade-offs.
+ *
+ * For small objects we use kmem_alloc() because as long as you are
+ * only requesting a small number of pages (ideally just one) it's cheap.
+ * However, when you start requesting multiple pages with kmem_alloc()
+ * it gets increasingly expensive since it requires contiguous pages.
+ * For this reason we shift to vmem_alloc() for slabs of large objects
+ * which removes the need for contiguous pages. We do not use
+ * vmem_alloc() in all cases because there is significant locking
+ * overhead in __get_vm_area_node(). This function takes a single
+ * global lock when acquiring an available virtual address range which
+ * serializes all vmem_alloc()'s for all slab caches. Using slightly
+ * different allocation functions for small and large objects should
+ * give us the best of both worlds.
+ *
+ * KMC_ONSLAB KMC_OFFSLAB
+ *
+ * +------------------------+ +-----------------+
+ * | spl_kmem_slab_t --+-+ | | spl_kmem_slab_t |---+-+
+ * | skc_obj_size <-+ | | +-----------------+ | |
+ * | spl_kmem_obj_t | | | |
+ * | skc_obj_size <---+ | +-----------------+ | |
+ * | spl_kmem_obj_t | | | skc_obj_size | <-+ |
+ * | ... v | | spl_kmem_obj_t | |
+ * +------------------------+ +-----------------+ v
+ */
+static spl_kmem_slab_t *
+spl_slab_alloc(spl_kmem_cache_t *skc, int flags)
+{
+ spl_kmem_slab_t *sks;
+ spl_kmem_obj_t *sko, *n;
+ void *base, *obj;
+ uint32_t obj_size, offslab_size = 0;
+ int i, rc = 0;
+
+ base = kv_alloc(skc, skc->skc_slab_size, flags);
+ if (base == NULL)
+ return (NULL);
+
+ sks = (spl_kmem_slab_t *)base;
+ sks->sks_magic = SKS_MAGIC;
+ sks->sks_objs = skc->skc_slab_objs;
+ sks->sks_age = jiffies;
+ sks->sks_cache = skc;
+ INIT_LIST_HEAD(&sks->sks_list);
+ INIT_LIST_HEAD(&sks->sks_free_list);
+ sks->sks_ref = 0;
+ obj_size = spl_obj_size(skc);
+
+ if (skc->skc_flags & KMC_OFFSLAB)
+ offslab_size = spl_offslab_size(skc);
+
+ for (i = 0; i < sks->sks_objs; i++) {
+ if (skc->skc_flags & KMC_OFFSLAB) {
+ obj = kv_alloc(skc, offslab_size, flags);
+ if (!obj) {
+ rc = -ENOMEM;
+ goto out;
+ }
+ } else {
+ obj = base + spl_sks_size(skc) + (i * obj_size);
+ }
+
+ ASSERT(IS_P2ALIGNED(obj, skc->skc_obj_align));
+ sko = spl_sko_from_obj(skc, obj);
+ sko->sko_addr = obj;
+ sko->sko_magic = SKO_MAGIC;
+ sko->sko_slab = sks;
+ INIT_LIST_HEAD(&sko->sko_list);
+ list_add_tail(&sko->sko_list, &sks->sks_free_list);
+ }
+
+out:
+ if (rc) {
+ if (skc->skc_flags & KMC_OFFSLAB) {
+ list_for_each_entry_safe(sko, n,
+ &sks->sks_free_list, sko_list) {
+ kv_free(skc, sko->sko_addr, offslab_size);
+ }
+ }
+
+ kv_free(skc, base, skc->skc_slab_size);
+ sks = NULL;
+ }
+
+ return (sks);
+}
+
+/*
+ * Remove a slab from the complete or partial list. This must be called
+ * with the 'skc->skc_lock' held, but the actual free must be performed
+ * outside the lock to prevent deadlocking on vmem addresses.
+ */
+static void
+spl_slab_free(spl_kmem_slab_t *sks,
+ struct list_head *sks_list, struct list_head *sko_list)
+{
+ spl_kmem_cache_t *skc;
+
+ ASSERT(sks->sks_magic == SKS_MAGIC);
+ ASSERT(sks->sks_ref == 0);
+
+ skc = sks->sks_cache;
+ ASSERT(skc->skc_magic == SKC_MAGIC);
+
+ /*
+ * Update slab/objects counters in the cache, then remove the
+ * slab from the skc->skc_partial_list. Finally add the slab
+ * and all its objects into the private work lists where the
+ * destructors will be called and the memory freed to the system.
+ */
+ skc->skc_obj_total -= sks->sks_objs;
+ skc->skc_slab_total--;
+ list_del(&sks->sks_list);
+ list_add(&sks->sks_list, sks_list);
+ list_splice_init(&sks->sks_free_list, sko_list);
+}
+
+/*
+ * Reclaim empty slabs at the end of the partial list.
+ */
+static void
+spl_slab_reclaim(spl_kmem_cache_t *skc)
+{
+ spl_kmem_slab_t *sks, *m;
+ spl_kmem_obj_t *sko, *n;
+ LIST_HEAD(sks_list);
+ LIST_HEAD(sko_list);
+ uint32_t size = 0;
+
+ /*
+ * Empty slabs and objects must be moved to a private list so they
+ * can be safely freed outside the spin lock. All empty slabs are
+ * at the end of skc->skc_partial_list, therefore once a non-empty
+ * slab is found we can stop scanning.
+ */
+ spin_lock(&skc->skc_lock);
+ list_for_each_entry_safe_reverse(sks, m,
+ &skc->skc_partial_list, sks_list) {
+
+ if (sks->sks_ref > 0)
+ break;
+
+ spl_slab_free(sks, &sks_list, &sko_list);
+ }
+ spin_unlock(&skc->skc_lock);
+
+ /*
+ * The following two loops ensure all the object destructors are
+ * run, any offslab objects are freed, and the slabs themselves
+ * are freed. This is all done outside the skc->skc_lock since
+ * this allows the destructor to sleep, and allows us to perform
+ * a conditional reschedule when freeing a large number of
+ * objects and slabs back to the system.
+ */
+ if (skc->skc_flags & KMC_OFFSLAB)
+ size = spl_offslab_size(skc);
+
+ list_for_each_entry_safe(sko, n, &sko_list, sko_list) {
+ ASSERT(sko->sko_magic == SKO_MAGIC);
+
+ if (skc->skc_flags & KMC_OFFSLAB)
+ kv_free(skc, sko->sko_addr, size);
+ }
+
+ list_for_each_entry_safe(sks, m, &sks_list, sks_list) {
+ ASSERT(sks->sks_magic == SKS_MAGIC);
+ kv_free(skc, sks, skc->skc_slab_size);
+ }
+}
+
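+/*
+ * Find the tracking node for an emergency object in the cache's
+ * red black tree. Entries are keyed by the object's address.
+ */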
+static spl_kmem_emergency_t *
+spl_emergency_search(struct rb_root *root, void *obj)
+{
+ struct rb_node *node = root->rb_node;
+ spl_kmem_emergency_t *ske;
+ unsigned long address = (unsigned long)obj;
+
+ while (node) {
+ ske = container_of(node, spl_kmem_emergency_t, ske_node);
+
+ if (address < ske->ske_obj)
+ node = node->rb_left;
+ else if (address > ske->ske_obj)
+ node = node->rb_right;
+ else
+ return (ske);
+ }
+
+ return (NULL);
+}
+
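+/*
+ * Insert a tracking node into the red black tree. Returns 1 on
+ * success, or 0 if a node with the same address already exists.
+ */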
+static int
+spl_emergency_insert(struct rb_root *root, spl_kmem_emergency_t *ske)
+{
+ struct rb_node **new = &(root->rb_node), *parent = NULL;
+ spl_kmem_emergency_t *ske_tmp;
+ unsigned long address = ske->ske_obj;
+
+ while (*new) {
+ ske_tmp = container_of(*new, spl_kmem_emergency_t, ske_node);
+
+ parent = *new;
+ if (address < ske_tmp->ske_obj)
+ new = &((*new)->rb_left);
+ else if (address > ske_tmp->ske_obj)
+ new = &((*new)->rb_right);
+ else
+ return (0);
+ }
+
+ rb_link_node(&ske->ske_node, parent, new);
+ rb_insert_color(&ske->ske_node, root);
+
+ return (1);
+}
+
+/*
+ * Allocate a single emergency object and track it in a red black tree.
+ */
+static int
+spl_emergency_alloc(spl_kmem_cache_t *skc, int flags, void **obj)
+{
+ gfp_t lflags = kmem_flags_convert(flags);
+ spl_kmem_emergency_t *ske;
+ int order = get_order(skc->skc_obj_size);
+ int empty;
+
+ /* Last chance: use a partial slab if one now exists */
+ spin_lock(&skc->skc_lock);
+ empty = list_empty(&skc->skc_partial_list);
+ spin_unlock(&skc->skc_lock);
+ if (!empty)
+ return (-EEXIST);
+
+ ske = kmalloc(sizeof (*ske), lflags);
+ if (ske == NULL)
+ return (-ENOMEM);
+
+ ske->ske_obj = __get_free_pages(lflags, order);
+ if (ske->ske_obj == 0) {
+ kfree(ske);
+ return (-ENOMEM);
+ }
+
+ spin_lock(&skc->skc_lock);
+ empty = spl_emergency_insert(&skc->skc_emergency_tree, ske);
+ if (likely(empty)) {
+ skc->skc_obj_total++;
+ skc->skc_obj_emergency++;
+ if (skc->skc_obj_emergency > skc->skc_obj_emergency_max)
+ skc->skc_obj_emergency_max = skc->skc_obj_emergency;
+ }
+ spin_unlock(&skc->skc_lock);
+
+ if (unlikely(!empty)) {
+ free_pages(ske->ske_obj, order);
+ kfree(ske);
+ return (-EINVAL);
+ }
+
+ *obj = (void *)ske->ske_obj;
+
+ return (0);
+}
+
+/*
+ * Locate the passed object in the red black tree and free it.
+ */
+static int
+spl_emergency_free(spl_kmem_cache_t *skc, void *obj)
+{
+ spl_kmem_emergency_t *ske;
+ int order = get_order(skc->skc_obj_size);
+
+ spin_lock(&skc->skc_lock);
+ ske = spl_emergency_search(&skc->skc_emergency_tree, obj);
+ if (ske) {
+ rb_erase(&ske->ske_node, &skc->skc_emergency_tree);
+ skc->skc_obj_emergency--;
+ skc->skc_obj_total--;
+ }
+ spin_unlock(&skc->skc_lock);
+
+ if (ske == NULL)
+ return (-ENOENT);
+
+ free_pages(ske->ske_obj, order);
+ kfree(ske);
+
+ return (0);
+}
+
+/*
+ * Release objects from the per-cpu magazine back to their slab. The flush
+ * argument contains the max number of entries to remove from the magazine.
+ */
+static void
+__spl_cache_flush(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flush)
+{
+ int i, count = MIN(flush, skm->skm_avail);
+
+ ASSERT(skc->skc_magic == SKC_MAGIC);
+ ASSERT(skm->skm_magic == SKM_MAGIC);
+
+ for (i = 0; i < count; i++)
+ spl_cache_shrink(skc, skm->skm_objs[i]);
+
+ skm->skm_avail -= count;
+ memmove(skm->skm_objs, &(skm->skm_objs[count]),
+ sizeof (void *) * skm->skm_avail);
+}
+
+static void
+spl_cache_flush(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flush)
+{
+ spin_lock(&skc->skc_lock);
+ __spl_cache_flush(skc, skm, flush);
+ spin_unlock(&skc->skc_lock);
+}
+
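+/*
+ * Periodic aging callback, executed on each CPU via on_each_cpu() from
+ * spl_cache_age(), which returns idle magazine objects to their slabs.
+ */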
+static void
+spl_magazine_age(void *data)
+{
+ spl_kmem_cache_t *skc = (spl_kmem_cache_t *)data;
+ spl_kmem_magazine_t *skm = skc->skc_mag[smp_processor_id()];
+
+ ASSERT(skm->skm_magic == SKM_MAGIC);
+ ASSERT(skm->skm_cpu == smp_processor_id());
+ ASSERT(irqs_disabled());
+
+ /* There are no available objects or they are too young to age out */
+ if ((skm->skm_avail == 0) ||
+ time_before(jiffies, skm->skm_age + skc->skc_delay * HZ))
+ return;
+
+ /*
+ * Because we're executing in interrupt context we may have
+ * interrupted the holder of this lock. To avoid a potential
+ * deadlock return if the lock is contended.
+ */
+ if (!spin_trylock(&skc->skc_lock))
+ return;
+
+ __spl_cache_flush(skc, skm, skm->skm_refill);
+ spin_unlock(&skc->skc_lock);
+}
+
+/*
+ * Called regularly to keep a downward pressure on the cache.
+ *
+ * Objects older than skc->skc_delay seconds in the per-cpu magazines will
+ * be returned to their slabs. This is done to prevent idle magazines from
+ * holding memory which could be better used elsewhere. The delay is
+ * present to prevent thrashing the magazine.
+ *
+ * The newly released objects may result in empty partial slabs. Those
+ * slabs should be released to the system. Otherwise moving the objects
+ * out of the magazines is just wasted work.
+ */
+static void
+spl_cache_age(void *data)
+{
+ spl_kmem_cache_t *skc = (spl_kmem_cache_t *)data;
+ taskqid_t id = 0;
+
+ ASSERT(skc->skc_magic == SKC_MAGIC);
+
+ /* Dynamically disabled at run time */
+ if (!(spl_kmem_cache_expire & KMC_EXPIRE_AGE))
+ return;
+
+ atomic_inc(&skc->skc_ref);
+
+ if (!(skc->skc_flags & KMC_NOMAGAZINE))
+ on_each_cpu(spl_magazine_age, skc, 1);
+
+ spl_slab_reclaim(skc);
+
+ while (!test_bit(KMC_BIT_DESTROY, &skc->skc_flags) && !id) {
+ id = taskq_dispatch_delay(
+ spl_kmem_cache_taskq, spl_cache_age, skc, TQ_SLEEP,
+ ddi_get_lbolt() + skc->skc_delay / 3 * HZ);
+
+ /* If destroy was issued after dispatch, immediately cancel it */
+ if (test_bit(KMC_BIT_DESTROY, &skc->skc_flags) && id)
+ taskq_cancel_id(spl_kmem_cache_taskq, id);
+ }
+
+ spin_lock(&skc->skc_lock);
+ skc->skc_taskqid = id;
+ spin_unlock(&skc->skc_lock);
+
+ atomic_dec(&skc->skc_ref);
+}
+
+/*
+ * Size a slab based on the size of each aligned object plus spl_kmem_obj_t.
+ * When on-slab we want to target spl_kmem_cache_obj_per_slab. However,
+ * for very small objects we may end up with more than this so as not
+ * to waste space in the minimal allocation of a single page. For
+ * very large objects we may use as few as spl_kmem_cache_obj_per_slab_min;
+ * any fewer than this and the sizing will fail.
+ */
+static int
+spl_slab_size(spl_kmem_cache_t *skc, uint32_t *objs, uint32_t *size)
+{
+ uint32_t sks_size, obj_size, max_size, tgt_size, tgt_objs;
+
+ if (skc->skc_flags & KMC_OFFSLAB) {
+ tgt_objs = spl_kmem_cache_obj_per_slab;
+ tgt_size = P2ROUNDUP(sizeof (spl_kmem_slab_t), PAGE_SIZE);
+
+ if ((skc->skc_flags & KMC_KMEM) &&
+ (spl_obj_size(skc) > (SPL_MAX_ORDER_NR_PAGES * PAGE_SIZE)))
+ return (-ENOSPC);
+ } else {
+ sks_size = spl_sks_size(skc);
+ obj_size = spl_obj_size(skc);
+ max_size = (spl_kmem_cache_max_size * 1024 * 1024);
+ tgt_size = (spl_kmem_cache_obj_per_slab * obj_size + sks_size);
+
+ /*
+ * KMC_KMEM slabs are allocated by __get_free_pages() which
+ * rounds up to the nearest order. Knowing this the size
+ * should be rounded up to the next power of two with a hard
+ * maximum defined by the maximum allowed allocation order.
+ */
+ if (skc->skc_flags & KMC_KMEM) {
+ max_size = SPL_MAX_ORDER_NR_PAGES * PAGE_SIZE;
+ tgt_size = MIN(max_size,
+ PAGE_SIZE * (1 << MAX(get_order(tgt_size) - 1, 1)));
+ }
+
+ if (tgt_size <= max_size) {
+ tgt_objs = (tgt_size - sks_size) / obj_size;
+ } else {
+ tgt_objs = (max_size - sks_size) / obj_size;
+ tgt_size = (tgt_objs * obj_size) + sks_size;
+ }
+ }
+
+ if (tgt_objs == 0)
+ return (-ENOSPC);
+
+ *objs = tgt_objs;
+ *size = tgt_size;
+
+ return (0);
+}
+
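+/*
+ * A worked example of the KMC_KMEM sizing above, using hypothetical
+ * values: assume 4KiB pages, spl_kmem_cache_obj_per_slab = 8,
+ * obj_size = 1024 and sks_size = 64. Then tgt_size = 8 * 1024 + 64 =
+ * 8256; get_order(8256) is 2, so the power-of-two rounding yields
+ * PAGE_SIZE * (1 << MAX(2 - 1, 1)) = 8192 bytes. The final count is
+ * tgt_objs = (8192 - 64) / 1024 = 7 objects per slab.
+ */
+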
+/*
+ * Make a guess at reasonable per-cpu magazine size based on the size of
+ * each object and the cost of caching N of them in each magazine. Long
+ * term this should really adapt based on an observed usage heuristic.
+ */
+static int
+spl_magazine_size(spl_kmem_cache_t *skc)
+{
+ uint32_t obj_size = spl_obj_size(skc);
+ int size;
+
+ if (spl_kmem_cache_magazine_size > 0)
+ return (MAX(MIN(spl_kmem_cache_magazine_size, 256), 2));
+
+ /* Per-magazine sizes below assume a 4KiB page size */
+ if (obj_size > (PAGE_SIZE * 256))
+ size = 4; /* Minimum 4MiB per-magazine */
+ else if (obj_size > (PAGE_SIZE * 32))
+ size = 16; /* Minimum 2MiB per-magazine */
+ else if (obj_size > (PAGE_SIZE))
+ size = 64; /* Minimum 256KiB per-magazine */
+ else if (obj_size > (PAGE_SIZE / 4))
+ size = 128; /* Minimum 128KiB per-magazine */
+ else
+ size = 256;
+
+ return (size);
+}
+
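+/*
+ * For example, assuming 4KiB pages: a 512 byte object falls through to
+ * the final case and gets a 256 entry magazine, while a 64KiB object
+ * matches the obj_size > PAGE_SIZE test and gets only 64 entries per
+ * magazine. These are illustrative values, not additional tunables.
+ */
+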
+/*
+ * Allocate a per-cpu magazine to associate with a specific core.
+ */
+static spl_kmem_magazine_t *
+spl_magazine_alloc(spl_kmem_cache_t *skc, int cpu)
+{
+ spl_kmem_magazine_t *skm;
+ int size = sizeof (spl_kmem_magazine_t) +
+ sizeof (void *) * skc->skc_mag_size;
+
+ skm = kmalloc_node(size, GFP_KERNEL, cpu_to_node(cpu));
+ if (skm) {
+ skm->skm_magic = SKM_MAGIC;
+ skm->skm_avail = 0;
+ skm->skm_size = skc->skc_mag_size;
+ skm->skm_refill = skc->skc_mag_refill;
+ skm->skm_cache = skc;
+ skm->skm_age = jiffies;
+ skm->skm_cpu = cpu;
+ }
+
+ return (skm);
+}
+
+/*
+ * Free a per-cpu magazine associated with a specific core.
+ */
+static void
+spl_magazine_free(spl_kmem_magazine_t *skm)
+{
+ ASSERT(skm->skm_magic == SKM_MAGIC);
+ ASSERT(skm->skm_avail == 0);
+ kfree(skm);
+}
+
+/*
+ * Create all per-cpu magazines of reasonable sizes.
+ */
+static int
+spl_magazine_create(spl_kmem_cache_t *skc)
+{
+ int i;
+
+ if (skc->skc_flags & KMC_NOMAGAZINE)
+ return (0);
+
+ skc->skc_mag = kzalloc(sizeof (spl_kmem_magazine_t *) *
+ num_possible_cpus(), kmem_flags_convert(KM_SLEEP));
+ if (skc->skc_mag == NULL)
+ return (-ENOMEM);
+
+ skc->skc_mag_size = spl_magazine_size(skc);
+ skc->skc_mag_refill = (skc->skc_mag_size + 1) / 2;
+
+ for_each_possible_cpu(i) {
+ skc->skc_mag[i] = spl_magazine_alloc(skc, i);
+ if (!skc->skc_mag[i]) {
+ for (i--; i >= 0; i--)
+ spl_magazine_free(skc->skc_mag[i]);
+
+ kfree(skc->skc_mag);
+ return (-ENOMEM);
+ }
+ }
+
+ return (0);
+}
+
+/*
+ * Destroy all per-cpu magazines.
+ */
+static void
+spl_magazine_destroy(spl_kmem_cache_t *skc)
+{
+ spl_kmem_magazine_t *skm;
+ int i;
+
+ if (skc->skc_flags & KMC_NOMAGAZINE)
+ return;
+
+ for_each_possible_cpu(i) {
+ skm = skc->skc_mag[i];
+ spl_cache_flush(skc, skm, skm->skm_avail);
+ spl_magazine_free(skm);
+ }
+
+ kfree(skc->skc_mag);
+}
+
+/*
+ * Create an object cache based on the following arguments:
+ * name cache name
+ * size cache object size
+ * align cache object alignment
+ * ctor cache object constructor
+ * dtor cache object destructor
+ * reclaim cache object reclaim
+ * priv cache private data for ctor/dtor/reclaim
+ * vmp unused, must be NULL
+ * flags
+ * KMC_NOTOUCH Disable cache object aging (unsupported)
+ * KMC_NODEBUG Disable debugging (unsupported)
+ * KMC_NOHASH Disable hashing (unsupported)
+ * KMC_QCACHE Disable qcache (unsupported)
+ * KMC_NOMAGAZINE Disable per-cpu magazines (set automatically for Linux slab)
+ * KMC_KMEM Force kmem backed cache
+ * KMC_VMEM Force vmem backed cache
+ * KMC_SLAB Force Linux slab backed cache
+ * KMC_OFFSLAB Locate objects off the slab
+ */
+spl_kmem_cache_t *
+spl_kmem_cache_create(char *name, size_t size, size_t align,
+ spl_kmem_ctor_t ctor, spl_kmem_dtor_t dtor, spl_kmem_reclaim_t reclaim,
+ void *priv, void *vmp, int flags)
+{
+ gfp_t lflags = kmem_flags_convert(KM_SLEEP);
+ spl_kmem_cache_t *skc;
+ int rc;
+
+ /*
+ * Unsupported flags
+ */
+ ASSERT0(flags & KMC_NOMAGAZINE);
+ ASSERT0(flags & KMC_NOHASH);
+ ASSERT0(flags & KMC_QCACHE);
+ ASSERT(vmp == NULL);
+
+ might_sleep();
+
+ skc = kzalloc(sizeof (*skc), lflags);
+ if (skc == NULL)
+ return (NULL);
+
+ skc->skc_magic = SKC_MAGIC;
+ skc->skc_name_size = strlen(name) + 1;
+ skc->skc_name = (char *)kmalloc(skc->skc_name_size, lflags);
+ if (skc->skc_name == NULL) {
+ kfree(skc);
+ return (NULL);
+ }
+ strncpy(skc->skc_name, name, skc->skc_name_size);
+
+ skc->skc_ctor = ctor;
+ skc->skc_dtor = dtor;
+ skc->skc_reclaim = reclaim;
+ skc->skc_private = priv;
+ skc->skc_vmp = vmp;
+ skc->skc_linux_cache = NULL;
+ skc->skc_flags = flags;
+ skc->skc_obj_size = size;
+ skc->skc_obj_align = SPL_KMEM_CACHE_ALIGN;
+ skc->skc_delay = SPL_KMEM_CACHE_DELAY;
+ skc->skc_reap = SPL_KMEM_CACHE_REAP;
+ atomic_set(&skc->skc_ref, 0);
+
+ INIT_LIST_HEAD(&skc->skc_list);
+ INIT_LIST_HEAD(&skc->skc_complete_list);
+ INIT_LIST_HEAD(&skc->skc_partial_list);
+ skc->skc_emergency_tree = RB_ROOT;
+ spin_lock_init(&skc->skc_lock);
+ init_waitqueue_head(&skc->skc_waitq);
+ skc->skc_slab_fail = 0;
+ skc->skc_slab_create = 0;
+ skc->skc_slab_destroy = 0;
+ skc->skc_slab_total = 0;
+ skc->skc_slab_alloc = 0;
+ skc->skc_slab_max = 0;
+ skc->skc_obj_total = 0;
+ skc->skc_obj_alloc = 0;
+ skc->skc_obj_max = 0;
+ skc->skc_obj_deadlock = 0;
+ skc->skc_obj_emergency = 0;
+ skc->skc_obj_emergency_max = 0;
+
+ /*
+ * Verify the requested alignment restriction is sane.
+ */
+ if (align) {
+ VERIFY(ISP2(align));
+ VERIFY3U(align, >=, SPL_KMEM_CACHE_ALIGN);
+ VERIFY3U(align, <=, PAGE_SIZE);
+ skc->skc_obj_align = align;
+ }
+
+ /*
+ * When no specific type of slab is requested (kmem, vmem, or
+ * linuxslab) then select a cache type based on the object size
+ * and default tunables.
+ */
+ if (!(skc->skc_flags & (KMC_KMEM | KMC_VMEM | KMC_SLAB))) {
+
+ /*
+ * Objects smaller than spl_kmem_cache_slab_limit can
+ * use the Linux slab for better space-efficiency. By
+ * default this functionality is disabled until its
+ * performance characteristics are fully understood.
+ */
+ if (spl_kmem_cache_slab_limit &&
+ size <= (size_t)spl_kmem_cache_slab_limit)
+ skc->skc_flags |= KMC_SLAB;
+
+ /*
+ * Small objects, less than spl_kmem_cache_kmem_limit per
+ * object should use kmem because their slabs are small.
+ */
+ else if (spl_obj_size(skc) <= spl_kmem_cache_kmem_limit)
+ skc->skc_flags |= KMC_KMEM;
+
+ /*
+ * All other objects are considered large and are placed
+ * on vmem backed slabs.
+ */
+ else
+ skc->skc_flags |= KMC_VMEM;
+ }
+
+ /*
+ * Given the type of slab allocate the required resources.
+ */
+ if (skc->skc_flags & (KMC_KMEM | KMC_VMEM)) {
+ rc = spl_slab_size(skc,
+ &skc->skc_slab_objs, &skc->skc_slab_size);
+ if (rc)
+ goto out;
+
+ rc = spl_magazine_create(skc);
+ if (rc)
+ goto out;
+ } else {
+ unsigned long slabflags = 0;
+
+ if (size > (SPL_MAX_KMEM_ORDER_NR_PAGES * PAGE_SIZE)) {
+ rc = EINVAL;
+ goto out;
+ }
+
+#if defined(SLAB_USERCOPY)
+ /*
+ * Required for PAX-enabled kernels if the slab is to be
+ * used for copying between user and kernel space.
+ */
+ slabflags |= SLAB_USERCOPY;
+#endif
+
+#if defined(HAVE_KMEM_CACHE_CREATE_USERCOPY)
+ /*
+ * Newer grsec patchset uses kmem_cache_create_usercopy()
+ * instead of SLAB_USERCOPY flag
+ */
+ skc->skc_linux_cache = kmem_cache_create_usercopy(
+ skc->skc_name, size, align, slabflags, 0, size, NULL);
+#else
+ skc->skc_linux_cache = kmem_cache_create(
+ skc->skc_name, size, align, slabflags, NULL);
+#endif
+ if (skc->skc_linux_cache == NULL) {
+ rc = ENOMEM;
+ goto out;
+ }
+
+#if defined(HAVE_KMEM_CACHE_ALLOCFLAGS)
+ skc->skc_linux_cache->allocflags |= __GFP_COMP;
+#elif defined(HAVE_KMEM_CACHE_GFPFLAGS)
+ skc->skc_linux_cache->gfpflags |= __GFP_COMP;
+#endif
+ skc->skc_flags |= KMC_NOMAGAZINE;
+ }
+
+ if (spl_kmem_cache_expire & KMC_EXPIRE_AGE)
+ skc->skc_taskqid = taskq_dispatch_delay(spl_kmem_cache_taskq,
+ spl_cache_age, skc, TQ_SLEEP,
+ ddi_get_lbolt() + skc->skc_delay / 3 * HZ);
+
+ down_write(&spl_kmem_cache_sem);
+ list_add_tail(&skc->skc_list, &spl_kmem_cache_list);
+ up_write(&spl_kmem_cache_sem);
+
+ return (skc);
+out:
+ kfree(skc->skc_name);
+ kfree(skc);
+ return (NULL);
+}
+EXPORT_SYMBOL(spl_kmem_cache_create);
+
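+/*
+ * An illustrative usage sketch; the cache name and object type below
+ * are hypothetical and not part of this file:
+ *
+ * spl_kmem_cache_t *skc = spl_kmem_cache_create("example_cache",
+ * sizeof (example_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
+ * example_t *ep = spl_kmem_cache_alloc(skc, KM_SLEEP);
+ * ...
+ * spl_kmem_cache_free(skc, ep);
+ * spl_kmem_cache_destroy(skc);
+ */
+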
+/*
+ * Register a move callback for cache defragmentation.
+ * XXX: Unimplemented but harmless to stub out for now.
+ */
+void
+spl_kmem_cache_set_move(spl_kmem_cache_t *skc,
+ kmem_cbrc_t (move)(void *, void *, size_t, void *))
+{
+ ASSERT(move != NULL);
+}
+EXPORT_SYMBOL(spl_kmem_cache_set_move);
+
+/*
+ * Destroy a cache and all objects associated with the cache.
+ */
+void
+spl_kmem_cache_destroy(spl_kmem_cache_t *skc)
+{
+ DECLARE_WAIT_QUEUE_HEAD(wq);
+ taskqid_t id;
+
+ ASSERT(skc->skc_magic == SKC_MAGIC);
+ ASSERT(skc->skc_flags & (KMC_KMEM | KMC_VMEM | KMC_SLAB));
+
+ down_write(&spl_kmem_cache_sem);
+ list_del_init(&skc->skc_list);
+ up_write(&spl_kmem_cache_sem);
+
+ /* Cancel and wait for any pending delayed tasks */
+ VERIFY(!test_and_set_bit(KMC_BIT_DESTROY, &skc->skc_flags));
+
+ spin_lock(&skc->skc_lock);
+ id = skc->skc_taskqid;
+ spin_unlock(&skc->skc_lock);
+
+ taskq_cancel_id(spl_kmem_cache_taskq, id);
+
+ /*
+ * Wait until all current callers complete, this is mainly
+ * to catch the case where a low memory situation triggers a
+ * cache reaping action which races with this destroy.
+ */
+ wait_event(wq, atomic_read(&skc->skc_ref) == 0);
+
+ if (skc->skc_flags & (KMC_KMEM | KMC_VMEM)) {
+ spl_magazine_destroy(skc);
+ spl_slab_reclaim(skc);
+ } else {
+ ASSERT(skc->skc_flags & KMC_SLAB);
+ kmem_cache_destroy(skc->skc_linux_cache);
+ }
+
+ spin_lock(&skc->skc_lock);
+
+ /*
+ * Validate there are no objects in use and free all the
+ * spl_kmem_slab_t, spl_kmem_obj_t, and object buffers.
+ */
+ ASSERT3U(skc->skc_slab_alloc, ==, 0);
+ ASSERT3U(skc->skc_obj_alloc, ==, 0);
+ ASSERT3U(skc->skc_slab_total, ==, 0);
+ ASSERT3U(skc->skc_obj_total, ==, 0);
+ ASSERT3U(skc->skc_obj_emergency, ==, 0);
+ ASSERT(list_empty(&skc->skc_complete_list));
+
+ spin_unlock(&skc->skc_lock);
+
+ kfree(skc->skc_name);
+ kfree(skc);
+}
+EXPORT_SYMBOL(spl_kmem_cache_destroy);
+
+/*
+ * Allocate an object from a slab attached to the cache. This is used to
+ * repopulate the per-cpu magazine caches in batches when they run low.
+ */
+static void *
+spl_cache_obj(spl_kmem_cache_t *skc, spl_kmem_slab_t *sks)
+{
+ spl_kmem_obj_t *sko;
+
+ ASSERT(skc->skc_magic == SKC_MAGIC);
+ ASSERT(sks->sks_magic == SKS_MAGIC);
+
+ sko = list_entry(sks->sks_free_list.next, spl_kmem_obj_t, sko_list);
+ ASSERT(sko->sko_magic == SKO_MAGIC);
+ ASSERT(sko->sko_addr != NULL);
+
+ /* Remove from sks_free_list */
+ list_del_init(&sko->sko_list);
+
+ sks->sks_age = jiffies;
+ sks->sks_ref++;
+ skc->skc_obj_alloc++;
+
+ /* Track max obj usage statistics */
+ if (skc->skc_obj_alloc > skc->skc_obj_max)
+ skc->skc_obj_max = skc->skc_obj_alloc;
+
+ /* Track max slab usage statistics */
+ if (sks->sks_ref == 1) {
+ skc->skc_slab_alloc++;
+
+ if (skc->skc_slab_alloc > skc->skc_slab_max)
+ skc->skc_slab_max = skc->skc_slab_alloc;
+ }
+
+ return (sko->sko_addr);
+}
+
+/*
+ * Generic slab allocation function to be run by the global work queues.
+ * It is responsible for allocating a new slab, linking it into the list
+ * of partial slabs, and then waking any waiters.
+ */
+static int
+__spl_cache_grow(spl_kmem_cache_t *skc, int flags)
+{
+ spl_kmem_slab_t *sks;
+
+ fstrans_cookie_t cookie = spl_fstrans_mark();
+ sks = spl_slab_alloc(skc, flags);
+ spl_fstrans_unmark(cookie);
+
+ spin_lock(&skc->skc_lock);
+ if (sks) {
+ skc->skc_slab_total++;
+ skc->skc_obj_total += sks->sks_objs;
+ list_add_tail(&sks->sks_list, &skc->skc_partial_list);
+
+ smp_mb__before_atomic();
+ clear_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags);
+ smp_mb__after_atomic();
+ wake_up_all(&skc->skc_waitq);
+ }
+ spin_unlock(&skc->skc_lock);
+
+ return (sks == NULL ? -ENOMEM : 0);
+}
+
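+/*
+ * Taskq callback which performs the deferred slab allocation requested
+ * by spl_cache_grow() and then clears the KMC_BIT_GROWING bit.
+ */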
+static void
+spl_cache_grow_work(void *data)
+{
+ spl_kmem_alloc_t *ska = (spl_kmem_alloc_t *)data;
+ spl_kmem_cache_t *skc = ska->ska_cache;
+
+ (void) __spl_cache_grow(skc, ska->ska_flags);
+
+ atomic_dec(&skc->skc_ref);
+ smp_mb__before_atomic();
+ clear_bit(KMC_BIT_GROWING, &skc->skc_flags);
+ smp_mb__after_atomic();
+
+ kfree(ska);
+}
+
+/*
+ * Returns non-zero when a new slab should be available.
+ */
+static int
+spl_cache_grow_wait(spl_kmem_cache_t *skc)
+{
+ return (!test_bit(KMC_BIT_GROWING, &skc->skc_flags));
+}
+
+/*
+ * No available objects on any slabs, create a new slab. Note that this
+ * functionality is disabled for KMC_SLAB caches which are backed by the
+ * Linux slab.
+ */
+static int
+spl_cache_grow(spl_kmem_cache_t *skc, int flags, void **obj)
+{
+ int remaining, rc = 0;
+
+ ASSERT0(flags & ~KM_PUBLIC_MASK);
+ ASSERT(skc->skc_magic == SKC_MAGIC);
+ ASSERT((skc->skc_flags & KMC_SLAB) == 0);
+ might_sleep();
+ *obj = NULL;
+
+ /*
+ * Before allocating a new slab wait for any reaping to complete and
+ * then return so the local magazine can be rechecked for new objects.
+ */
+ if (test_bit(KMC_BIT_REAPING, &skc->skc_flags)) {
+ rc = spl_wait_on_bit(&skc->skc_flags, KMC_BIT_REAPING,
+ TASK_UNINTERRUPTIBLE);
+ return (rc ? rc : -EAGAIN);
+ }
+
+ /*
+ * To reduce the overhead of context switching and improve NUMA locality,
+ * we try to allocate a new slab in the current process context with the
+ * KM_NOSLEEP flag. If that fails, the allocation is dispatched to the
+ * global taskq instead.
+ *
+ * However, this can't be applied to KMC_VMEM caches due to a bug where
+ * __vmalloc() doesn't honor gfp flags in page table allocation.
+ */
+ if (!(skc->skc_flags & KMC_VMEM)) {
+ rc = __spl_cache_grow(skc, flags | KM_NOSLEEP);
+ if (rc == 0)
+ return (0);
+ }
+
+ /*
+ * This is handled by dispatching a work request to the global work
+ * queue. This allows us to asynchronously allocate a new slab while
+ * retaining the ability to safely fall back to smaller synchronous
+ * allocations so that forward progress is always maintained.
+ */
+ if (test_and_set_bit(KMC_BIT_GROWING, &skc->skc_flags) == 0) {
+ spl_kmem_alloc_t *ska;
+
+ ska = kmalloc(sizeof (*ska), kmem_flags_convert(flags));
+ if (ska == NULL) {
+ clear_bit_unlock(KMC_BIT_GROWING, &skc->skc_flags);
+ smp_mb__after_atomic();
+ wake_up_all(&skc->skc_waitq);
+ return (-ENOMEM);
+ }
+
+ atomic_inc(&skc->skc_ref);
+ ska->ska_cache = skc;
+ ska->ska_flags = flags;
+ taskq_init_ent(&ska->ska_tqe);
+ taskq_dispatch_ent(spl_kmem_cache_taskq,
+ spl_cache_grow_work, ska, 0, &ska->ska_tqe);
+ }
+
+ /*
+ * The goal here is to only detect the rare case where a virtual slab
+ * allocation has deadlocked. We must be careful to minimize the use
+ * of emergency objects which are more expensive to track. Therefore,
+ * we set a very long timeout for the asynchronous allocation and if
+ * the timeout is reached the cache is flagged as deadlocked. From
+ * this point only new emergency objects will be allocated until the
+ * asynchronous allocation completes and clears the deadlocked flag.
+ */
+ if (test_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags)) {
+ rc = spl_emergency_alloc(skc, flags, obj);
+ } else {
+ remaining = wait_event_timeout(skc->skc_waitq,
+ spl_cache_grow_wait(skc), HZ / 10);
+
+ if (!remaining) {
+ spin_lock(&skc->skc_lock);
+ if (test_bit(KMC_BIT_GROWING, &skc->skc_flags)) {
+ set_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags);
+ skc->skc_obj_deadlock++;
+ }
+ spin_unlock(&skc->skc_lock);
+ }
+
+ rc = -ENOMEM;
+ }
+
+ return (rc);
+}
+
+/*
+ * Refill a per-cpu magazine with objects from the slabs for this cache.
+ * Ideally the magazine can be repopulated using existing objects which have
+ * been released, however if we are unable to locate enough free objects new
+ * slabs of objects will be created. On success NULL is returned, otherwise
+ * the address of a single emergency object is returned for use by the caller.
+ */
+static void *
+spl_cache_refill(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flags)
+{
+ spl_kmem_slab_t *sks;
+ int count = 0, rc, refill;
+ void *obj = NULL;
+
+ ASSERT(skc->skc_magic == SKC_MAGIC);
+ ASSERT(skm->skm_magic == SKM_MAGIC);
+
+ refill = MIN(skm->skm_refill, skm->skm_size - skm->skm_avail);
+ spin_lock(&skc->skc_lock);
+
+ while (refill > 0) {
+ /* No slabs available we may need to grow the cache */
+ if (list_empty(&skc->skc_partial_list)) {
+ spin_unlock(&skc->skc_lock);
+
+ local_irq_enable();
+ rc = spl_cache_grow(skc, flags, &obj);
+ local_irq_disable();
+
+ /* Emergency object for immediate use by caller */
+ if (rc == 0 && obj != NULL)
+ return (obj);
+
+ if (rc)
+ goto out;
+
+ /* Rescheduled to a different CPU; skm is not local */
+ if (skm != skc->skc_mag[smp_processor_id()])
+ goto out;
+
+ /*
+ * Potentially rescheduled to the same CPU but
+ * allocations may have occurred from this CPU while
+ * we were sleeping so recalculate max refill.
+ */
+ refill = MIN(refill, skm->skm_size - skm->skm_avail);
+
+ spin_lock(&skc->skc_lock);
+ continue;
+ }
+
+ /* Grab the next available slab */
+ sks = list_entry((&skc->skc_partial_list)->next,
+ spl_kmem_slab_t, sks_list);
+ ASSERT(sks->sks_magic == SKS_MAGIC);
+ ASSERT(sks->sks_ref < sks->sks_objs);
+ ASSERT(!list_empty(&sks->sks_free_list));
+
+ /*
+ * Consume as many objects as needed to refill the requested
+ * magazine. We must also be careful not to overfill it.
+ */
+ while (sks->sks_ref < sks->sks_objs && refill-- > 0 &&
+ ++count) {
+ ASSERT(skm->skm_avail < skm->skm_size);
+ ASSERT(count < skm->skm_size);
+ skm->skm_objs[skm->skm_avail++] =
+ spl_cache_obj(skc, sks);
+ }
+
+ /* Move slab to skc_complete_list when full */
+ if (sks->sks_ref == sks->sks_objs) {
+ list_del(&sks->sks_list);
+ list_add(&sks->sks_list, &skc->skc_complete_list);
+ }
+ }
+
+ spin_unlock(&skc->skc_lock);
+out:
+ return (NULL);
+}
+
+/*
+ * Release an object back to the slab from which it came.
+ */
+static void
+spl_cache_shrink(spl_kmem_cache_t *skc, void *obj)
+{
+ spl_kmem_slab_t *sks = NULL;
+ spl_kmem_obj_t *sko = NULL;
+
+ ASSERT(skc->skc_magic == SKC_MAGIC);
+
+ sko = spl_sko_from_obj(skc, obj);
+ ASSERT(sko->sko_magic == SKO_MAGIC);
+ sks = sko->sko_slab;
+ ASSERT(sks->sks_magic == SKS_MAGIC);
+ ASSERT(sks->sks_cache == skc);
+ list_add(&sko->sko_list, &sks->sks_free_list);
+
+ sks->sks_age = jiffies;
+ sks->sks_ref--;
+ skc->skc_obj_alloc--;
+
+ /*
+ * Move slab to skc_partial_list when no longer full. Slabs
+ * are added to the head to keep the partial list in quasi-full
+ * sorted order. Fuller at the head, emptier at the tail.
+ */
+ if (sks->sks_ref == (sks->sks_objs - 1)) {
+ list_del(&sks->sks_list);
+ list_add(&sks->sks_list, &skc->skc_partial_list);
+ }
+
+ /*
+ * Move empty slabs to the end of the partial list so
+ * they can be easily found and freed during reclamation.
+ */
+ if (sks->sks_ref == 0) {
+ list_del(&sks->sks_list);
+ list_add_tail(&sks->sks_list, &skc->skc_partial_list);
+ skc->skc_slab_alloc--;
+ }
+}
+
+/*
+ * Allocate an object from the per-cpu magazine, or if the magazine
+ * is empty directly allocate from a slab and repopulate the magazine.
+ */
+void *
+spl_kmem_cache_alloc(spl_kmem_cache_t *skc, int flags)
+{
+ spl_kmem_magazine_t *skm;
+ void *obj = NULL;
+
+ ASSERT0(flags & ~KM_PUBLIC_MASK);
+ ASSERT(skc->skc_magic == SKC_MAGIC);
+ ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags));
+
+ /*
+ * Allocate directly from a Linux slab. All optimizations are left
+ * to the underlying cache; we only need to guarantee that KM_SLEEP
+ * callers will never fail.
+ */
+ if (skc->skc_flags & KMC_SLAB) {
+ struct kmem_cache *slc = skc->skc_linux_cache;
+ do {
+ obj = kmem_cache_alloc(slc, kmem_flags_convert(flags));
+ } while ((obj == NULL) && !(flags & KM_NOSLEEP));
+
+ goto ret;
+ }
+
+ local_irq_disable();
+
+restart:
+ /*
+ * Safe to update per-cpu structure without lock, but
+ * in the restart case we must be careful to reacquire
+ * the local magazine since this may have changed
+ * when we need to grow the cache.
+ */
+ skm = skc->skc_mag[smp_processor_id()];
+ ASSERT(skm->skm_magic == SKM_MAGIC);
+
+ if (likely(skm->skm_avail)) {
+ /* Object available in CPU cache, use it */
+ obj = skm->skm_objs[--skm->skm_avail];
+ skm->skm_age = jiffies;
+ } else {
+ obj = spl_cache_refill(skc, skm, flags);
+ if ((obj == NULL) && !(flags & KM_NOSLEEP))
+ goto restart;
+
+ local_irq_enable();
+ goto ret;
+ }
+
+ local_irq_enable();
+ ASSERT(obj);
+ ASSERT(IS_P2ALIGNED(obj, skc->skc_obj_align));
+
+ret:
+ /* Pre-emptively migrate object to CPU L1 cache */
+ if (obj) {
+ if (skc->skc_ctor)
+ skc->skc_ctor(obj, skc->skc_private, flags);
+ else
+ prefetchw(obj);
+ }
+
+ return (obj);
+}
+EXPORT_SYMBOL(spl_kmem_cache_alloc);
+
+/*
+ * Free an object back to the local per-cpu magazine; there is no
+ * guarantee that this is the same magazine the object was originally
+ * allocated from. We may need to flush entries from the magazine
+ * back to the slabs to make space.
+ */
+void
+spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj)
+{
+ spl_kmem_magazine_t *skm;
+ unsigned long flags;
+ int do_reclaim = 0;
+ int do_emergency = 0;
+
+ ASSERT(skc->skc_magic == SKC_MAGIC);
+ ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags));
+
+ /*
+ * Run the destructor
+ */
+ if (skc->skc_dtor)
+ skc->skc_dtor(obj, skc->skc_private);
+
+ /*
+ * Free the object from the underlying Linux slab.
+ */
+ if (skc->skc_flags & KMC_SLAB) {
+ kmem_cache_free(skc->skc_linux_cache, obj);
+ return;
+ }
+
+ /*
+ * While a cache has outstanding emergency objects all freed objects
+ * must be checked. However, since emergency objects will never use
+ * a virtual address these objects can be safely excluded as an
+ * optimization.
+ */
+ if (!is_vmalloc_addr(obj)) {
+ spin_lock(&skc->skc_lock);
+ do_emergency = (skc->skc_obj_emergency > 0);
+ spin_unlock(&skc->skc_lock);
+
+ if (do_emergency && (spl_emergency_free(skc, obj) == 0))
+ return;
+ }
+
+ local_irq_save(flags);
+
+ /*
+ * Safe to update the per-cpu structure without a lock, but since
+ * no remote memory allocation tracking is being performed,
+ * it is entirely possible to allocate an object from one
+ * CPU cache and return it to another.
+ */
+ skm = skc->skc_mag[smp_processor_id()];
+ ASSERT(skm->skm_magic == SKM_MAGIC);
+
+ /*
+ * Per-CPU cache full, flush it to make space for this object,
+ * this may result in an empty slab which can be reclaimed once
+ * interrupts are re-enabled.
+ */
+ if (unlikely(skm->skm_avail >= skm->skm_size)) {
+ spl_cache_flush(skc, skm, skm->skm_refill);
+ do_reclaim = 1;
+ }
+
+ /* Available space in cache, use it */
+ skm->skm_objs[skm->skm_avail++] = obj;
+
+ local_irq_restore(flags);
+
+ if (do_reclaim)
+ spl_slab_reclaim(skc);
+}
+EXPORT_SYMBOL(spl_kmem_cache_free);
+
+/*
+ * The generic shrinker function for all caches. Under Linux a shrinker
+ * may not be tightly coupled with a slab cache. In fact Linux
+ * systematically tries calling all registered shrinker callbacks which
+ * report that they contain unused objects. Because of this we only
+ * register one shrinker function in the shim layer for all slab caches.
+ * We always attempt to shrink all caches when this generic shrinker
+ * is called.
+ *
+ * If sc->nr_to_scan is zero, the caller is requesting a query of the
+ * number of objects which can potentially be freed. If it is nonzero,
+ * the request is to free that many objects.
+ *
+ * Linux kernels >= 3.12 have the count_objects and scan_objects callbacks
+ * in struct shrinker and also require the shrinker to return the number
+ * of objects freed.
+ *
+ * Older kernels require the shrinker to return the number of freeable
+ * objects following the freeing of nr_to_free.
+ *
+ * Linux semantics differ from those under Solaris, which are to
+ * free all available objects which may (and probably will) be more
+ * objects than the requested nr_to_scan.
+ */
+static spl_shrinker_t
+__spl_kmem_cache_generic_shrinker(struct shrinker *shrink,
+ struct shrink_control *sc)
+{
+ spl_kmem_cache_t *skc;
+ int alloc = 0;
+
+ /*
+ * No shrinking in a transaction context. Can cause deadlocks.
+ */
+ if (sc->nr_to_scan && spl_fstrans_check())
+ return (SHRINK_STOP);
+
+ down_read(&spl_kmem_cache_sem);
+ list_for_each_entry(skc, &spl_kmem_cache_list, skc_list) {
+ if (sc->nr_to_scan) {
+#ifdef HAVE_SPLIT_SHRINKER_CALLBACK
+ uint64_t oldalloc = skc->skc_obj_alloc;
+ spl_kmem_cache_reap_now(skc,
+ MAX(sc->nr_to_scan>>fls64(skc->skc_slab_objs), 1));
+ if (oldalloc > skc->skc_obj_alloc)
+ alloc += oldalloc - skc->skc_obj_alloc;
+#else
+ spl_kmem_cache_reap_now(skc,
+ MAX(sc->nr_to_scan>>fls64(skc->skc_slab_objs), 1));
+ alloc += skc->skc_obj_alloc;
+#endif /* HAVE_SPLIT_SHRINKER_CALLBACK */
+ } else {
+ /* Request to query number of freeable objects */
+ alloc += skc->skc_obj_alloc;
+ }
+ }
+ up_read(&spl_kmem_cache_sem);
+
+ /*
+ * When KMC_RECLAIM_ONCE is set allow only a single reclaim pass.
+ * This functionality only exists to work around a rare issue where
+ * shrink_slabs() is repeatedly invoked by many cores causing the
+ * system to thrash.
+ */
+ if ((spl_kmem_cache_reclaim & KMC_RECLAIM_ONCE) && sc->nr_to_scan)
+ return (SHRINK_STOP);
+
+ return (MAX(alloc, 0));
+}
+
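+/*
+ * A worked example of the scan scaling above, with hypothetical values:
+ * given sc->nr_to_scan = 128 and skc_slab_objs = 8, fls64(8) is 4, so
+ * each cache is passed a reap count of MAX(128 >> 4, 1) = 8.
+ */
+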
+SPL_SHRINKER_CALLBACK_WRAPPER(spl_kmem_cache_generic_shrinker);
+
+/*
+ * Call the registered reclaim function for a cache. Depending on how
+ * many and which objects are released it may simply repopulate the
+ * local magazine which will then need to age-out. Objects which cannot
+ * fit in the magazine will be released back to their slabs, which will
+ * also need to age out before being released. This is all just best
+ * effort and we do not want to thrash creating and destroying slabs.
+ */
+void
+spl_kmem_cache_reap_now(spl_kmem_cache_t *skc, int count)
+{
+ ASSERT(skc->skc_magic == SKC_MAGIC);
+ ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags));
+
+ atomic_inc(&skc->skc_ref);
+
+ /*
+ * Execute the registered reclaim callback if it exists.
+ */
+ if (skc->skc_flags & KMC_SLAB) {
+ if (skc->skc_reclaim)
+ skc->skc_reclaim(skc->skc_private);
+ goto out;
+ }
+
+ /*
+ * Prevent concurrent cache reaping when contended.
+ */
+ if (test_and_set_bit(KMC_BIT_REAPING, &skc->skc_flags))
+ goto out;
+
+ /*
+ * When a reclaim function is available it may be invoked repeatedly
+ * until at least a single slab can be freed. This ensures that we
+ * do free memory back to the system. This helps minimize the chance
+ * of an OOM event when the bulk of memory is used by the slab.
+ *
+ * When free slabs are already available the reclaim callback will be
+ * skipped. Additionally, if no forward progress is detected despite
+ * a reclaim function the cache will be skipped to avoid deadlock.
+ *
+ * Longer term this would be the correct place to add the code which
+ * repacks the slabs in order to minimize fragmentation.
+ */
+ if (skc->skc_reclaim) {
+ uint64_t objects = UINT64_MAX;
+ int do_reclaim;
+
+ do {
+ spin_lock(&skc->skc_lock);
+ do_reclaim =
+ (skc->skc_slab_total > 0) &&
+ ((skc->skc_slab_total-skc->skc_slab_alloc) == 0) &&
+ (skc->skc_obj_alloc < objects);
+
+ objects = skc->skc_obj_alloc;
+ spin_unlock(&skc->skc_lock);
+
+ if (do_reclaim)
+ skc->skc_reclaim(skc->skc_private);
+
+ } while (do_reclaim);
+ }
+
+ /* Reclaim from the magazine and free all now empty slabs. */
+ if (spl_kmem_cache_expire & KMC_EXPIRE_MEM) {
+ spl_kmem_magazine_t *skm;
+ unsigned long irq_flags;
+
+ local_irq_save(irq_flags);
+ skm = skc->skc_mag[smp_processor_id()];
+ spl_cache_flush(skc, skm, skm->skm_avail);
+ local_irq_restore(irq_flags);
+ }
+
+ spl_slab_reclaim(skc);
+ clear_bit_unlock(KMC_BIT_REAPING, &skc->skc_flags);
+ smp_mb__after_atomic();
+ wake_up_bit(&skc->skc_flags, KMC_BIT_REAPING);
+out:
+ atomic_dec(&skc->skc_ref);
+}
+EXPORT_SYMBOL(spl_kmem_cache_reap_now);
+
+/*
+ * Reap all free slabs from all registered caches.
+ */
+void
+spl_kmem_reap(void)
+{
+ struct shrink_control sc;
+
+ sc.nr_to_scan = KMC_REAP_CHUNK;
+ sc.gfp_mask = GFP_KERNEL;
+
+ (void) __spl_kmem_cache_generic_shrinker(NULL, &sc);
+}
+EXPORT_SYMBOL(spl_kmem_reap);
+
+int
+spl_kmem_cache_init(void)
+{
+ init_rwsem(&spl_kmem_cache_sem);
+ INIT_LIST_HEAD(&spl_kmem_cache_list);
+ spl_kmem_cache_taskq = taskq_create("spl_kmem_cache",
+ spl_kmem_cache_kmem_threads, maxclsyspri,
+ spl_kmem_cache_kmem_threads * 8, INT_MAX,
+ TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
+ spl_register_shrinker(&spl_kmem_cache_shrinker);
+
+ return (0);
+}
+
+void
+spl_kmem_cache_fini(void)
+{
+ spl_unregister_shrinker(&spl_kmem_cache_shrinker);
+ taskq_destroy(spl_kmem_cache_taskq);
+}
diff --git a/module/spl/spl-kmem.c b/module/spl/spl-kmem.c
new file mode 100644
index 000000000..e0d551041
--- /dev/null
+++ b/module/spl/spl-kmem.c
@@ -0,0 +1,567 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <sys/debug.h>
+#include <sys/sysmacros.h>
+#include <sys/kmem.h>
+#include <sys/vmem.h>
+#include <linux/mm.h>
+#include <linux/ratelimit.h>
+
+/*
+ * As a general rule kmem_alloc() allocations should be small, preferably
+ * just a few pages since they must be physically contiguous. Therefore, a
+ * rate limited warning will be printed to the console for any kmem_alloc()
+ * which exceeds a reasonable threshold.
+ *
+ * The default warning threshold is set to sixteen pages but capped at 64K to
+ * accommodate systems using large pages. This value was selected to be small
+ * enough to ensure the largest allocations are quickly noticed and fixed,
+ * but large enough to avoid logging any warnings when an allocation size is
+ * larger than optimal but not a serious concern. Since this value is tunable,
+ * developers are encouraged to set it lower when testing so any new largish
+ * allocations are quickly caught. These warnings may be disabled by setting
+ * the threshold to zero.
+ */
+/* BEGIN CSTYLED */
+unsigned int spl_kmem_alloc_warn = MIN(16 * PAGE_SIZE, 64 * 1024);
+module_param(spl_kmem_alloc_warn, uint, 0644);
+MODULE_PARM_DESC(spl_kmem_alloc_warn,
+ "Warning threshold in bytes for a kmem_alloc()");
+EXPORT_SYMBOL(spl_kmem_alloc_warn);
+
+/*
+ * Large kmem_alloc() allocations will fail if they exceed KMALLOC_MAX_SIZE.
+ * Allocations which are marginally smaller than this limit may succeed but
+ * should still be avoided due to the expense of locating a contiguous range
+ * of free pages. Therefore, a maximum kmem size with a reasonable safety
+ * margin of 4x is set. kmem_alloc() allocations larger than this maximum
+ * will quickly fail. vmem_alloc() allocations less than or equal to this
+ * value will use kmalloc(), but shift to vmalloc() when exceeding this value.
+ */
+unsigned int spl_kmem_alloc_max = (KMALLOC_MAX_SIZE >> 2);
+module_param(spl_kmem_alloc_max, uint, 0644);
+MODULE_PARM_DESC(spl_kmem_alloc_max,
+ "Maximum size in bytes for a kmem_alloc()");
+EXPORT_SYMBOL(spl_kmem_alloc_max);
+/* END CSTYLED */
+
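+/*
+ * Both thresholds above may be tuned at runtime through the standard
+ * module parameter interface, for example:
+ *
+ * echo 0 > /sys/module/spl/parameters/spl_kmem_alloc_warn
+ *
+ * The path assumes the module is loaded under the name "spl".
+ */
+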
+int
+kmem_debugging(void)
+{
+ return (0);
+}
+EXPORT_SYMBOL(kmem_debugging);
+
+char *
+kmem_vasprintf(const char *fmt, va_list ap)
+{
+ va_list aq;
+ char *ptr;
+
+ do {
+ va_copy(aq, ap);
+ ptr = kvasprintf(kmem_flags_convert(KM_SLEEP), fmt, aq);
+ va_end(aq);
+ } while (ptr == NULL);
+
+ return (ptr);
+}
+EXPORT_SYMBOL(kmem_vasprintf);
+
+char *
+kmem_asprintf(const char *fmt, ...)
+{
+ va_list ap;
+ char *ptr;
+
+ do {
+ va_start(ap, fmt);
+ ptr = kvasprintf(kmem_flags_convert(KM_SLEEP), fmt, ap);
+ va_end(ap);
+ } while (ptr == NULL);
+
+ return (ptr);
+}
+EXPORT_SYMBOL(kmem_asprintf);
+
+static char *
+__strdup(const char *str, int flags)
+{
+ char *ptr;
+ int n;
+
+ n = strlen(str);
+ ptr = kmalloc(n + 1, kmem_flags_convert(flags));
+ if (ptr)
+ memcpy(ptr, str, n + 1);
+
+ return (ptr);
+}
+
+char *
+strdup(const char *str)
+{
+ return (__strdup(str, KM_SLEEP));
+}
+EXPORT_SYMBOL(strdup);
+
+void
+strfree(char *str)
+{
+ kfree(str);
+}
+EXPORT_SYMBOL(strfree);
+
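+/*
+ * Strings returned by kmem_asprintf(), kmem_vasprintf() and strdup()
+ * are kmalloc() backed and are expected to be released with strfree(),
+ * for example (illustrative only):
+ *
+ * char *msg = kmem_asprintf("cache %s: error %d", name, err);
+ * strfree(msg);
+ */
+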
+/*
+ * Limit the number of large allocation stack traces dumped to not more than
+ * 5 every 60 seconds to prevent denial-of-service attacks from debug code.
+ */
+DEFINE_RATELIMIT_STATE(kmem_alloc_ratelimit_state, 60 * HZ, 5);
+
+/*
+ * General purpose unified implementation of kmem_alloc(). It is an
+ * amalgamation of Linux and Illumos allocator design. It should never be
+ * exported to ensure that code using kmem_alloc()/kmem_zalloc() remains
+ * relatively portable. Consumers may only access this function through
+ * wrappers that enforce the common flags to ensure portability.
+ */
+inline void *
+spl_kmem_alloc_impl(size_t size, int flags, int node)
+{
+ gfp_t lflags = kmem_flags_convert(flags);
+ int use_vmem = 0;
+ void *ptr;
+
+ /*
+ * Log abnormally large allocations and rate limit the console output.
+ * Allocations larger than spl_kmem_alloc_warn should be performed
+ * through the vmem_alloc()/vmem_zalloc() interfaces.
+ */
+ if ((spl_kmem_alloc_warn > 0) && (size > spl_kmem_alloc_warn) &&
+ !(flags & KM_VMEM) && __ratelimit(&kmem_alloc_ratelimit_state)) {
+ printk(KERN_WARNING
+ "Large kmem_alloc(%lu, 0x%x), please file an issue at:\n"
+ "https://github.com/zfsonlinux/zfs/issues/new\n",
+ (unsigned long)size, flags);
+ dump_stack();
+ }
+
+ /*
+ * Use a loop because kmalloc_node() can fail when GFP_KERNEL is used
+ * unlike kmem_alloc() with KM_SLEEP on Illumos.
+ */
+ do {
+ /*
+ * Calling kmalloc_node() when the size >= spl_kmem_alloc_max
+ * is unsafe. This must fail for all kmem_alloc() and
+ * kmem_zalloc() callers.
+ *
+ * For vmem_alloc() and vmem_zalloc() callers it is permissible
+ * to use __vmalloc(). However, in general use of __vmalloc()
+ * is strongly discouraged because a global lock must be
+ * acquired. Contention on this lock can significantly
+ * impact performance so frequently manipulating the virtual
+ * address space is strongly discouraged.
+ */
+ if ((size > spl_kmem_alloc_max) || use_vmem) {
+ if (flags & KM_VMEM) {
+ ptr = __vmalloc(size, lflags, PAGE_KERNEL);
+ } else {
+ return (NULL);
+ }
+ } else {
+ ptr = kmalloc_node(size, lflags, node);
+ }
+
+ if (likely(ptr) || (flags & KM_NOSLEEP))
+ return (ptr);
+
+ /*
+ * For vmem_alloc() and vmem_zalloc() callers retry immediately
+ * using __vmalloc() which is unlikely to fail.
+ */
+ if ((flags & KM_VMEM) && (use_vmem == 0)) {
+ use_vmem = 1;
+ continue;
+ }
+
+ if (unlikely(__ratelimit(&kmem_alloc_ratelimit_state))) {
+ printk(KERN_WARNING
+ "Possible memory allocation deadlock: "
+ "size=%lu lflags=0x%x",
+ (unsigned long)size, lflags);
+ dump_stack();
+ }
+
+ /*
+ * Use cond_resched() instead of congestion_wait() to avoid
+ * deadlocking systems where there are no block devices.
+ */
+ cond_resched();
+ } while (1);
+
+ return (NULL);
+}
+
+inline void
+spl_kmem_free_impl(const void *buf, size_t size)
+{
+ if (is_vmalloc_addr(buf))
+ vfree(buf);
+ else
+ kfree(buf);
+}
+
+/*
+ * Memory allocation and accounting for kmem_* style allocations. When
+ * DEBUG_KMEM is enabled the total memory allocated will be tracked and
+ * any memory leaked will be reported during module unload.
+ *
+ * ./configure --enable-debug-kmem
+ */
+#ifdef DEBUG_KMEM
+
+/* Shim layer memory accounting */
+#ifdef HAVE_ATOMIC64_T
+atomic64_t kmem_alloc_used = ATOMIC64_INIT(0);
+unsigned long long kmem_alloc_max = 0;
+#else /* HAVE_ATOMIC64_T */
+atomic_t kmem_alloc_used = ATOMIC_INIT(0);
+unsigned long long kmem_alloc_max = 0;
+#endif /* HAVE_ATOMIC64_T */
+
+EXPORT_SYMBOL(kmem_alloc_used);
+EXPORT_SYMBOL(kmem_alloc_max);
+
+inline void *
+spl_kmem_alloc_debug(size_t size, int flags, int node)
+{
+ void *ptr;
+
+ ptr = spl_kmem_alloc_impl(size, flags, node);
+ if (ptr) {
+ kmem_alloc_used_add(size);
+ if (unlikely(kmem_alloc_used_read() > kmem_alloc_max))
+ kmem_alloc_max = kmem_alloc_used_read();
+ }
+
+ return (ptr);
+}
+
+inline void
+spl_kmem_free_debug(const void *ptr, size_t size)
+{
+ kmem_alloc_used_sub(size);
+ spl_kmem_free_impl(ptr, size);
+}
+
+/*
+ * When DEBUG_KMEM_TRACKING is enabled not only will total bytes be tracked
+ * but also the location of every alloc and free. When the SPL module is
+ * unloaded a list of all leaked addresses and where they were allocated
+ * will be dumped to the console. Enabling this feature has a significant
+ * impact on performance, but it makes finding memory leaks straightforward.
+ *
+ * Not surprisingly with debugging enabled the xmem_locks are very highly
+ * contended particularly on xfree(). If we want to run with this detailed
+ * debugging enabled for anything other than debugging we need to minimize
+ * the contention by moving to a lock per xmem_table entry model.
+ *
+ * ./configure --enable-debug-kmem-tracking
+ */
+#ifdef DEBUG_KMEM_TRACKING
+
+#include <linux/hash.h>
+#include <linux/ctype.h>
+
+#define KMEM_HASH_BITS 10
+#define KMEM_TABLE_SIZE (1 << KMEM_HASH_BITS)
+
+typedef struct kmem_debug {
+ struct hlist_node kd_hlist; /* Hash node linkage */
+ struct list_head kd_list; /* List of all allocations */
+ void *kd_addr; /* Allocation pointer */
+ size_t kd_size; /* Allocation size */
+ const char *kd_func; /* Allocation function */
+ int kd_line; /* Allocation line */
+} kmem_debug_t;
+
+static spinlock_t kmem_lock;
+static struct hlist_head kmem_table[KMEM_TABLE_SIZE];
+static struct list_head kmem_list;
+
+static kmem_debug_t *
+kmem_del_init(spinlock_t *lock, struct hlist_head *table,
+ int bits, const void *addr)
+{
+ struct hlist_head *head;
+ struct hlist_node *node;
+ struct kmem_debug *p;
+ unsigned long flags;
+
+ spin_lock_irqsave(lock, flags);
+
+ head = &table[hash_ptr((void *)addr, bits)];
+ hlist_for_each(node, head) {
+ p = list_entry(node, struct kmem_debug, kd_hlist);
+ if (p->kd_addr == addr) {
+ hlist_del_init(&p->kd_hlist);
+ list_del_init(&p->kd_list);
+ spin_unlock_irqrestore(lock, flags);
+ return (p);
+ }
+ }
+
+ spin_unlock_irqrestore(lock, flags);
+
+ return (NULL);
+}
+
+inline void *
+spl_kmem_alloc_track(size_t size, int flags,
+ const char *func, int line, int node)
+{
+ void *ptr = NULL;
+ kmem_debug_t *dptr;
+ unsigned long irq_flags;
+
+ dptr = kmalloc(sizeof (kmem_debug_t), kmem_flags_convert(flags));
+ if (dptr == NULL)
+ return (NULL);
+
+ dptr->kd_func = __strdup(func, flags);
+ if (dptr->kd_func == NULL) {
+ kfree(dptr);
+ return (NULL);
+ }
+
+ ptr = spl_kmem_alloc_debug(size, flags, node);
+ if (ptr == NULL) {
+ kfree(dptr->kd_func);
+ kfree(dptr);
+ return (NULL);
+ }
+
+ INIT_HLIST_NODE(&dptr->kd_hlist);
+ INIT_LIST_HEAD(&dptr->kd_list);
+
+ dptr->kd_addr = ptr;
+ dptr->kd_size = size;
+ dptr->kd_line = line;
+
+ spin_lock_irqsave(&kmem_lock, irq_flags);
+ hlist_add_head(&dptr->kd_hlist,
+ &kmem_table[hash_ptr(ptr, KMEM_HASH_BITS)]);
+ list_add_tail(&dptr->kd_list, &kmem_list);
+ spin_unlock_irqrestore(&kmem_lock, irq_flags);
+
+ return (ptr);
+}
+
+inline void
+spl_kmem_free_track(const void *ptr, size_t size)
+{
+ kmem_debug_t *dptr;
+
+ /* Ignore NULL pointer since we haven't tracked it at all */
+ if (ptr == NULL)
+ return;
+
+ /* Must exist in hash due to kmem_alloc() */
+ dptr = kmem_del_init(&kmem_lock, kmem_table, KMEM_HASH_BITS, ptr);
+ ASSERT3P(dptr, !=, NULL);
+ ASSERT3S(dptr->kd_size, ==, size);
+
+ kfree(dptr->kd_func);
+ kfree(dptr);
+
+ spl_kmem_free_debug(ptr, size);
+}
+#endif /* DEBUG_KMEM_TRACKING */
+#endif /* DEBUG_KMEM */
+
+/*
+ * Public kmem_alloc(), kmem_zalloc() and kmem_free() interfaces.
+ */
+void *
+spl_kmem_alloc(size_t size, int flags, const char *func, int line)
+{
+ ASSERT0(flags & ~KM_PUBLIC_MASK);
+
+#if !defined(DEBUG_KMEM)
+ return (spl_kmem_alloc_impl(size, flags, NUMA_NO_NODE));
+#elif !defined(DEBUG_KMEM_TRACKING)
+ return (spl_kmem_alloc_debug(size, flags, NUMA_NO_NODE));
+#else
+ return (spl_kmem_alloc_track(size, flags, func, line, NUMA_NO_NODE));
+#endif
+}
+EXPORT_SYMBOL(spl_kmem_alloc);
+
+void *
+spl_kmem_zalloc(size_t size, int flags, const char *func, int line)
+{
+ ASSERT0(flags & ~KM_PUBLIC_MASK);
+
+ flags |= KM_ZERO;
+
+#if !defined(DEBUG_KMEM)
+ return (spl_kmem_alloc_impl(size, flags, NUMA_NO_NODE));
+#elif !defined(DEBUG_KMEM_TRACKING)
+ return (spl_kmem_alloc_debug(size, flags, NUMA_NO_NODE));
+#else
+ return (spl_kmem_alloc_track(size, flags, func, line, NUMA_NO_NODE));
+#endif
+}
+EXPORT_SYMBOL(spl_kmem_zalloc);
+
+void
+spl_kmem_free(const void *buf, size_t size)
+{
+#if !defined(DEBUG_KMEM)
+ return (spl_kmem_free_impl(buf, size));
+#elif !defined(DEBUG_KMEM_TRACKING)
+ return (spl_kmem_free_debug(buf, size));
+#else
+ return (spl_kmem_free_track(buf, size));
+#endif
+}
+EXPORT_SYMBOL(spl_kmem_free);
+
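+/*
+ * A minimal usage sketch, assuming the kmem_alloc()/kmem_zalloc()/
+ * kmem_free() wrapper macros from sys/kmem.h and a hypothetical data_t:
+ *
+ * data_t *dp = kmem_zalloc(sizeof (data_t), KM_SLEEP);
+ * ...
+ * kmem_free(dp, sizeof (data_t));
+ *
+ * Note the size passed to kmem_free() must match the original
+ * allocation size, as required by the Solaris interface.
+ */
+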
+#if defined(DEBUG_KMEM) && defined(DEBUG_KMEM_TRACKING)
+static char *
+spl_sprintf_addr(kmem_debug_t *kd, char *str, int len, int min)
+{
+ int size = ((len - 1) < kd->kd_size) ? (len - 1) : kd->kd_size;
+ int i, flag = 1;
+
+ ASSERT(str != NULL && len >= 17);
+ memset(str, 0, len);
+
+ /*
+ * Check for a fully printable string, and while we are at
+ * it place the printable characters in the passed buffer.
+ */
+ for (i = 0; i < size; i++) {
+ str[i] = ((char *)(kd->kd_addr))[i];
+ if (isprint(str[i])) {
+ continue;
+ } else {
+ /*
+ * Minimum number of printable characters found
+ * to make it worthwhile to print this as ASCII.
+ */
+ if (i > min)
+ break;
+
+ flag = 0;
+ break;
+ }
+ }
+
+ if (!flag) {
+ sprintf(str, "%02x%02x%02x%02x%02x%02x%02x%02x",
+ *((uint8_t *)kd->kd_addr),
+ *((uint8_t *)kd->kd_addr + 2),
+ *((uint8_t *)kd->kd_addr + 4),
+ *((uint8_t *)kd->kd_addr + 6),
+ *((uint8_t *)kd->kd_addr + 8),
+ *((uint8_t *)kd->kd_addr + 10),
+ *((uint8_t *)kd->kd_addr + 12),
+ *((uint8_t *)kd->kd_addr + 14));
+ }
+
+ return (str);
+}
+
+static int
+spl_kmem_init_tracking(struct list_head *list, spinlock_t *lock, int size)
+{
+ int i;
+
+ spin_lock_init(lock);
+ INIT_LIST_HEAD(list);
+
+ for (i = 0; i < size; i++)
+ INIT_HLIST_HEAD(&kmem_table[i]);
+
+ return (0);
+}
+
+static void
+spl_kmem_fini_tracking(struct list_head *list, spinlock_t *lock)
+{
+ unsigned long flags;
+ kmem_debug_t *kd;
+ char str[17];
+
+ spin_lock_irqsave(lock, flags);
+ if (!list_empty(list))
+ printk(KERN_WARNING "%-16s %-5s %-16s %s:%s\n", "address",
+ "size", "data", "func", "line");
+
+ list_for_each_entry(kd, list, kd_list) {
+ printk(KERN_WARNING "%p %-5d %-16s %s:%d\n", kd->kd_addr,
+ (int)kd->kd_size, spl_sprintf_addr(kd, str, 17, 8),
+ kd->kd_func, kd->kd_line);
+ }
+
+ spin_unlock_irqrestore(lock, flags);
+}
+#endif /* DEBUG_KMEM && DEBUG_KMEM_TRACKING */
+
+int
+spl_kmem_init(void)
+{
+#ifdef DEBUG_KMEM
+ kmem_alloc_used_set(0);
+
+#ifdef DEBUG_KMEM_TRACKING
+ spl_kmem_init_tracking(&kmem_list, &kmem_lock, KMEM_TABLE_SIZE);
+#endif /* DEBUG_KMEM_TRACKING */
+#endif /* DEBUG_KMEM */
+
+ return (0);
+}
+
+void
+spl_kmem_fini(void)
+{
+#ifdef DEBUG_KMEM
+ /*
+ * Display all unreclaimed memory addresses, including the
+ * allocation size and the first few bytes of what's located
+ * at that address to aid in debugging. Performance is not
+ * a serious concern here since it is module unload time.
+ */
+ if (kmem_alloc_used_read() != 0)
+ printk(KERN_WARNING "kmem leaked %ld/%llu bytes\n",
+ (unsigned long)kmem_alloc_used_read(), kmem_alloc_max);
+
+#ifdef DEBUG_KMEM_TRACKING
+ spl_kmem_fini_tracking(&kmem_list, &kmem_lock);
+#endif /* DEBUG_KMEM_TRACKING */
+#endif /* DEBUG_KMEM */
+}
diff --git a/module/spl/spl-kobj.c b/module/spl/spl-kobj.c
new file mode 100644
index 000000000..7019369bd
--- /dev/null
+++ b/module/spl/spl-kobj.c
@@ -0,0 +1,86 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Solaris Porting Layer (SPL) Kobj Implementation.
+ */
+
+#include <sys/kobj.h>
+
+struct _buf *
+kobj_open_file(const char *name)
+{
+ struct _buf *file;
+ vnode_t *vp;
+ int rc;
+
+ file = kmalloc(sizeof (_buf_t), kmem_flags_convert(KM_SLEEP));
+ if (file == NULL)
+ return ((_buf_t *)-1UL);
+
+ if ((rc = vn_open(name, UIO_SYSSPACE, FREAD, 0644, &vp, 0, 0))) {
+ kfree(file);
+ return ((_buf_t *)-1UL);
+ }
+
+ file->vp = vp;
+
+ return (file);
+} /* kobj_open_file() */
+EXPORT_SYMBOL(kobj_open_file);
+
+void
+kobj_close_file(struct _buf *file)
+{
+ VOP_CLOSE(file->vp, 0, 0, 0, 0, 0);
+ kfree(file);
+} /* kobj_close_file() */
+EXPORT_SYMBOL(kobj_close_file);
+
+int
+kobj_read_file(struct _buf *file, char *buf, unsigned size, unsigned off)
+{
+ ssize_t resid;
+
+ if (vn_rdwr(UIO_READ, file->vp, buf, size, (offset_t)off,
+ UIO_SYSSPACE, 0, 0, 0, &resid) != 0)
+ return (-1);
+
+ return (size - resid);
+} /* kobj_read_file() */
+EXPORT_SYMBOL(kobj_read_file);
+
+int
+kobj_get_filesize(struct _buf *file, uint64_t *size)
+{
+ vattr_t vap;
+ int rc;
+
+ rc = VOP_GETATTR(file->vp, &vap, 0, 0, NULL);
+ if (rc)
+ return (rc);
+
+ *size = vap.va_size;
+
+ return (rc);
+} /* kobj_get_filesize() */
+EXPORT_SYMBOL(kobj_get_filesize);
diff --git a/module/spl/spl-kstat.c b/module/spl/spl-kstat.c
new file mode 100644
index 000000000..bcbff94a6
--- /dev/null
+++ b/module/spl/spl-kstat.c
@@ -0,0 +1,733 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Solaris Porting Layer (SPL) Kstat Implementation.
+ */
+
+#include <linux/seq_file.h>
+#include <sys/kstat.h>
+#include <sys/vmem.h>
+#include <sys/cmn_err.h>
+#include <sys/sysmacros.h>
+
+#ifndef HAVE_PDE_DATA
+#define PDE_DATA(x) (PDE(x)->data)
+#endif
+
+static kmutex_t kstat_module_lock;
+static struct list_head kstat_module_list;
+static kid_t kstat_id;
+
+static int
+kstat_resize_raw(kstat_t *ksp)
+{
+ if (ksp->ks_raw_bufsize == KSTAT_RAW_MAX)
+ return (ENOMEM);
+
+ vmem_free(ksp->ks_raw_buf, ksp->ks_raw_bufsize);
+ ksp->ks_raw_bufsize = MIN(ksp->ks_raw_bufsize * 2, KSTAT_RAW_MAX);
+ ksp->ks_raw_buf = vmem_alloc(ksp->ks_raw_bufsize, KM_SLEEP);
+
+ return (0);
+}
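+
+/*
+ * Illustrative growth sequence (assuming 4K pages): the raw buffer starts
+ * at PAGE_SIZE in kstat_seq_start() and doubles on every ENOMEM retry,
+ * 4K -> 8K -> 16K -> ..., until it is capped at KSTAT_RAW_MAX, at which
+ * point kstat_resize_raw() gives up and returns ENOMEM to the caller.
+ */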
+
+void
+kstat_waitq_enter(kstat_io_t *kiop)
+{
+ hrtime_t new, delta;
+ ulong_t wcnt;
+
+ new = gethrtime();
+ delta = new - kiop->wlastupdate;
+ kiop->wlastupdate = new;
+ wcnt = kiop->wcnt++;
+ if (wcnt != 0) {
+ kiop->wlentime += delta * wcnt;
+ kiop->wtime += delta;
+ }
+}
+EXPORT_SYMBOL(kstat_waitq_enter);
+
+void
+kstat_waitq_exit(kstat_io_t *kiop)
+{
+ hrtime_t new, delta;
+ ulong_t wcnt;
+
+ new = gethrtime();
+ delta = new - kiop->wlastupdate;
+ kiop->wlastupdate = new;
+ wcnt = kiop->wcnt--;
+ ASSERT((int)wcnt > 0);
+ kiop->wlentime += delta * wcnt;
+ kiop->wtime += delta;
+}
+EXPORT_SYMBOL(kstat_waitq_exit);
+
+void
+kstat_runq_enter(kstat_io_t *kiop)
+{
+ hrtime_t new, delta;
+ ulong_t rcnt;
+
+ new = gethrtime();
+ delta = new - kiop->rlastupdate;
+ kiop->rlastupdate = new;
+ rcnt = kiop->rcnt++;
+ if (rcnt != 0) {
+ kiop->rlentime += delta * rcnt;
+ kiop->rtime += delta;
+ }
+}
+EXPORT_SYMBOL(kstat_runq_enter);
+
+void
+kstat_runq_exit(kstat_io_t *kiop)
+{
+ hrtime_t new, delta;
+ ulong_t rcnt;
+
+ new = gethrtime();
+ delta = new - kiop->rlastupdate;
+ kiop->rlastupdate = new;
+ rcnt = kiop->rcnt--;
+ ASSERT((int)rcnt > 0);
+ kiop->rlentime += delta * rcnt;
+ kiop->rtime += delta;
+}
+EXPORT_SYMBOL(kstat_runq_exit);
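+
+/*
+ * Usage sketch (illustrative only): a driver brackets each request with
+ * the wait and run queue hooks while holding ks_lock, assuming the usual
+ * KSTAT_IO_PTR() accessor from sys/kstat.h:
+ *
+ *	kstat_io_t *kiop = KSTAT_IO_PTR(ksp);
+ *
+ *	mutex_enter(ksp->ks_lock);
+ *	kstat_waitq_enter(kiop);		request queued
+ *	...
+ *	kstat_waitq_exit(kiop);			request dequeued
+ *	kstat_runq_enter(kiop);			request issued
+ *	mutex_exit(ksp->ks_lock);
+ *	... perform the I/O ...
+ *	mutex_enter(ksp->ks_lock);
+ *	kstat_runq_exit(kiop);			request done
+ *	mutex_exit(ksp->ks_lock);
+ */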
+
+static int
+kstat_seq_show_headers(struct seq_file *f)
+{
+ kstat_t *ksp = (kstat_t *)f->private;
+ int rc = 0;
+
+ ASSERT(ksp->ks_magic == KS_MAGIC);
+
+ seq_printf(f, "%d %d 0x%02x %d %d %lld %lld\n",
+ ksp->ks_kid, ksp->ks_type, ksp->ks_flags,
+ ksp->ks_ndata, (int)ksp->ks_data_size,
+ ksp->ks_crtime, ksp->ks_snaptime);
+
+ switch (ksp->ks_type) {
+ case KSTAT_TYPE_RAW:
+restart:
+ if (ksp->ks_raw_ops.headers) {
+ rc = ksp->ks_raw_ops.headers(
+ ksp->ks_raw_buf, ksp->ks_raw_bufsize);
+ if (rc == ENOMEM && !kstat_resize_raw(ksp))
+ goto restart;
+ if (!rc)
+ seq_puts(f, ksp->ks_raw_buf);
+ } else {
+ seq_printf(f, "raw data\n");
+ }
+ break;
+ case KSTAT_TYPE_NAMED:
+ seq_printf(f, "%-31s %-4s %s\n",
+ "name", "type", "data");
+ break;
+ case KSTAT_TYPE_INTR:
+ seq_printf(f, "%-8s %-8s %-8s %-8s %-8s\n",
+ "hard", "soft", "watchdog",
+ "spurious", "multsvc");
+ break;
+ case KSTAT_TYPE_IO:
+ seq_printf(f,
+ "%-8s %-8s %-8s %-8s %-8s %-8s "
+ "%-8s %-8s %-8s %-8s %-8s %-8s\n",
+ "nread", "nwritten", "reads", "writes",
+ "wtime", "wlentime", "wupdate",
+ "rtime", "rlentime", "rupdate",
+ "wcnt", "rcnt");
+ break;
+ case KSTAT_TYPE_TIMER:
+ seq_printf(f,
+ "%-31s %-8s "
+ "%-8s %-8s %-8s %-8s %-8s\n",
+ "name", "events", "elapsed",
+ "min", "max", "start", "stop");
+ break;
+ default:
+ PANIC("Undefined kstat type %d\n", ksp->ks_type);
+ }
+
+ return (-rc);
+}
+
+static int
+kstat_seq_show_raw(struct seq_file *f, unsigned char *p, int l)
+{
+ int i, j;
+
+ for (i = 0; ; i++) {
+ seq_printf(f, "%03x:", i);
+
+ for (j = 0; j < 16; j++) {
+ if (i * 16 + j >= l) {
+ seq_printf(f, "\n");
+ goto out;
+ }
+
+ seq_printf(f, " %02x", (unsigned char)p[i * 16 + j]);
+ }
+ seq_printf(f, "\n");
+ }
+out:
+ return (0);
+}
+
+static int
+kstat_seq_show_named(struct seq_file *f, kstat_named_t *knp)
+{
+ seq_printf(f, "%-31s %-4d ", knp->name, knp->data_type);
+
+ switch (knp->data_type) {
+ case KSTAT_DATA_CHAR:
+ knp->value.c[15] = '\0'; /* NULL terminate */
+ seq_printf(f, "%-16s", knp->value.c);
+ break;
+ /*
+	 * NOTE - We need to be more careful about what tokens are
+	 * used for each arch; for now this is correct for x86_64.
+ */
+ case KSTAT_DATA_INT32:
+ seq_printf(f, "%d", knp->value.i32);
+ break;
+ case KSTAT_DATA_UINT32:
+ seq_printf(f, "%u", knp->value.ui32);
+ break;
+ case KSTAT_DATA_INT64:
+ seq_printf(f, "%lld", (signed long long)knp->value.i64);
+ break;
+ case KSTAT_DATA_UINT64:
+ seq_printf(f, "%llu",
+ (unsigned long long)knp->value.ui64);
+ break;
+ case KSTAT_DATA_LONG:
+ seq_printf(f, "%ld", knp->value.l);
+ break;
+ case KSTAT_DATA_ULONG:
+ seq_printf(f, "%lu", knp->value.ul);
+ break;
+ case KSTAT_DATA_STRING:
+ KSTAT_NAMED_STR_PTR(knp)
+ [KSTAT_NAMED_STR_BUFLEN(knp)-1] = '\0';
+ seq_printf(f, "%s", KSTAT_NAMED_STR_PTR(knp));
+ break;
+ default:
+ PANIC("Undefined kstat data type %d\n", knp->data_type);
+ }
+
+ seq_printf(f, "\n");
+
+ return (0);
+}
+
+static int
+kstat_seq_show_intr(struct seq_file *f, kstat_intr_t *kip)
+{
+ seq_printf(f, "%-8u %-8u %-8u %-8u %-8u\n",
+ kip->intrs[KSTAT_INTR_HARD],
+ kip->intrs[KSTAT_INTR_SOFT],
+ kip->intrs[KSTAT_INTR_WATCHDOG],
+ kip->intrs[KSTAT_INTR_SPURIOUS],
+ kip->intrs[KSTAT_INTR_MULTSVC]);
+
+ return (0);
+}
+
+static int
+kstat_seq_show_io(struct seq_file *f, kstat_io_t *kip)
+{
+ seq_printf(f,
+ "%-8llu %-8llu %-8u %-8u %-8lld %-8lld "
+ "%-8lld %-8lld %-8lld %-8lld %-8u %-8u\n",
+ kip->nread, kip->nwritten,
+ kip->reads, kip->writes,
+ kip->wtime, kip->wlentime, kip->wlastupdate,
+ kip->rtime, kip->rlentime, kip->rlastupdate,
+ kip->wcnt, kip->rcnt);
+
+ return (0);
+}
+
+static int
+kstat_seq_show_timer(struct seq_file *f, kstat_timer_t *ktp)
+{
+ seq_printf(f,
+ "%-31s %-8llu %-8lld %-8lld %-8lld %-8lld %-8lld\n",
+ ktp->name, ktp->num_events, ktp->elapsed_time,
+ ktp->min_time, ktp->max_time,
+ ktp->start_time, ktp->stop_time);
+
+ return (0);
+}
+
+static int
+kstat_seq_show(struct seq_file *f, void *p)
+{
+ kstat_t *ksp = (kstat_t *)f->private;
+ int rc = 0;
+
+ ASSERT(ksp->ks_magic == KS_MAGIC);
+
+ switch (ksp->ks_type) {
+ case KSTAT_TYPE_RAW:
+restart:
+ if (ksp->ks_raw_ops.data) {
+ rc = ksp->ks_raw_ops.data(
+ ksp->ks_raw_buf, ksp->ks_raw_bufsize, p);
+ if (rc == ENOMEM && !kstat_resize_raw(ksp))
+ goto restart;
+ if (!rc)
+ seq_puts(f, ksp->ks_raw_buf);
+ } else {
+ ASSERT(ksp->ks_ndata == 1);
+ rc = kstat_seq_show_raw(f, ksp->ks_data,
+ ksp->ks_data_size);
+ }
+ break;
+ case KSTAT_TYPE_NAMED:
+ rc = kstat_seq_show_named(f, (kstat_named_t *)p);
+ break;
+ case KSTAT_TYPE_INTR:
+ rc = kstat_seq_show_intr(f, (kstat_intr_t *)p);
+ break;
+ case KSTAT_TYPE_IO:
+ rc = kstat_seq_show_io(f, (kstat_io_t *)p);
+ break;
+ case KSTAT_TYPE_TIMER:
+ rc = kstat_seq_show_timer(f, (kstat_timer_t *)p);
+ break;
+ default:
+ PANIC("Undefined kstat type %d\n", ksp->ks_type);
+ }
+
+ return (-rc);
+}
+
+static int
+kstat_default_update(kstat_t *ksp, int rw)
+{
+ ASSERT(ksp != NULL);
+
+ if (rw == KSTAT_WRITE)
+ return (EACCES);
+
+ return (0);
+}
+
+static void *
+kstat_seq_data_addr(kstat_t *ksp, loff_t n)
+{
+ void *rc = NULL;
+
+ switch (ksp->ks_type) {
+ case KSTAT_TYPE_RAW:
+ if (ksp->ks_raw_ops.addr)
+ rc = ksp->ks_raw_ops.addr(ksp, n);
+ else
+ rc = ksp->ks_data;
+ break;
+ case KSTAT_TYPE_NAMED:
+ rc = ksp->ks_data + n * sizeof (kstat_named_t);
+ break;
+ case KSTAT_TYPE_INTR:
+ rc = ksp->ks_data + n * sizeof (kstat_intr_t);
+ break;
+ case KSTAT_TYPE_IO:
+ rc = ksp->ks_data + n * sizeof (kstat_io_t);
+ break;
+ case KSTAT_TYPE_TIMER:
+ rc = ksp->ks_data + n * sizeof (kstat_timer_t);
+ break;
+ default:
+ PANIC("Undefined kstat type %d\n", ksp->ks_type);
+ }
+
+ return (rc);
+}
+
+static void *
+kstat_seq_start(struct seq_file *f, loff_t *pos)
+{
+ loff_t n = *pos;
+ kstat_t *ksp = (kstat_t *)f->private;
+ ASSERT(ksp->ks_magic == KS_MAGIC);
+
+ mutex_enter(ksp->ks_lock);
+
+ if (ksp->ks_type == KSTAT_TYPE_RAW) {
+ ksp->ks_raw_bufsize = PAGE_SIZE;
+ ksp->ks_raw_buf = vmem_alloc(ksp->ks_raw_bufsize, KM_SLEEP);
+ }
+
+	/* Dynamically update the kstat; on error the existing data is used */
+ (void) ksp->ks_update(ksp, KSTAT_READ);
+
+ ksp->ks_snaptime = gethrtime();
+
+ if (!n && kstat_seq_show_headers(f))
+ return (NULL);
+
+ if (n >= ksp->ks_ndata)
+ return (NULL);
+
+ return (kstat_seq_data_addr(ksp, n));
+}
+
+static void *
+kstat_seq_next(struct seq_file *f, void *p, loff_t *pos)
+{
+ kstat_t *ksp = (kstat_t *)f->private;
+ ASSERT(ksp->ks_magic == KS_MAGIC);
+
+ ++*pos;
+ if (*pos >= ksp->ks_ndata)
+ return (NULL);
+
+ return (kstat_seq_data_addr(ksp, *pos));
+}
+
+static void
+kstat_seq_stop(struct seq_file *f, void *v)
+{
+ kstat_t *ksp = (kstat_t *)f->private;
+ ASSERT(ksp->ks_magic == KS_MAGIC);
+
+ if (ksp->ks_type == KSTAT_TYPE_RAW)
+ vmem_free(ksp->ks_raw_buf, ksp->ks_raw_bufsize);
+
+ mutex_exit(ksp->ks_lock);
+}
+
+static struct seq_operations kstat_seq_ops = {
+ .show = kstat_seq_show,
+ .start = kstat_seq_start,
+ .next = kstat_seq_next,
+ .stop = kstat_seq_stop,
+};
+
+static kstat_module_t *
+kstat_find_module(char *name)
+{
+ kstat_module_t *module;
+
+ list_for_each_entry(module, &kstat_module_list, ksm_module_list) {
+ if (strncmp(name, module->ksm_name, KSTAT_STRLEN) == 0)
+ return (module);
+ }
+
+ return (NULL);
+}
+
+static kstat_module_t *
+kstat_create_module(char *name)
+{
+ kstat_module_t *module;
+ struct proc_dir_entry *pde;
+
+ pde = proc_mkdir(name, proc_spl_kstat);
+ if (pde == NULL)
+ return (NULL);
+
+ module = kmem_alloc(sizeof (kstat_module_t), KM_SLEEP);
+ module->ksm_proc = pde;
+ strlcpy(module->ksm_name, name, KSTAT_STRLEN+1);
+ INIT_LIST_HEAD(&module->ksm_kstat_list);
+ list_add_tail(&module->ksm_module_list, &kstat_module_list);
+
+	return (module);
+}
+
+static void
+kstat_delete_module(kstat_module_t *module)
+{
+ ASSERT(list_empty(&module->ksm_kstat_list));
+ remove_proc_entry(module->ksm_name, proc_spl_kstat);
+ list_del(&module->ksm_module_list);
+ kmem_free(module, sizeof (kstat_module_t));
+}
+
+static int
+proc_kstat_open(struct inode *inode, struct file *filp)
+{
+ struct seq_file *f;
+ int rc;
+
+ rc = seq_open(filp, &kstat_seq_ops);
+ if (rc)
+ return (rc);
+
+ f = filp->private_data;
+ f->private = PDE_DATA(inode);
+
+ return (rc);
+}
+
+static ssize_t
+proc_kstat_write(struct file *filp, const char __user *buf, size_t len,
+ loff_t *ppos)
+{
+ struct seq_file *f = filp->private_data;
+ kstat_t *ksp = f->private;
+ int rc;
+
+ ASSERT(ksp->ks_magic == KS_MAGIC);
+
+ mutex_enter(ksp->ks_lock);
+ rc = ksp->ks_update(ksp, KSTAT_WRITE);
+ mutex_exit(ksp->ks_lock);
+
+ if (rc)
+ return (-rc);
+
+ *ppos += len;
+ return (len);
+}
+
+static struct file_operations proc_kstat_operations = {
+ .open = proc_kstat_open,
+ .write = proc_kstat_write,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+void
+__kstat_set_raw_ops(kstat_t *ksp,
+ int (*headers)(char *buf, size_t size),
+ int (*data)(char *buf, size_t size, void *data),
+ void *(*addr)(kstat_t *ksp, loff_t index))
+{
+ ksp->ks_raw_ops.headers = headers;
+ ksp->ks_raw_ops.data = data;
+ ksp->ks_raw_ops.addr = addr;
+}
+EXPORT_SYMBOL(__kstat_set_raw_ops);
+
+kstat_t *
+__kstat_create(const char *ks_module, int ks_instance, const char *ks_name,
+ const char *ks_class, uchar_t ks_type, uint_t ks_ndata,
+ uchar_t ks_flags)
+{
+ kstat_t *ksp;
+
+ ASSERT(ks_module);
+ ASSERT(ks_instance == 0);
+ ASSERT(ks_name);
+ ASSERT(!(ks_flags & KSTAT_FLAG_UNSUPPORTED));
+
+ if ((ks_type == KSTAT_TYPE_INTR) || (ks_type == KSTAT_TYPE_IO))
+ ASSERT(ks_ndata == 1);
+
+ ksp = kmem_zalloc(sizeof (*ksp), KM_SLEEP);
+ if (ksp == NULL)
+ return (ksp);
+
+ mutex_enter(&kstat_module_lock);
+ ksp->ks_kid = kstat_id;
+ kstat_id++;
+ mutex_exit(&kstat_module_lock);
+
+ ksp->ks_magic = KS_MAGIC;
+ mutex_init(&ksp->ks_private_lock, NULL, MUTEX_DEFAULT, NULL);
+ ksp->ks_lock = &ksp->ks_private_lock;
+ INIT_LIST_HEAD(&ksp->ks_list);
+
+ ksp->ks_crtime = gethrtime();
+ ksp->ks_snaptime = ksp->ks_crtime;
+ strncpy(ksp->ks_module, ks_module, KSTAT_STRLEN);
+ ksp->ks_instance = ks_instance;
+ strncpy(ksp->ks_name, ks_name, KSTAT_STRLEN);
+ strncpy(ksp->ks_class, ks_class, KSTAT_STRLEN);
+ ksp->ks_type = ks_type;
+ ksp->ks_flags = ks_flags;
+ ksp->ks_update = kstat_default_update;
+ ksp->ks_private = NULL;
+ ksp->ks_raw_ops.headers = NULL;
+ ksp->ks_raw_ops.data = NULL;
+ ksp->ks_raw_ops.addr = NULL;
+ ksp->ks_raw_buf = NULL;
+ ksp->ks_raw_bufsize = 0;
+
+ switch (ksp->ks_type) {
+ case KSTAT_TYPE_RAW:
+ ksp->ks_ndata = 1;
+ ksp->ks_data_size = ks_ndata;
+ break;
+ case KSTAT_TYPE_NAMED:
+ ksp->ks_ndata = ks_ndata;
+ ksp->ks_data_size = ks_ndata * sizeof (kstat_named_t);
+ break;
+ case KSTAT_TYPE_INTR:
+ ksp->ks_ndata = ks_ndata;
+ ksp->ks_data_size = ks_ndata * sizeof (kstat_intr_t);
+ break;
+ case KSTAT_TYPE_IO:
+ ksp->ks_ndata = ks_ndata;
+ ksp->ks_data_size = ks_ndata * sizeof (kstat_io_t);
+ break;
+ case KSTAT_TYPE_TIMER:
+ ksp->ks_ndata = ks_ndata;
+ ksp->ks_data_size = ks_ndata * sizeof (kstat_timer_t);
+ break;
+ default:
+ PANIC("Undefined kstat type %d\n", ksp->ks_type);
+ }
+
+ if (ksp->ks_flags & KSTAT_FLAG_VIRTUAL) {
+ ksp->ks_data = NULL;
+ } else {
+ ksp->ks_data = kmem_zalloc(ksp->ks_data_size, KM_SLEEP);
+ if (ksp->ks_data == NULL) {
+ kmem_free(ksp, sizeof (*ksp));
+ ksp = NULL;
+ }
+ }
+
+ return (ksp);
+}
+EXPORT_SYMBOL(__kstat_create);
+
+static int
+kstat_detect_collision(kstat_t *ksp)
+{
+ kstat_module_t *module;
+ kstat_t *tmp;
+ char *parent;
+ char *cp;
+
+ parent = kmem_asprintf("%s", ksp->ks_module);
+
+ if ((cp = strrchr(parent, '/')) == NULL) {
+ strfree(parent);
+ return (0);
+ }
+
+ cp[0] = '\0';
+ if ((module = kstat_find_module(parent)) != NULL) {
+ list_for_each_entry(tmp, &module->ksm_kstat_list, ks_list) {
+ if (strncmp(tmp->ks_name, cp+1, KSTAT_STRLEN) == 0) {
+ strfree(parent);
+ return (EEXIST);
+ }
+ }
+ }
+
+ strfree(parent);
+ return (0);
+}
+
+void
+__kstat_install(kstat_t *ksp)
+{
+ kstat_module_t *module;
+ kstat_t *tmp;
+
+ ASSERT(ksp);
+
+ mutex_enter(&kstat_module_lock);
+
+ module = kstat_find_module(ksp->ks_module);
+ if (module == NULL) {
+ if (kstat_detect_collision(ksp) != 0) {
+ cmn_err(CE_WARN, "kstat_create('%s', '%s'): namespace" \
+ " collision", ksp->ks_module, ksp->ks_name);
+ goto out;
+ }
+ module = kstat_create_module(ksp->ks_module);
+ if (module == NULL)
+ goto out;
+ }
+
+ /*
+	 * Only one entry with this name is allowed per module. On failure
+	 * the module shouldn't be deleted, because we know it already has
+	 * at least one entry.
+ */
+ list_for_each_entry(tmp, &module->ksm_kstat_list, ks_list) {
+ if (strncmp(tmp->ks_name, ksp->ks_name, KSTAT_STRLEN) == 0)
+ goto out;
+ }
+
+ list_add_tail(&ksp->ks_list, &module->ksm_kstat_list);
+
+ mutex_enter(ksp->ks_lock);
+ ksp->ks_owner = module;
+ ksp->ks_proc = proc_create_data(ksp->ks_name, 0644,
+ module->ksm_proc, &proc_kstat_operations, (void *)ksp);
+ if (ksp->ks_proc == NULL) {
+ list_del_init(&ksp->ks_list);
+ if (list_empty(&module->ksm_kstat_list))
+ kstat_delete_module(module);
+ }
+ mutex_exit(ksp->ks_lock);
+out:
+ mutex_exit(&kstat_module_lock);
+}
+EXPORT_SYMBOL(__kstat_install);
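+
+/*
+ * Lifecycle sketch (illustrative only, assuming the kstat_create(),
+ * kstat_install() and kstat_delete() wrappers from sys/kstat.h; the
+ * "mymod"/"mystats"/"hits" names are hypothetical):
+ *
+ *	kstat_t *ksp = kstat_create("mymod", 0, "mystats", "misc",
+ *	    KSTAT_TYPE_NAMED, 1, 0);
+ *	if (ksp != NULL) {
+ *		kstat_named_t *kn = ksp->ks_data;
+ *		strlcpy(kn->name, "hits", sizeof (kn->name));
+ *		kn->data_type = KSTAT_DATA_UINT64;
+ *		kstat_install(ksp);
+ *	}
+ *	...
+ *	kstat_delete(ksp);
+ *
+ * The entry then appears as /proc/spl/kstat/mymod/mystats.
+ */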
+
+void
+__kstat_delete(kstat_t *ksp)
+{
+ kstat_module_t *module = ksp->ks_owner;
+
+ mutex_enter(&kstat_module_lock);
+ list_del_init(&ksp->ks_list);
+ mutex_exit(&kstat_module_lock);
+
+ if (ksp->ks_proc) {
+ remove_proc_entry(ksp->ks_name, module->ksm_proc);
+
+ /* Remove top level module directory if it's empty */
+ if (list_empty(&module->ksm_kstat_list))
+ kstat_delete_module(module);
+ }
+
+ if (!(ksp->ks_flags & KSTAT_FLAG_VIRTUAL))
+ kmem_free(ksp->ks_data, ksp->ks_data_size);
+
+ ksp->ks_lock = NULL;
+ mutex_destroy(&ksp->ks_private_lock);
+ kmem_free(ksp, sizeof (*ksp));
+}
+EXPORT_SYMBOL(__kstat_delete);
+
+int
+spl_kstat_init(void)
+{
+ mutex_init(&kstat_module_lock, NULL, MUTEX_DEFAULT, NULL);
+ INIT_LIST_HEAD(&kstat_module_list);
+ kstat_id = 0;
+ return (0);
+}
+
+void
+spl_kstat_fini(void)
+{
+ ASSERT(list_empty(&kstat_module_list));
+ mutex_destroy(&kstat_module_lock);
+}
diff --git a/module/spl/spl-mutex.c b/module/spl/spl-mutex.c
new file mode 100644
index 000000000..ba818862b
--- /dev/null
+++ b/module/spl/spl-mutex.c
@@ -0,0 +1,30 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Solaris Porting Layer (SPL) Mutex Implementation.
+ */
+
+#include <sys/mutex.h>
+
+int spl_mutex_init(void) { return 0; }
+void spl_mutex_fini(void) { }
diff --git a/module/spl/spl-proc.c b/module/spl/spl-proc.c
new file mode 100644
index 000000000..9c52924a4
--- /dev/null
+++ b/module/spl/spl-proc.c
@@ -0,0 +1,782 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Solaris Porting Layer (SPL) Proc Implementation.
+ */
+
+#include <sys/systeminfo.h>
+#include <sys/kstat.h>
+#include <sys/kmem.h>
+#include <sys/kmem_cache.h>
+#include <sys/vmem.h>
+#include <sys/taskq.h>
+#include <sys/proc.h>
+#include <linux/ctype.h>
+#include <linux/kmod.h>
+#include <linux/seq_file.h>
+#include <linux/uaccess.h>
+#include <linux/version.h>
+
+#if defined(CONSTIFY_PLUGIN) && LINUX_VERSION_CODE >= KERNEL_VERSION(3, 8, 0)
+typedef struct ctl_table __no_const spl_ctl_table;
+#else
+typedef struct ctl_table spl_ctl_table;
+#endif
+
+static unsigned long table_min = 0;
+static unsigned long table_max = ~0;
+
+static struct ctl_table_header *spl_header = NULL;
+static struct proc_dir_entry *proc_spl = NULL;
+static struct proc_dir_entry *proc_spl_kmem = NULL;
+static struct proc_dir_entry *proc_spl_kmem_slab = NULL;
+static struct proc_dir_entry *proc_spl_taskq_all = NULL;
+static struct proc_dir_entry *proc_spl_taskq = NULL;
+struct proc_dir_entry *proc_spl_kstat = NULL;
+
+static int
+proc_copyin_string(char *kbuffer, int kbuffer_size, const char *ubuffer,
+ int ubuffer_size)
+{
+ int size;
+
+ if (ubuffer_size > kbuffer_size)
+ return (-EOVERFLOW);
+
+ if (copy_from_user((void *)kbuffer, (void *)ubuffer, ubuffer_size))
+ return (-EFAULT);
+
+ /* strip trailing whitespace */
+ size = strnlen(kbuffer, ubuffer_size);
+ while (size-- >= 0)
+ if (!isspace(kbuffer[size]))
+ break;
+
+ /* empty string */
+ if (size < 0)
+ return (-EINVAL);
+
+ /* no space to terminate */
+	if (size + 1 >= kbuffer_size)
+ return (-EOVERFLOW);
+
+ kbuffer[size + 1] = 0;
+ return (0);
+}
+
+static int
+proc_copyout_string(char *ubuffer, int ubuffer_size, const char *kbuffer,
+ char *append)
+{
+ /*
+	 * NB if 'append' != NULL, it's a single character to append to the
+	 * copied out string - usually "\n" for /proc entries and ""
+	 * (i.e. a terminating zero byte) for sysctl entries.
+ */
+ int size = MIN(strlen(kbuffer), ubuffer_size);
+
+ if (copy_to_user(ubuffer, kbuffer, size))
+ return (-EFAULT);
+
+ if (append != NULL && size < ubuffer_size) {
+ if (copy_to_user(ubuffer + size, append, 1))
+ return (-EFAULT);
+
+ size++;
+ }
+
+ return (size);
+}
+
+#ifdef DEBUG_KMEM
+static int
+proc_domemused(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ int rc = 0;
+ unsigned long min = 0, max = ~0, val;
+ spl_ctl_table dummy = *table;
+
+ dummy.data = &val;
+ dummy.proc_handler = &proc_dointvec;
+ dummy.extra1 = &min;
+ dummy.extra2 = &max;
+
+ if (write) {
+ *ppos += *lenp;
+ } else {
+#ifdef HAVE_ATOMIC64_T
+ val = atomic64_read((atomic64_t *)table->data);
+#else
+ val = atomic_read((atomic_t *)table->data);
+#endif /* HAVE_ATOMIC64_T */
+ rc = proc_doulongvec_minmax(&dummy, write, buffer, lenp, ppos);
+ }
+
+ return (rc);
+}
+#endif /* DEBUG_KMEM */
+
+static int
+proc_doslab(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ int rc = 0;
+ unsigned long min = 0, max = ~0, val = 0, mask;
+ spl_ctl_table dummy = *table;
+ spl_kmem_cache_t *skc;
+
+ dummy.data = &val;
+ dummy.proc_handler = &proc_dointvec;
+ dummy.extra1 = &min;
+ dummy.extra2 = &max;
+
+ if (write) {
+ *ppos += *lenp;
+ } else {
+ down_read(&spl_kmem_cache_sem);
+ mask = (unsigned long)table->data;
+
+ list_for_each_entry(skc, &spl_kmem_cache_list, skc_list) {
+
+ /* Only use slabs of the correct kmem/vmem type */
+ if (!(skc->skc_flags & mask))
+ continue;
+
+ /* Sum the specified field for selected slabs */
+ switch (mask & (KMC_TOTAL | KMC_ALLOC | KMC_MAX)) {
+ case KMC_TOTAL:
+ val += skc->skc_slab_size * skc->skc_slab_total;
+ break;
+ case KMC_ALLOC:
+ val += skc->skc_obj_size * skc->skc_obj_alloc;
+ break;
+ case KMC_MAX:
+ val += skc->skc_obj_size * skc->skc_obj_max;
+ break;
+ }
+ }
+
+ up_read(&spl_kmem_cache_sem);
+ rc = proc_doulongvec_minmax(&dummy, write, buffer, lenp, ppos);
+ }
+
+ return (rc);
+}
+
+static int
+proc_dohostid(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ int len, rc = 0;
+ char *end, str[32];
+
+ if (write) {
+ /*
+		 * We can't use proc_doulongvec_minmax() in the write
+		 * case here because the hostid, while a hex value, has
+		 * no leading 0x, which confuses the helper function.
+ */
+ rc = proc_copyin_string(str, sizeof (str), buffer, *lenp);
+ if (rc < 0)
+ return (rc);
+
+ spl_hostid = simple_strtoul(str, &end, 16);
+ if (str == end)
+ return (-EINVAL);
+
+ } else {
+ len = snprintf(str, sizeof (str), "%lx",
+ (unsigned long) zone_get_hostid(NULL));
+ if (*ppos >= len)
+ rc = 0;
+ else
+ rc = proc_copyout_string(buffer,
+ *lenp, str + *ppos, "\n");
+
+ if (rc >= 0) {
+ *lenp = rc;
+ *ppos += rc;
+ }
+ }
+
+ return (rc);
+}
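+
+/*
+ * Worked example (illustrative): writing the string "0123abcd" to
+ * /proc/sys/kernel/spl/hostid is parsed above via simple_strtoul(str,
+ * &end, 16), so the value must be hex with no leading "0x"; reading the
+ * entry back then returns "123abcd\n".
+ */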
+
+static void
+taskq_seq_show_headers(struct seq_file *f)
+{
+ seq_printf(f, "%-25s %5s %5s %5s %5s %5s %5s %12s %5s %10s\n",
+ "taskq", "act", "nthr", "spwn", "maxt", "pri",
+ "mina", "maxa", "cura", "flags");
+}
+
+/* indices into the lheads array below */
+#define LHEAD_PEND 0
+#define LHEAD_PRIO 1
+#define LHEAD_DELAY 2
+#define LHEAD_WAIT 3
+#define LHEAD_ACTIVE 4
+#define LHEAD_SIZE 5
+
+/* BEGIN CSTYLED */
+static unsigned int spl_max_show_tasks = 512;
+module_param(spl_max_show_tasks, uint, 0644);
+MODULE_PARM_DESC(spl_max_show_tasks, "Max number of tasks shown in taskq proc");
+/* END CSTYLED */
+
+static int
+taskq_seq_show_impl(struct seq_file *f, void *p, boolean_t allflag)
+{
+ taskq_t *tq = p;
+ taskq_thread_t *tqt;
+ spl_wait_queue_entry_t *wq;
+ struct task_struct *tsk;
+ taskq_ent_t *tqe;
+ char name[100];
+ struct list_head *lheads[LHEAD_SIZE], *lh;
+ static char *list_names[LHEAD_SIZE] =
+ {"pend", "prio", "delay", "wait", "active" };
+ int i, j, have_lheads = 0;
+ unsigned long wflags, flags;
+
+ spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class);
+ spin_lock_irqsave(&tq->tq_wait_waitq.lock, wflags);
+
+ /* get the various lists and check whether they're empty */
+ lheads[LHEAD_PEND] = &tq->tq_pend_list;
+ lheads[LHEAD_PRIO] = &tq->tq_prio_list;
+ lheads[LHEAD_DELAY] = &tq->tq_delay_list;
+#ifdef HAVE_WAIT_QUEUE_HEAD_ENTRY
+ lheads[LHEAD_WAIT] = &tq->tq_wait_waitq.head;
+#else
+ lheads[LHEAD_WAIT] = &tq->tq_wait_waitq.task_list;
+#endif
+ lheads[LHEAD_ACTIVE] = &tq->tq_active_list;
+
+ for (i = 0; i < LHEAD_SIZE; ++i) {
+ if (list_empty(lheads[i]))
+ lheads[i] = NULL;
+ else
+ ++have_lheads;
+ }
+
+ /* early return in non-"all" mode if lists are all empty */
+ if (!allflag && !have_lheads) {
+ spin_unlock_irqrestore(&tq->tq_wait_waitq.lock, wflags);
+ spin_unlock_irqrestore(&tq->tq_lock, flags);
+ return (0);
+ }
+
+ /* unlock the waitq quickly */
+ if (!lheads[LHEAD_WAIT])
+ spin_unlock_irqrestore(&tq->tq_wait_waitq.lock, wflags);
+
+ /* show the base taskq contents */
+ snprintf(name, sizeof (name), "%s/%d", tq->tq_name, tq->tq_instance);
+ seq_printf(f, "%-25s ", name);
+ seq_printf(f, "%5d %5d %5d %5d %5d %5d %12d %5d %10x\n",
+ tq->tq_nactive, tq->tq_nthreads, tq->tq_nspawn,
+ tq->tq_maxthreads, tq->tq_pri, tq->tq_minalloc, tq->tq_maxalloc,
+ tq->tq_nalloc, tq->tq_flags);
+
+ /* show the active list */
+ if (lheads[LHEAD_ACTIVE]) {
+ j = 0;
+ list_for_each_entry(tqt, &tq->tq_active_list, tqt_active_list) {
+ if (j == 0)
+ seq_printf(f, "\t%s:",
+ list_names[LHEAD_ACTIVE]);
+ else if (j == 2) {
+ seq_printf(f, "\n\t ");
+ j = 0;
+ }
+ seq_printf(f, " [%d]%pf(%ps)",
+ tqt->tqt_thread->pid,
+ tqt->tqt_task->tqent_func,
+ tqt->tqt_task->tqent_arg);
+ ++j;
+ }
+ seq_printf(f, "\n");
+ }
+
+ for (i = LHEAD_PEND; i <= LHEAD_WAIT; ++i)
+ if (lheads[i]) {
+ j = 0;
+ list_for_each(lh, lheads[i]) {
+ if (spl_max_show_tasks != 0 &&
+ j >= spl_max_show_tasks) {
+ seq_printf(f, "\n\t(truncated)");
+ break;
+ }
+ /* show the wait waitq list */
+ if (i == LHEAD_WAIT) {
+#ifdef HAVE_WAIT_QUEUE_HEAD_ENTRY
+ wq = list_entry(lh,
+ spl_wait_queue_entry_t, entry);
+#else
+ wq = list_entry(lh,
+ spl_wait_queue_entry_t, task_list);
+#endif
+ if (j == 0)
+ seq_printf(f, "\t%s:",
+ list_names[i]);
+ else if (j % 8 == 0)
+ seq_printf(f, "\n\t ");
+
+ tsk = wq->private;
+ seq_printf(f, " %d", tsk->pid);
+ /* pend, prio and delay lists */
+ } else {
+ tqe = list_entry(lh, taskq_ent_t,
+ tqent_list);
+ if (j == 0)
+ seq_printf(f, "\t%s:",
+ list_names[i]);
+ else if (j % 2 == 0)
+ seq_printf(f, "\n\t ");
+
+ seq_printf(f, " %pf(%ps)",
+ tqe->tqent_func,
+ tqe->tqent_arg);
+ }
+ ++j;
+ }
+ seq_printf(f, "\n");
+ }
+ if (lheads[LHEAD_WAIT])
+ spin_unlock_irqrestore(&tq->tq_wait_waitq.lock, wflags);
+ spin_unlock_irqrestore(&tq->tq_lock, flags);
+
+ return (0);
+}
+
+static int
+taskq_all_seq_show(struct seq_file *f, void *p)
+{
+ return (taskq_seq_show_impl(f, p, B_TRUE));
+}
+
+static int
+taskq_seq_show(struct seq_file *f, void *p)
+{
+ return (taskq_seq_show_impl(f, p, B_FALSE));
+}
+
+static void *
+taskq_seq_start(struct seq_file *f, loff_t *pos)
+{
+ struct list_head *p;
+ loff_t n = *pos;
+
+ down_read(&tq_list_sem);
+ if (!n)
+ taskq_seq_show_headers(f);
+
+ p = tq_list.next;
+ while (n--) {
+ p = p->next;
+ if (p == &tq_list)
+ return (NULL);
+ }
+
+ return (list_entry(p, taskq_t, tq_taskqs));
+}
+
+static void *
+taskq_seq_next(struct seq_file *f, void *p, loff_t *pos)
+{
+ taskq_t *tq = p;
+
+ ++*pos;
+ return ((tq->tq_taskqs.next == &tq_list) ?
+ NULL : list_entry(tq->tq_taskqs.next, taskq_t, tq_taskqs));
+}
+
+static void
+slab_seq_show_headers(struct seq_file *f)
+{
+ seq_printf(f,
+ "--------------------- cache ----------"
+ "--------------------------------------------- "
+ "----- slab ------ "
+ "---- object ----- "
+ "--- emergency ---\n");
+ seq_printf(f,
+ "name "
+ " flags size alloc slabsize objsize "
+ "total alloc max "
+ "total alloc max "
+ "dlock alloc max\n");
+}
+
+static int
+slab_seq_show(struct seq_file *f, void *p)
+{
+ spl_kmem_cache_t *skc = p;
+
+ ASSERT(skc->skc_magic == SKC_MAGIC);
+
+ /*
+	 * This cache is backed by the Linux slab allocator; see /proc/slabinfo.
+ */
+ if (skc->skc_flags & KMC_SLAB)
+ return (0);
+
+ spin_lock(&skc->skc_lock);
+ seq_printf(f, "%-36s ", skc->skc_name);
+ seq_printf(f, "0x%05lx %9lu %9lu %8u %8u "
+ "%5lu %5lu %5lu %5lu %5lu %5lu %5lu %5lu %5lu\n",
+ (long unsigned)skc->skc_flags,
+ (long unsigned)(skc->skc_slab_size * skc->skc_slab_total),
+ (long unsigned)(skc->skc_obj_size * skc->skc_obj_alloc),
+ (unsigned)skc->skc_slab_size,
+ (unsigned)skc->skc_obj_size,
+ (long unsigned)skc->skc_slab_total,
+ (long unsigned)skc->skc_slab_alloc,
+ (long unsigned)skc->skc_slab_max,
+ (long unsigned)skc->skc_obj_total,
+ (long unsigned)skc->skc_obj_alloc,
+ (long unsigned)skc->skc_obj_max,
+ (long unsigned)skc->skc_obj_deadlock,
+ (long unsigned)skc->skc_obj_emergency,
+ (long unsigned)skc->skc_obj_emergency_max);
+
+ spin_unlock(&skc->skc_lock);
+
+ return (0);
+}
+
+static void *
+slab_seq_start(struct seq_file *f, loff_t *pos)
+{
+ struct list_head *p;
+ loff_t n = *pos;
+
+ down_read(&spl_kmem_cache_sem);
+ if (!n)
+ slab_seq_show_headers(f);
+
+ p = spl_kmem_cache_list.next;
+ while (n--) {
+ p = p->next;
+ if (p == &spl_kmem_cache_list)
+ return (NULL);
+ }
+
+ return (list_entry(p, spl_kmem_cache_t, skc_list));
+}
+
+static void *
+slab_seq_next(struct seq_file *f, void *p, loff_t *pos)
+{
+ spl_kmem_cache_t *skc = p;
+
+ ++*pos;
+ return ((skc->skc_list.next == &spl_kmem_cache_list) ?
+ NULL : list_entry(skc->skc_list.next, spl_kmem_cache_t, skc_list));
+}
+
+static void
+slab_seq_stop(struct seq_file *f, void *v)
+{
+ up_read(&spl_kmem_cache_sem);
+}
+
+static struct seq_operations slab_seq_ops = {
+ .show = slab_seq_show,
+ .start = slab_seq_start,
+ .next = slab_seq_next,
+ .stop = slab_seq_stop,
+};
+
+static int
+proc_slab_open(struct inode *inode, struct file *filp)
+{
+ return (seq_open(filp, &slab_seq_ops));
+}
+
+static struct file_operations proc_slab_operations = {
+ .open = proc_slab_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static void
+taskq_seq_stop(struct seq_file *f, void *v)
+{
+ up_read(&tq_list_sem);
+}
+
+static struct seq_operations taskq_all_seq_ops = {
+ .show = taskq_all_seq_show,
+ .start = taskq_seq_start,
+ .next = taskq_seq_next,
+ .stop = taskq_seq_stop,
+};
+
+static struct seq_operations taskq_seq_ops = {
+ .show = taskq_seq_show,
+ .start = taskq_seq_start,
+ .next = taskq_seq_next,
+ .stop = taskq_seq_stop,
+};
+
+static int
+proc_taskq_all_open(struct inode *inode, struct file *filp)
+{
+ return (seq_open(filp, &taskq_all_seq_ops));
+}
+
+static int
+proc_taskq_open(struct inode *inode, struct file *filp)
+{
+ return (seq_open(filp, &taskq_seq_ops));
+}
+
+static struct file_operations proc_taskq_all_operations = {
+ .open = proc_taskq_all_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static struct file_operations proc_taskq_operations = {
+ .open = proc_taskq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static struct ctl_table spl_kmem_table[] = {
+#ifdef DEBUG_KMEM
+ {
+ .procname = "kmem_used",
+ .data = &kmem_alloc_used,
+#ifdef HAVE_ATOMIC64_T
+ .maxlen = sizeof (atomic64_t),
+#else
+ .maxlen = sizeof (atomic_t),
+#endif /* HAVE_ATOMIC64_T */
+ .mode = 0444,
+ .proc_handler = &proc_domemused,
+ },
+ {
+ .procname = "kmem_max",
+ .data = &kmem_alloc_max,
+ .maxlen = sizeof (unsigned long),
+ .extra1 = &table_min,
+ .extra2 = &table_max,
+ .mode = 0444,
+ .proc_handler = &proc_doulongvec_minmax,
+ },
+#endif /* DEBUG_KMEM */
+ {
+ .procname = "slab_kmem_total",
+ .data = (void *)(KMC_KMEM | KMC_TOTAL),
+ .maxlen = sizeof (unsigned long),
+ .extra1 = &table_min,
+ .extra2 = &table_max,
+ .mode = 0444,
+ .proc_handler = &proc_doslab,
+ },
+ {
+ .procname = "slab_kmem_alloc",
+ .data = (void *)(KMC_KMEM | KMC_ALLOC),
+ .maxlen = sizeof (unsigned long),
+ .extra1 = &table_min,
+ .extra2 = &table_max,
+ .mode = 0444,
+ .proc_handler = &proc_doslab,
+ },
+ {
+ .procname = "slab_kmem_max",
+ .data = (void *)(KMC_KMEM | KMC_MAX),
+ .maxlen = sizeof (unsigned long),
+ .extra1 = &table_min,
+ .extra2 = &table_max,
+ .mode = 0444,
+ .proc_handler = &proc_doslab,
+ },
+ {
+ .procname = "slab_vmem_total",
+ .data = (void *)(KMC_VMEM | KMC_TOTAL),
+ .maxlen = sizeof (unsigned long),
+ .extra1 = &table_min,
+ .extra2 = &table_max,
+ .mode = 0444,
+ .proc_handler = &proc_doslab,
+ },
+ {
+ .procname = "slab_vmem_alloc",
+ .data = (void *)(KMC_VMEM | KMC_ALLOC),
+ .maxlen = sizeof (unsigned long),
+ .extra1 = &table_min,
+ .extra2 = &table_max,
+ .mode = 0444,
+ .proc_handler = &proc_doslab,
+ },
+ {
+ .procname = "slab_vmem_max",
+ .data = (void *)(KMC_VMEM | KMC_MAX),
+ .maxlen = sizeof (unsigned long),
+ .extra1 = &table_min,
+ .extra2 = &table_max,
+ .mode = 0444,
+ .proc_handler = &proc_doslab,
+ },
+ {},
+};
+
+static struct ctl_table spl_kstat_table[] = {
+ {},
+};
+
+static struct ctl_table spl_table[] = {
+ /*
+ * NB No .strategy entries have been provided since
+ * sysctl(8) prefers to go via /proc for portability.
+ */
+ {
+ .procname = "version",
+ .data = spl_version,
+ .maxlen = sizeof (spl_version),
+ .mode = 0444,
+ .proc_handler = &proc_dostring,
+ },
+ {
+ .procname = "hostid",
+ .data = &spl_hostid,
+ .maxlen = sizeof (unsigned long),
+ .mode = 0644,
+ .proc_handler = &proc_dohostid,
+ },
+ {
+ .procname = "kmem",
+ .mode = 0555,
+ .child = spl_kmem_table,
+ },
+ {
+ .procname = "kstat",
+ .mode = 0555,
+ .child = spl_kstat_table,
+ },
+ {},
+};
+
+static struct ctl_table spl_dir[] = {
+ {
+ .procname = "spl",
+ .mode = 0555,
+ .child = spl_table,
+ },
+ {}
+};
+
+static struct ctl_table spl_root[] = {
+ {
+#ifdef HAVE_CTL_NAME
+ .ctl_name = CTL_KERN,
+#endif
+ .procname = "kernel",
+ .mode = 0555,
+ .child = spl_dir,
+ },
+ {}
+};
+
+int
+spl_proc_init(void)
+{
+ int rc = 0;
+
+ spl_header = register_sysctl_table(spl_root);
+ if (spl_header == NULL)
+ return (-EUNATCH);
+
+ proc_spl = proc_mkdir("spl", NULL);
+ if (proc_spl == NULL) {
+ rc = -EUNATCH;
+ goto out;
+ }
+
+ proc_spl_taskq_all = proc_create_data("taskq-all", 0444, proc_spl,
+ &proc_taskq_all_operations, NULL);
+ if (proc_spl_taskq_all == NULL) {
+ rc = -EUNATCH;
+ goto out;
+ }
+
+ proc_spl_taskq = proc_create_data("taskq", 0444, proc_spl,
+ &proc_taskq_operations, NULL);
+ if (proc_spl_taskq == NULL) {
+ rc = -EUNATCH;
+ goto out;
+ }
+
+ proc_spl_kmem = proc_mkdir("kmem", proc_spl);
+ if (proc_spl_kmem == NULL) {
+ rc = -EUNATCH;
+ goto out;
+ }
+
+ proc_spl_kmem_slab = proc_create_data("slab", 0444, proc_spl_kmem,
+ &proc_slab_operations, NULL);
+ if (proc_spl_kmem_slab == NULL) {
+ rc = -EUNATCH;
+ goto out;
+ }
+
+ proc_spl_kstat = proc_mkdir("kstat", proc_spl);
+ if (proc_spl_kstat == NULL) {
+ rc = -EUNATCH;
+ goto out;
+ }
+out:
+ if (rc) {
+ remove_proc_entry("kstat", proc_spl);
+ remove_proc_entry("slab", proc_spl_kmem);
+ remove_proc_entry("kmem", proc_spl);
+ remove_proc_entry("taskq-all", proc_spl);
+ remove_proc_entry("taskq", proc_spl);
+ remove_proc_entry("spl", NULL);
+ unregister_sysctl_table(spl_header);
+ }
+
+ return (rc);
+}
+
+void
+spl_proc_fini(void)
+{
+ remove_proc_entry("kstat", proc_spl);
+ remove_proc_entry("slab", proc_spl_kmem);
+ remove_proc_entry("kmem", proc_spl);
+ remove_proc_entry("taskq-all", proc_spl);
+ remove_proc_entry("taskq", proc_spl);
+ remove_proc_entry("spl", NULL);
+
+ ASSERT(spl_header != NULL);
+ unregister_sysctl_table(spl_header);
+}
diff --git a/module/spl/spl-rwlock.c b/module/spl/spl-rwlock.c
new file mode 100644
index 000000000..9a992cc3a
--- /dev/null
+++ b/module/spl/spl-rwlock.c
@@ -0,0 +1,114 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Solaris Porting Layer (SPL) Reader/Writer Lock Implementation.
+ */
+
+#include <sys/rwlock.h>
+
+#if defined(CONFIG_PREEMPT_RT_FULL)
+
+#include <linux/rtmutex.h>
+#define RT_MUTEX_OWNER_MASKALL 1UL
+
+static int
+__rwsem_tryupgrade(struct rw_semaphore *rwsem)
+{
+
+ ASSERT((struct task_struct *)
+ ((unsigned long)rwsem->lock.owner & ~RT_MUTEX_OWNER_MASKALL) ==
+ current);
+
+ /*
+ * Under the realtime patch series, rwsem is implemented as a
+ * single mutex held by readers and writers alike. However,
+ * this implementation would prevent a thread from taking a
+ * read lock twice, as the mutex would already be locked on
+ * the second attempt. Therefore the implementation allows a
+	 * single thread to take an rwsem as a read lock multiple times,
+	 * tracking that nesting in its read_depth counter.
+ */
+ if (rwsem->read_depth <= 1) {
+ /*
+		 * If the current thread has not taken the lock
+		 * more than once as a read lock, we can allow an
+ * upgrade to a write lock. rwsem_rt.h implements
+ * write locks as read_depth == 0.
+ */
+ rwsem->read_depth = 0;
+ return (1);
+ }
+ return (0);
+}
+#elif defined(CONFIG_RWSEM_GENERIC_SPINLOCK)
+static int
+__rwsem_tryupgrade(struct rw_semaphore *rwsem)
+{
+ int ret = 0;
+ unsigned long flags;
+ spl_rwsem_lock_irqsave(&rwsem->wait_lock, flags);
+ if (RWSEM_COUNT(rwsem) == SPL_RWSEM_SINGLE_READER_VALUE &&
+ list_empty(&rwsem->wait_list)) {
+ ret = 1;
+ RWSEM_COUNT(rwsem) = SPL_RWSEM_SINGLE_WRITER_VALUE;
+ }
+ spl_rwsem_unlock_irqrestore(&rwsem->wait_lock, flags);
+ return (ret);
+}
+#elif defined(HAVE_RWSEM_ATOMIC_LONG_COUNT)
+static int
+__rwsem_tryupgrade(struct rw_semaphore *rwsem)
+{
+ long val;
+ val = atomic_long_cmpxchg(&rwsem->count, SPL_RWSEM_SINGLE_READER_VALUE,
+ SPL_RWSEM_SINGLE_WRITER_VALUE);
+ return (val == SPL_RWSEM_SINGLE_READER_VALUE);
+}
+#else
+static int
+__rwsem_tryupgrade(struct rw_semaphore *rwsem)
+{
+ typeof(rwsem->count) val;
+ val = cmpxchg(&rwsem->count, SPL_RWSEM_SINGLE_READER_VALUE,
+ SPL_RWSEM_SINGLE_WRITER_VALUE);
+ return (val == SPL_RWSEM_SINGLE_READER_VALUE);
+}
+#endif
+
+int
+rwsem_tryupgrade(struct rw_semaphore *rwsem)
+{
+ if (__rwsem_tryupgrade(rwsem)) {
+ rwsem_release(&rwsem->dep_map, 1, _RET_IP_);
+ rwsem_acquire(&rwsem->dep_map, 0, 1, _RET_IP_);
+#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
+ rwsem->owner = current;
+#endif
+ return (1);
+ }
+ return (0);
+}
+EXPORT_SYMBOL(rwsem_tryupgrade);
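+
+/*
+ * Usage sketch (illustrative only): the caller must already hold the
+ * rwsem as the sole reader; on failure it falls back to the classic
+ * drop-and-relock sequence and must re-validate any state afterwards:
+ *
+ *	down_read(rwsem);
+ *	...
+ *	if (!rwsem_tryupgrade(rwsem)) {
+ *		up_read(rwsem);
+ *		down_write(rwsem);
+ *		... another writer may have run in the gap ...
+ *	}
+ *	...
+ *	up_write(rwsem);
+ */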
+
+int spl_rw_init(void) { return 0; }
+void spl_rw_fini(void) { }
diff --git a/module/spl/spl-taskq.c b/module/spl/spl-taskq.c
new file mode 100644
index 000000000..2919a942a
--- /dev/null
+++ b/module/spl/spl-taskq.c
@@ -0,0 +1,1305 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Solaris Porting Layer (SPL) Task Queue Implementation.
+ */
+
+#include <sys/taskq.h>
+#include <sys/kmem.h>
+#include <sys/tsd.h>
+
+int spl_taskq_thread_bind = 0;
+module_param(spl_taskq_thread_bind, int, 0644);
+MODULE_PARM_DESC(spl_taskq_thread_bind, "Bind taskq thread to CPU by default");
+
+int spl_taskq_thread_dynamic = 1;
+module_param(spl_taskq_thread_dynamic, int, 0644);
+MODULE_PARM_DESC(spl_taskq_thread_dynamic, "Allow dynamic taskq threads");
+
+int spl_taskq_thread_priority = 1;
+module_param(spl_taskq_thread_priority, int, 0644);
+MODULE_PARM_DESC(spl_taskq_thread_priority,
+ "Allow non-default priority for taskq threads");
+
+int spl_taskq_thread_sequential = 4;
+module_param(spl_taskq_thread_sequential, int, 0644);
+MODULE_PARM_DESC(spl_taskq_thread_sequential,
+ "Create new taskq threads after N sequential tasks");
+
+/* Global system-wide dynamic task queue available for all consumers */
+taskq_t *system_taskq;
+EXPORT_SYMBOL(system_taskq);
+/* Global dynamic task queue for long delay */
+taskq_t *system_delay_taskq;
+EXPORT_SYMBOL(system_delay_taskq);
+
+/* Private dedicated taskq for creating new taskq threads on demand. */
+static taskq_t *dynamic_taskq;
+static taskq_thread_t *taskq_thread_create(taskq_t *);
+
+/* List of all taskqs */
+LIST_HEAD(tq_list);
+DECLARE_RWSEM(tq_list_sem);
+static uint_t taskq_tsd;
+
+static int
+task_km_flags(uint_t flags)
+{
+ if (flags & TQ_NOSLEEP)
+ return (KM_NOSLEEP);
+
+ if (flags & TQ_PUSHPAGE)
+ return (KM_PUSHPAGE);
+
+ return (KM_SLEEP);
+}
+
+/*
+ * taskq_find_by_name - Find the largest instance number of a named taskq.
+ */
+static int
+taskq_find_by_name(const char *name)
+{
+ struct list_head *tql;
+ taskq_t *tq;
+
+ list_for_each_prev(tql, &tq_list) {
+ tq = list_entry(tql, taskq_t, tq_taskqs);
+ if (strcmp(name, tq->tq_name) == 0)
+ return (tq->tq_instance);
+ }
+ return (-1);
+}
+
+/*
+ * NOTE: Must be called with tq->tq_lock held, returns a list_t which
+ * is not attached to the free, work, or pending taskq lists.
+ */
+static taskq_ent_t *
+task_alloc(taskq_t *tq, uint_t flags, unsigned long *irqflags)
+{
+ taskq_ent_t *t;
+ int count = 0;
+
+ ASSERT(tq);
+retry:
+ /* Acquire taskq_ent_t's from free list if available */
+ if (!list_empty(&tq->tq_free_list) && !(flags & TQ_NEW)) {
+ t = list_entry(tq->tq_free_list.next, taskq_ent_t, tqent_list);
+
+ ASSERT(!(t->tqent_flags & TQENT_FLAG_PREALLOC));
+ ASSERT(!(t->tqent_flags & TQENT_FLAG_CANCEL));
+ ASSERT(!timer_pending(&t->tqent_timer));
+
+ list_del_init(&t->tqent_list);
+ return (t);
+ }
+
+ /* Free list is empty and memory allocations are prohibited */
+ if (flags & TQ_NOALLOC)
+ return (NULL);
+
+ /* Hit maximum taskq_ent_t pool size */
+ if (tq->tq_nalloc >= tq->tq_maxalloc) {
+ if (flags & TQ_NOSLEEP)
+ return (NULL);
+
+ /*
+ * Sleep periodically polling the free list for an available
+ * taskq_ent_t. Dispatching with TQ_SLEEP should always succeed
+		 * but we cannot block forever waiting for a taskq_ent_t to
+ * show up in the free list, otherwise a deadlock can happen.
+ *
+ * Therefore, we need to allocate a new task even if the number
+ * of allocated tasks is above tq->tq_maxalloc, but we still
+ * end up delaying the task allocation by one second, thereby
+ * throttling the task dispatch rate.
+ */
+ spin_unlock_irqrestore(&tq->tq_lock, *irqflags);
+ schedule_timeout(HZ / 100);
+ spin_lock_irqsave_nested(&tq->tq_lock, *irqflags,
+ tq->tq_lock_class);
+ if (count < 100) {
+ count++;
+ goto retry;
+ }
+ }
+
+ spin_unlock_irqrestore(&tq->tq_lock, *irqflags);
+ t = kmem_alloc(sizeof (taskq_ent_t), task_km_flags(flags));
+ spin_lock_irqsave_nested(&tq->tq_lock, *irqflags, tq->tq_lock_class);
+
+ if (t) {
+ taskq_init_ent(t);
+ tq->tq_nalloc++;
+ }
+
+ return (t);
+}
+
+/*
+ * NOTE: Must be called with tq->tq_lock held, expects the taskq_ent_t
+ * to already be removed from the free, work, or pending taskq lists.
+ */
+static void
+task_free(taskq_t *tq, taskq_ent_t *t)
+{
+ ASSERT(tq);
+ ASSERT(t);
+ ASSERT(list_empty(&t->tqent_list));
+ ASSERT(!timer_pending(&t->tqent_timer));
+
+ kmem_free(t, sizeof (taskq_ent_t));
+ tq->tq_nalloc--;
+}
+
+/*
+ * NOTE: Must be called with tq->tq_lock held, either destroys the
+ * taskq_ent_t if too many exist or moves it to the free list for later use.
+ */
+static void
+task_done(taskq_t *tq, taskq_ent_t *t)
+{
+ ASSERT(tq);
+ ASSERT(t);
+
+ /* Wake tasks blocked in taskq_wait_id() */
+ wake_up_all(&t->tqent_waitq);
+
+ list_del_init(&t->tqent_list);
+
+ if (tq->tq_nalloc <= tq->tq_minalloc) {
+ t->tqent_id = TASKQID_INVALID;
+ t->tqent_func = NULL;
+ t->tqent_arg = NULL;
+ t->tqent_flags = 0;
+
+ list_add_tail(&t->tqent_list, &tq->tq_free_list);
+ } else {
+ task_free(tq, t);
+ }
+}
+
+/*
+ * When a delayed task timer expires, remove it from the delay list and
+ * add it to the priority list for immediate processing.
+ */
+static void
+task_expire_impl(taskq_ent_t *t)
+{
+ taskq_ent_t *w;
+ taskq_t *tq = t->tqent_taskq;
+ struct list_head *l;
+ unsigned long flags;
+
+ spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class);
+
+ if (t->tqent_flags & TQENT_FLAG_CANCEL) {
+ ASSERT(list_empty(&t->tqent_list));
+ spin_unlock_irqrestore(&tq->tq_lock, flags);
+ return;
+ }
+
+ t->tqent_birth = jiffies;
+ /*
+ * The priority list must be maintained in strict task id order
+ * from lowest to highest for lowest_id to be easily calculable.
+ */
+ list_del(&t->tqent_list);
+ list_for_each_prev(l, &tq->tq_prio_list) {
+ w = list_entry(l, taskq_ent_t, tqent_list);
+ if (w->tqent_id < t->tqent_id) {
+ list_add(&t->tqent_list, l);
+ break;
+ }
+ }
+ if (l == &tq->tq_prio_list)
+ list_add(&t->tqent_list, &tq->tq_prio_list);
+
+ spin_unlock_irqrestore(&tq->tq_lock, flags);
+
+ wake_up(&tq->tq_work_waitq);
+}
+
+#ifdef HAVE_KERNEL_TIMER_FUNCTION_TIMER_LIST
+static void
+task_expire(struct timer_list *tl)
+{
+ taskq_ent_t *t = from_timer(t, tl, tqent_timer);
+ task_expire_impl(t);
+}
+#else
+static void
+task_expire(unsigned long data)
+{
+ task_expire_impl((taskq_ent_t *)data);
+}
+#endif
+
+/*
+ * Returns the lowest incomplete taskqid_t. The taskqid_t may
+ * be queued on the pending list, on the priority list, on the
+ * delay list, or on the work list currently being handled, but
+ * it is not 100% complete yet.
+ */
+static taskqid_t
+taskq_lowest_id(taskq_t *tq)
+{
+ taskqid_t lowest_id = tq->tq_next_id;
+ taskq_ent_t *t;
+ taskq_thread_t *tqt;
+
+ ASSERT(tq);
+
+ if (!list_empty(&tq->tq_pend_list)) {
+ t = list_entry(tq->tq_pend_list.next, taskq_ent_t, tqent_list);
+ lowest_id = MIN(lowest_id, t->tqent_id);
+ }
+
+ if (!list_empty(&tq->tq_prio_list)) {
+ t = list_entry(tq->tq_prio_list.next, taskq_ent_t, tqent_list);
+ lowest_id = MIN(lowest_id, t->tqent_id);
+ }
+
+ if (!list_empty(&tq->tq_delay_list)) {
+ t = list_entry(tq->tq_delay_list.next, taskq_ent_t, tqent_list);
+ lowest_id = MIN(lowest_id, t->tqent_id);
+ }
+
+ if (!list_empty(&tq->tq_active_list)) {
+ tqt = list_entry(tq->tq_active_list.next, taskq_thread_t,
+ tqt_active_list);
+ ASSERT(tqt->tqt_id != TASKQID_INVALID);
+ lowest_id = MIN(lowest_id, tqt->tqt_id);
+ }
+
+ return (lowest_id);
+}
+
+/*
+ * Insert a task into a list keeping the list sorted by increasing taskqid.
+ */
+static void
+taskq_insert_in_order(taskq_t *tq, taskq_thread_t *tqt)
+{
+ taskq_thread_t *w;
+ struct list_head *l;
+
+ ASSERT(tq);
+ ASSERT(tqt);
+
+ list_for_each_prev(l, &tq->tq_active_list) {
+ w = list_entry(l, taskq_thread_t, tqt_active_list);
+ if (w->tqt_id < tqt->tqt_id) {
+ list_add(&tqt->tqt_active_list, l);
+ break;
+ }
+ }
+ if (l == &tq->tq_active_list)
+ list_add(&tqt->tqt_active_list, &tq->tq_active_list);
+}
+
+/*
+ * Find and return a task from the given list if it exists. The list
+ * must be in lowest to highest task id order.
+ */
+static taskq_ent_t *
+taskq_find_list(taskq_t *tq, struct list_head *lh, taskqid_t id)
+{
+ struct list_head *l;
+ taskq_ent_t *t;
+
+ list_for_each(l, lh) {
+ t = list_entry(l, taskq_ent_t, tqent_list);
+
+ if (t->tqent_id == id)
+ return (t);
+
+ if (t->tqent_id > id)
+ break;
+ }
+
+ return (NULL);
+}
+
+/*
+ * Find an already dispatched task given the task id regardless of what
+ * state it is in. If a task is still pending it will be returned.
+ * If a task is executing, then -EBUSY will be returned instead.
+ * If the task has already been run then NULL is returned.
+ */
+static taskq_ent_t *
+taskq_find(taskq_t *tq, taskqid_t id)
+{
+ taskq_thread_t *tqt;
+ struct list_head *l;
+ taskq_ent_t *t;
+
+ t = taskq_find_list(tq, &tq->tq_delay_list, id);
+ if (t)
+ return (t);
+
+ t = taskq_find_list(tq, &tq->tq_prio_list, id);
+ if (t)
+ return (t);
+
+ t = taskq_find_list(tq, &tq->tq_pend_list, id);
+ if (t)
+ return (t);
+
+ list_for_each(l, &tq->tq_active_list) {
+ tqt = list_entry(l, taskq_thread_t, tqt_active_list);
+ if (tqt->tqt_id == id) {
+ /*
+			 * Instead of returning tqt_task, we just return a
+			 * non-NULL value to prevent misuse, since tqt_task only
+ * has two valid fields.
+ */
+ return (ERR_PTR(-EBUSY));
+ }
+ }
+
+ return (NULL);
+}
+
+/*
+ * Theory for the taskq_wait_id(), taskq_wait_outstanding(), and
+ * taskq_wait() functions below.
+ *
+ * Taskq waiting is accomplished by tracking the lowest outstanding task
+ * id and the next available task id. As tasks are dispatched they are
+ * added to the tail of the pending, priority, or delay lists. As worker
+ * threads become available the tasks are removed from the heads of these
+ * lists and linked to the worker threads. This ensures the lists are
+ * kept sorted by lowest to highest task id.
+ *
+ * Therefore the lowest outstanding task id can be quickly determined by
+ * checking the head item from all of these lists. This value is stored
+ * with the taskq as the lowest id. It only needs to be recalculated when
+ * either the task with the current lowest id completes or is canceled.
+ *
+ * By blocking until the lowest task id exceeds the passed task id the
+ * taskq_wait_outstanding() function can be easily implemented. Similarly,
+ * by blocking until the lowest task id matches the next task id taskq_wait()
+ * can be implemented.
+ *
+ * Callers should be aware that when there are multiple worker threads it
+ * is possible for larger task ids to complete before smaller ones. Also,
+ * when the taskq contains delay tasks with small task ids, callers may
+ * block for a considerable length of time waiting for them to expire and
+ * execute.
+ */
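+
+/*
+ * Worked example (illustrative): with pending ids {7, 9}, priority ids
+ * {6}, and an active worker running id 5, taskq_lowest_id() returns 5.
+ * A taskq_wait_outstanding(tq, 4) caller may therefore return, while a
+ * taskq_wait_outstanding(tq, 5) caller keeps blocking until id 5 is done.
+ */
+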
+static int
+taskq_wait_id_check(taskq_t *tq, taskqid_t id)
+{
+ int rc;
+ unsigned long flags;
+
+ spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class);
+ rc = (taskq_find(tq, id) == NULL);
+ spin_unlock_irqrestore(&tq->tq_lock, flags);
+
+ return (rc);
+}
+
+/*
+ * The taskq_wait_id() function blocks until the passed task id completes.
+ * This does not guarantee that all lower task ids have completed.
+ */
+void
+taskq_wait_id(taskq_t *tq, taskqid_t id)
+{
+ wait_event(tq->tq_wait_waitq, taskq_wait_id_check(tq, id));
+}
+EXPORT_SYMBOL(taskq_wait_id);
+
+static int
+taskq_wait_outstanding_check(taskq_t *tq, taskqid_t id)
+{
+ int rc;
+ unsigned long flags;
+
+ spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class);
+ rc = (id < tq->tq_lowest_id);
+ spin_unlock_irqrestore(&tq->tq_lock, flags);
+
+ return (rc);
+}
+
+/*
+ * The taskq_wait_outstanding() function will block until all tasks with a
+ * lower taskqid than the passed 'id' have been completed. Note that all
+ * task ids are assigned monotonically at dispatch time. Zero may be
+ * passed for the id to indicate that all tasks dispatched up to this point,
+ * but not after, should be waited for.
+ */
+void
+taskq_wait_outstanding(taskq_t *tq, taskqid_t id)
+{
+ id = id ? id : tq->tq_next_id - 1;
+ wait_event(tq->tq_wait_waitq, taskq_wait_outstanding_check(tq, id));
+}
+EXPORT_SYMBOL(taskq_wait_outstanding);
+
+static int
+taskq_wait_check(taskq_t *tq)
+{
+ int rc;
+ unsigned long flags;
+
+ spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class);
+ rc = (tq->tq_lowest_id == tq->tq_next_id);
+ spin_unlock_irqrestore(&tq->tq_lock, flags);
+
+ return (rc);
+}
+
+/*
+ * The taskq_wait() function will block until the taskq is empty.
+ * This means that if a taskq re-dispatches work to itself taskq_wait()
+ * callers will block indefinitely.
+ */
+void
+taskq_wait(taskq_t *tq)
+{
+ wait_event(tq->tq_wait_waitq, taskq_wait_check(tq));
+}
+EXPORT_SYMBOL(taskq_wait);
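+
+/*
+ * Usage sketch (illustrative only; my_func and arg are hypothetical):
+ *
+ *	taskqid_t id = taskq_dispatch(tq, my_func, arg, TQ_SLEEP);
+ *	if (id != TASKQID_INVALID)
+ *		taskq_wait_id(tq, id);		wait for this task only
+ *	taskq_wait(tq);				or drain the whole taskq
+ */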
+
+int
+taskq_member(taskq_t *tq, kthread_t *t)
+{
+ return (tq == (taskq_t *)tsd_get_by_thread(taskq_tsd, t));
+}
+EXPORT_SYMBOL(taskq_member);
+
+/*
+ * Cancel an already dispatched task given the task id. Still pending tasks
+ * will be immediately canceled, and if the task is active the function will
+ * block until it completes. Preallocated tasks which are canceled must be
+ * freed by the caller.
+ */
+int
+taskq_cancel_id(taskq_t *tq, taskqid_t id)
+{
+ taskq_ent_t *t;
+ int rc = ENOENT;
+ unsigned long flags;
+
+ ASSERT(tq);
+
+ spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class);
+ t = taskq_find(tq, id);
+ if (t && t != ERR_PTR(-EBUSY)) {
+ list_del_init(&t->tqent_list);
+ t->tqent_flags |= TQENT_FLAG_CANCEL;
+
+ /*
+ * When canceling the lowest outstanding task id we
+ * must recalculate the new lowest outstanding id.
+ */
+ if (tq->tq_lowest_id == t->tqent_id) {
+ tq->tq_lowest_id = taskq_lowest_id(tq);
+ ASSERT3S(tq->tq_lowest_id, >, t->tqent_id);
+ }
+
+ /*
+		 * The task_expire() function takes the tq->tq_lock so drop
+		 * the lock before synchronously cancelling the timer.
+ */
+ if (timer_pending(&t->tqent_timer)) {
+ spin_unlock_irqrestore(&tq->tq_lock, flags);
+ del_timer_sync(&t->tqent_timer);
+ spin_lock_irqsave_nested(&tq->tq_lock, flags,
+ tq->tq_lock_class);
+ }
+
+ if (!(t->tqent_flags & TQENT_FLAG_PREALLOC))
+ task_done(tq, t);
+
+ rc = 0;
+ }
+ spin_unlock_irqrestore(&tq->tq_lock, flags);
+
+ if (t == ERR_PTR(-EBUSY)) {
+ taskq_wait_id(tq, id);
+ rc = EBUSY;
+ }
+
+ return (rc);
+}
+EXPORT_SYMBOL(taskq_cancel_id);
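+
+/*
+ * Usage sketch (illustrative only): the return value distinguishes the
+ * three possible outcomes of a cancellation attempt:
+ *
+ *	int rc = taskq_cancel_id(tq, id);
+ *	if (rc == 0)		canceled before it ran
+ *	else if (rc == EBUSY)	it was running; we blocked until it was done
+ *	else			ENOENT: already completed or unknown id
+ */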
+
+static int taskq_thread_spawn(taskq_t *tq);
+
+taskqid_t
+taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t flags)
+{
+ taskq_ent_t *t;
+ taskqid_t rc = TASKQID_INVALID;
+ unsigned long irqflags;
+
+ ASSERT(tq);
+ ASSERT(func);
+
+ spin_lock_irqsave_nested(&tq->tq_lock, irqflags, tq->tq_lock_class);
+
+ /* Taskq being destroyed and all tasks drained */
+ if (!(tq->tq_flags & TASKQ_ACTIVE))
+ goto out;
+
+	/* Do not queue the task unless there is an idle thread for it */
+ ASSERT(tq->tq_nactive <= tq->tq_nthreads);
+ if ((flags & TQ_NOQUEUE) && (tq->tq_nactive == tq->tq_nthreads)) {
+ /* Dynamic taskq may be able to spawn another thread */
+ if (!(tq->tq_flags & TASKQ_DYNAMIC) ||
+ taskq_thread_spawn(tq) == 0)
+ goto out;
+ }
+
+ if ((t = task_alloc(tq, flags, &irqflags)) == NULL)
+ goto out;
+
+ spin_lock(&t->tqent_lock);
+
+ /* Queue to the front of the list to enforce TQ_NOQUEUE semantics */
+ if (flags & TQ_NOQUEUE)
+ list_add(&t->tqent_list, &tq->tq_prio_list);
+ /* Queue to the priority list instead of the pending list */
+ else if (flags & TQ_FRONT)
+ list_add_tail(&t->tqent_list, &tq->tq_prio_list);
+ else
+ list_add_tail(&t->tqent_list, &tq->tq_pend_list);
+
+ t->tqent_id = rc = tq->tq_next_id;
+ tq->tq_next_id++;
+ t->tqent_func = func;
+ t->tqent_arg = arg;
+ t->tqent_taskq = tq;
+#ifndef HAVE_KERNEL_TIMER_FUNCTION_TIMER_LIST
+ t->tqent_timer.data = 0;
+#endif
+ t->tqent_timer.function = NULL;
+ t->tqent_timer.expires = 0;
+ t->tqent_birth = jiffies;
+
+ ASSERT(!(t->tqent_flags & TQENT_FLAG_PREALLOC));
+
+ spin_unlock(&t->tqent_lock);
+
+ wake_up(&tq->tq_work_waitq);
+out:
+ /* Spawn additional taskq threads if required. */
+ if (!(flags & TQ_NOQUEUE) && tq->tq_nactive == tq->tq_nthreads)
+ (void) taskq_thread_spawn(tq);
+
+ spin_unlock_irqrestore(&tq->tq_lock, irqflags);
+ return (rc);
+}
+EXPORT_SYMBOL(taskq_dispatch);
+
+taskqid_t
+taskq_dispatch_delay(taskq_t *tq, task_func_t func, void *arg,
+ uint_t flags, clock_t expire_time)
+{
+ taskqid_t rc = TASKQID_INVALID;
+ taskq_ent_t *t;
+ unsigned long irqflags;
+
+ ASSERT(tq);
+ ASSERT(func);
+
+ spin_lock_irqsave_nested(&tq->tq_lock, irqflags, tq->tq_lock_class);
+
+ /* Taskq being destroyed and all tasks drained */
+ if (!(tq->tq_flags & TASKQ_ACTIVE))
+ goto out;
+
+ if ((t = task_alloc(tq, flags, &irqflags)) == NULL)
+ goto out;
+
+ spin_lock(&t->tqent_lock);
+
+ /* Queue to the delay list for subsequent execution */
+ list_add_tail(&t->tqent_list, &tq->tq_delay_list);
+
+ t->tqent_id = rc = tq->tq_next_id;
+ tq->tq_next_id++;
+ t->tqent_func = func;
+ t->tqent_arg = arg;
+ t->tqent_taskq = tq;
+#ifndef HAVE_KERNEL_TIMER_FUNCTION_TIMER_LIST
+ t->tqent_timer.data = (unsigned long)t;
+#endif
+ t->tqent_timer.function = task_expire;
+ t->tqent_timer.expires = (unsigned long)expire_time;
+ add_timer(&t->tqent_timer);
+
+ ASSERT(!(t->tqent_flags & TQENT_FLAG_PREALLOC));
+
+ spin_unlock(&t->tqent_lock);
+out:
+ /* Spawn additional taskq threads if required. */
+ if (tq->tq_nactive == tq->tq_nthreads)
+ (void) taskq_thread_spawn(tq);
+ spin_unlock_irqrestore(&tq->tq_lock, irqflags);
+ return (rc);
+}
+EXPORT_SYMBOL(taskq_dispatch_delay);
+
+void
+taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint_t flags,
+ taskq_ent_t *t)
+{
+ unsigned long irqflags;
+ ASSERT(tq);
+ ASSERT(func);
+
+ spin_lock_irqsave_nested(&tq->tq_lock, irqflags,
+ tq->tq_lock_class);
+
+ /* Taskq being destroyed and all tasks drained */
+ if (!(tq->tq_flags & TASKQ_ACTIVE)) {
+ t->tqent_id = TASKQID_INVALID;
+ goto out;
+ }
+
+ if ((flags & TQ_NOQUEUE) && (tq->tq_nactive == tq->tq_nthreads)) {
+ /* Dynamic taskq may be able to spawn another thread */
+ if (!(tq->tq_flags & TASKQ_DYNAMIC) ||
+ taskq_thread_spawn(tq) == 0)
+ goto out2;
+ flags |= TQ_FRONT;
+ }
+
+ spin_lock(&t->tqent_lock);
+
+ /*
+ * Make sure the entry is not on some other taskq; it is important
+ * to ASSERT() under lock.
+ */
+ ASSERT(taskq_empty_ent(t));
+
+ /*
+ * Mark it as a prealloc'd task. This is important
+ * to ensure that we don't free it later.
+ */
+ t->tqent_flags |= TQENT_FLAG_PREALLOC;
+
+ /* Queue to the priority list instead of the pending list */
+ if (flags & TQ_FRONT)
+ list_add_tail(&t->tqent_list, &tq->tq_prio_list);
+ else
+ list_add_tail(&t->tqent_list, &tq->tq_pend_list);
+
+ t->tqent_id = tq->tq_next_id;
+ tq->tq_next_id++;
+ t->tqent_func = func;
+ t->tqent_arg = arg;
+ t->tqent_taskq = tq;
+ t->tqent_birth = jiffies;
+
+ spin_unlock(&t->tqent_lock);
+
+ wake_up(&tq->tq_work_waitq);
+out:
+ /* Spawn additional taskq threads if required. */
+ if (tq->tq_nactive == tq->tq_nthreads)
+ (void) taskq_thread_spawn(tq);
+out2:
+ spin_unlock_irqrestore(&tq->tq_lock, irqflags);
+}
+EXPORT_SYMBOL(taskq_dispatch_ent);
+
+int
+taskq_empty_ent(taskq_ent_t *t)
+{
+ return (list_empty(&t->tqent_list));
+}
+EXPORT_SYMBOL(taskq_empty_ent);
+
+void
+taskq_init_ent(taskq_ent_t *t)
+{
+ spin_lock_init(&t->tqent_lock);
+ init_waitqueue_head(&t->tqent_waitq);
+#ifdef HAVE_KERNEL_TIMER_FUNCTION_TIMER_LIST
+ timer_setup(&t->tqent_timer, NULL, 0);
+#else
+ init_timer(&t->tqent_timer);
+#endif
+ INIT_LIST_HEAD(&t->tqent_list);
+ t->tqent_id = 0;
+ t->tqent_func = NULL;
+ t->tqent_arg = NULL;
+ t->tqent_flags = 0;
+ t->tqent_taskq = NULL;
+}
+EXPORT_SYMBOL(taskq_init_ent);
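+
+/*
+ * Illustrative sketch of the preallocated-entry pattern served by
+ * taskq_init_ent() and taskq_dispatch_ent(); my_obj_t, my_obj, and
+ * my_func are hypothetical. Embedding the taskq_ent_t in a longer
+ * lived object lets the dispatch path avoid a task_alloc():
+ *
+ *	typedef struct my_obj {
+ *		taskq_ent_t mo_tqent;
+ *		...
+ *	} my_obj_t;
+ *
+ *	taskq_init_ent(&my_obj->mo_tqent);
+ *	taskq_dispatch_ent(tq, my_func, my_obj, 0, &my_obj->mo_tqent);
+ */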
+
+/*
+ * Return the next pending task, preference is given to tasks on the
+ * priority list which were dispatched with TQ_FRONT.
+ */
+static taskq_ent_t *
+taskq_next_ent(taskq_t *tq)
+{
+ struct list_head *list;
+
+ if (!list_empty(&tq->tq_prio_list))
+ list = &tq->tq_prio_list;
+ else if (!list_empty(&tq->tq_pend_list))
+ list = &tq->tq_pend_list;
+ else
+ return (NULL);
+
+ return (list_entry(list->next, taskq_ent_t, tqent_list));
+}
+
+/*
+ * Spawns a new thread for the specified taskq.
+ */
+static void
+taskq_thread_spawn_task(void *arg)
+{
+ taskq_t *tq = (taskq_t *)arg;
+ unsigned long flags;
+
+ if (taskq_thread_create(tq) == NULL) {
+ /* restore spawning count if failed */
+ spin_lock_irqsave_nested(&tq->tq_lock, flags,
+ tq->tq_lock_class);
+ tq->tq_nspawn--;
+ spin_unlock_irqrestore(&tq->tq_lock, flags);
+ }
+}
+
+/*
+ * Spawn additional threads for dynamic taskqs (TASKQ_DYNAMIC) when the
+ * current number of threads is insufficient to handle the pending tasks.
+ * These new threads must be created by the dedicated dynamic_taskq to avoid
+ * deadlocks between thread creation and memory reclaim. The system_taskq
+ * which is also a dynamic taskq cannot be safely used for this.
+ */
+static int
+taskq_thread_spawn(taskq_t *tq)
+{
+ int spawning = 0;
+
+ if (!(tq->tq_flags & TASKQ_DYNAMIC))
+ return (0);
+
+ if ((tq->tq_nthreads + tq->tq_nspawn < tq->tq_maxthreads) &&
+ (tq->tq_flags & TASKQ_ACTIVE)) {
+ spawning = (++tq->tq_nspawn);
+ taskq_dispatch(dynamic_taskq, taskq_thread_spawn_task,
+ tq, TQ_NOSLEEP);
+ }
+
+ return (spawning);
+}
+
+/*
+ * Threads in a dynamic taskq should only exit once it has been completely
+ * drained and no other threads are actively servicing tasks. This prevents
+ * threads from being created and destroyed more than is required.
+ *
+ * The first thread in the thread list is treated as the primary thread.
+ * There is nothing special about the primary thread, but in order to keep
+ * all of the taskq pids from changing we opt to make it long running.
+ */
+static int
+taskq_thread_should_stop(taskq_t *tq, taskq_thread_t *tqt)
+{
+ if (!(tq->tq_flags & TASKQ_DYNAMIC))
+ return (0);
+
+ if (list_first_entry(&(tq->tq_thread_list), taskq_thread_t,
+ tqt_thread_list) == tqt)
+ return (0);
+
+ return
+ ((tq->tq_nspawn == 0) && /* No threads are being spawned */
+ (tq->tq_nactive == 0) && /* No threads are handling tasks */
+ (tq->tq_nthreads > 1) && /* More than 1 thread is running */
+ (!taskq_next_ent(tq)) && /* There are no pending tasks */
+ (spl_taskq_thread_dynamic)); /* Dynamic taskqs are allowed */
+}
+
+static int
+taskq_thread(void *args)
+{
+ DECLARE_WAITQUEUE(wait, current);
+ sigset_t blocked;
+ taskq_thread_t *tqt = args;
+ taskq_t *tq;
+ taskq_ent_t *t;
+ int seq_tasks = 0;
+ unsigned long flags;
+ taskq_ent_t dup_task = {};
+
+ ASSERT(tqt);
+ ASSERT(tqt->tqt_tq);
+ tq = tqt->tqt_tq;
+ current->flags |= PF_NOFREEZE;
+
+ (void) spl_fstrans_mark();
+
+ sigfillset(&blocked);
+ sigprocmask(SIG_BLOCK, &blocked, NULL);
+ flush_signals(current);
+
+ tsd_set(taskq_tsd, tq);
+ spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class);
+ /*
+ * If we are dynamically spawned, decrease spawning count. Note that
+ * we could be created during taskq_create, in which case we shouldn't
+ * do the decrement. But it's fine because taskq_create will reset
+ * tq_nspawn later.
+ */
+ if (tq->tq_flags & TASKQ_DYNAMIC)
+ tq->tq_nspawn--;
+
+ /* Immediately exit if more threads than allowed were created. */
+ if (tq->tq_nthreads >= tq->tq_maxthreads)
+ goto error;
+
+ tq->tq_nthreads++;
+ list_add_tail(&tqt->tqt_thread_list, &tq->tq_thread_list);
+ wake_up(&tq->tq_wait_waitq);
+ set_current_state(TASK_INTERRUPTIBLE);
+
+ while (!kthread_should_stop()) {
+
+ if (list_empty(&tq->tq_pend_list) &&
+ list_empty(&tq->tq_prio_list)) {
+
+ if (taskq_thread_should_stop(tq, tqt)) {
+ wake_up_all(&tq->tq_wait_waitq);
+ break;
+ }
+
+ add_wait_queue_exclusive(&tq->tq_work_waitq, &wait);
+ spin_unlock_irqrestore(&tq->tq_lock, flags);
+
+ schedule();
+ seq_tasks = 0;
+
+ spin_lock_irqsave_nested(&tq->tq_lock, flags,
+ tq->tq_lock_class);
+ remove_wait_queue(&tq->tq_work_waitq, &wait);
+ } else {
+ __set_current_state(TASK_RUNNING);
+ }
+
+ if ((t = taskq_next_ent(tq)) != NULL) {
+ list_del_init(&t->tqent_list);
+
+ /*
+ * A TQENT_FLAG_PREALLOC task may be reused or freed
+ * during the task function call. Store tqent_id and
+ * tqent_flags here.
+ *
+ * Also use an on-stack taskq_ent_t for the tqt_task
+ * assignment in this case. We only populate the two
+ * fields used by its only consumer, the taskq proc file.
+ */
+ tqt->tqt_id = t->tqent_id;
+ tqt->tqt_flags = t->tqent_flags;
+
+ if (t->tqent_flags & TQENT_FLAG_PREALLOC) {
+ dup_task.tqent_func = t->tqent_func;
+ dup_task.tqent_arg = t->tqent_arg;
+ t = &dup_task;
+ }
+ tqt->tqt_task = t;
+
+ taskq_insert_in_order(tq, tqt);
+ tq->tq_nactive++;
+ spin_unlock_irqrestore(&tq->tq_lock, flags);
+
+ /* Perform the requested task */
+ t->tqent_func(t->tqent_arg);
+
+ spin_lock_irqsave_nested(&tq->tq_lock, flags,
+ tq->tq_lock_class);
+ tq->tq_nactive--;
+ list_del_init(&tqt->tqt_active_list);
+ tqt->tqt_task = NULL;
+
+ /* For prealloc'd tasks, we don't free anything. */
+ if (!(tqt->tqt_flags & TQENT_FLAG_PREALLOC))
+ task_done(tq, t);
+
+ /*
+ * When the current lowest outstanding taskqid is
+ * done calculate the new lowest outstanding id
+ */
+ if (tq->tq_lowest_id == tqt->tqt_id) {
+ tq->tq_lowest_id = taskq_lowest_id(tq);
+ ASSERT3S(tq->tq_lowest_id, >, tqt->tqt_id);
+ }
+
+ /* Spawn additional taskq threads if required. */
+ if ((++seq_tasks) > spl_taskq_thread_sequential &&
+ taskq_thread_spawn(tq))
+ seq_tasks = 0;
+
+ tqt->tqt_id = TASKQID_INVALID;
+ tqt->tqt_flags = 0;
+ wake_up_all(&tq->tq_wait_waitq);
+ } else {
+ if (taskq_thread_should_stop(tq, tqt))
+ break;
+ }
+
+ set_current_state(TASK_INTERRUPTIBLE);
+
+ }
+
+ __set_current_state(TASK_RUNNING);
+ tq->tq_nthreads--;
+ list_del_init(&tqt->tqt_thread_list);
+error:
+ kmem_free(tqt, sizeof (taskq_thread_t));
+ spin_unlock_irqrestore(&tq->tq_lock, flags);
+
+ tsd_set(taskq_tsd, NULL);
+
+ return (0);
+}
+
+static taskq_thread_t *
+taskq_thread_create(taskq_t *tq)
+{
+ static int last_used_cpu = 0;
+ taskq_thread_t *tqt;
+
+ tqt = kmem_alloc(sizeof (*tqt), KM_PUSHPAGE);
+ INIT_LIST_HEAD(&tqt->tqt_thread_list);
+ INIT_LIST_HEAD(&tqt->tqt_active_list);
+ tqt->tqt_tq = tq;
+ tqt->tqt_id = TASKQID_INVALID;
+
+ tqt->tqt_thread = spl_kthread_create(taskq_thread, tqt,
+ "%s", tq->tq_name);
+ if (tqt->tqt_thread == NULL) {
+ kmem_free(tqt, sizeof (taskq_thread_t));
+ return (NULL);
+ }
+
+ if (spl_taskq_thread_bind) {
+ last_used_cpu = (last_used_cpu + 1) % num_online_cpus();
+ kthread_bind(tqt->tqt_thread, last_used_cpu);
+ }
+
+ if (spl_taskq_thread_priority)
+ set_user_nice(tqt->tqt_thread, PRIO_TO_NICE(tq->tq_pri));
+
+ wake_up_process(tqt->tqt_thread);
+
+ return (tqt);
+}
+
+taskq_t *
+taskq_create(const char *name, int nthreads, pri_t pri,
+ int minalloc, int maxalloc, uint_t flags)
+{
+ taskq_t *tq;
+ taskq_thread_t *tqt;
+ int count = 0, rc = 0, i;
+ unsigned long irqflags;
+
+ ASSERT(name != NULL);
+ ASSERT(minalloc >= 0);
+ ASSERT(maxalloc <= INT_MAX);
+ ASSERT(!(flags & (TASKQ_CPR_SAFE))); /* Unsupported */
+
+ /* Scale the number of threads using nthreads as a percentage */
+ if (flags & TASKQ_THREADS_CPU_PCT) {
+ ASSERT(nthreads <= 100);
+ ASSERT(nthreads >= 0);
+ nthreads = MIN(nthreads, 100);
+ nthreads = MAX(nthreads, 0);
+ nthreads = MAX((num_online_cpus() * nthreads) / 100, 1);
+ }
+
+ tq = kmem_alloc(sizeof (*tq), KM_PUSHPAGE);
+ if (tq == NULL)
+ return (NULL);
+
+ spin_lock_init(&tq->tq_lock);
+ INIT_LIST_HEAD(&tq->tq_thread_list);
+ INIT_LIST_HEAD(&tq->tq_active_list);
+ tq->tq_name = strdup(name);
+ tq->tq_nactive = 0;
+ tq->tq_nthreads = 0;
+ tq->tq_nspawn = 0;
+ tq->tq_maxthreads = nthreads;
+ tq->tq_pri = pri;
+ tq->tq_minalloc = minalloc;
+ tq->tq_maxalloc = maxalloc;
+ tq->tq_nalloc = 0;
+ tq->tq_flags = (flags | TASKQ_ACTIVE);
+ tq->tq_next_id = TASKQID_INITIAL;
+ tq->tq_lowest_id = TASKQID_INITIAL;
+ INIT_LIST_HEAD(&tq->tq_free_list);
+ INIT_LIST_HEAD(&tq->tq_pend_list);
+ INIT_LIST_HEAD(&tq->tq_prio_list);
+ INIT_LIST_HEAD(&tq->tq_delay_list);
+ init_waitqueue_head(&tq->tq_work_waitq);
+ init_waitqueue_head(&tq->tq_wait_waitq);
+ tq->tq_lock_class = TQ_LOCK_GENERAL;
+ INIT_LIST_HEAD(&tq->tq_taskqs);
+
+ if (flags & TASKQ_PREPOPULATE) {
+ spin_lock_irqsave_nested(&tq->tq_lock, irqflags,
+ tq->tq_lock_class);
+
+ for (i = 0; i < minalloc; i++)
+ task_done(tq, task_alloc(tq, TQ_PUSHPAGE | TQ_NEW,
+ &irqflags));
+
+ spin_unlock_irqrestore(&tq->tq_lock, irqflags);
+ }
+
+ if ((flags & TASKQ_DYNAMIC) && spl_taskq_thread_dynamic)
+ nthreads = 1;
+
+ for (i = 0; i < nthreads; i++) {
+ tqt = taskq_thread_create(tq);
+ if (tqt == NULL)
+ rc = 1;
+ else
+ count++;
+ }
+
+ /* Wait for all threads to be started before potential destroy */
+ wait_event(tq->tq_wait_waitq, tq->tq_nthreads == count);
+ /*
+ * The taskq_thread() threads may have decremented tq_nspawn, but these
+ * initial threads are not dynamically spawned. Reset it to 0.
+ */
+ tq->tq_nspawn = 0;
+
+ if (rc) {
+ taskq_destroy(tq);
+ tq = NULL;
+ } else {
+ down_write(&tq_list_sem);
+ tq->tq_instance = taskq_find_by_name(name) + 1;
+ list_add_tail(&tq->tq_taskqs, &tq_list);
+ up_write(&tq_list_sem);
+ }
+
+ return (tq);
+}
+EXPORT_SYMBOL(taskq_create);
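+
+/*
+ * Illustrative sketch of creating a taskq; "my_taskq" is a hypothetical
+ * name. With TASKQ_THREADS_CPU_PCT the thread count is a percentage of
+ * the online CPUs, so on an 8 CPU system nthreads=75 yields 6 threads:
+ *
+ *	taskq_t *tq = taskq_create("my_taskq", 75, maxclsyspri,
+ *	    50, INT_MAX, TASKQ_PREPOPULATE | TASKQ_THREADS_CPU_PCT);
+ */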
+
+void
+taskq_destroy(taskq_t *tq)
+{
+ struct task_struct *thread;
+ taskq_thread_t *tqt;
+ taskq_ent_t *t;
+ unsigned long flags;
+
+ ASSERT(tq);
+ spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class);
+ tq->tq_flags &= ~TASKQ_ACTIVE;
+ spin_unlock_irqrestore(&tq->tq_lock, flags);
+
+ /*
+ * When TASKQ_ACTIVE is clear new tasks may not be added nor may
+ * new worker threads be spawned for a dynamic taskq.
+ */
+ if (dynamic_taskq != NULL)
+ taskq_wait_outstanding(dynamic_taskq, 0);
+
+ taskq_wait(tq);
+
+ /* remove taskq from global list used by the kstats */
+ down_write(&tq_list_sem);
+ list_del(&tq->tq_taskqs);
+ up_write(&tq_list_sem);
+
+ spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class);
+ /* wait for spawning threads to insert themselves into the list */
+ while (tq->tq_nspawn) {
+ spin_unlock_irqrestore(&tq->tq_lock, flags);
+ schedule_timeout_interruptible(1);
+ spin_lock_irqsave_nested(&tq->tq_lock, flags,
+ tq->tq_lock_class);
+ }
+
+ /*
+ * Signal each thread to exit and block until it does. Each thread
+ * is responsible for removing itself from the list and freeing its
+ * taskq_thread_t. This allows for idle threads to opt to remove
+ * themselves from the taskq. They can be recreated as needed.
+ */
+ while (!list_empty(&tq->tq_thread_list)) {
+ tqt = list_entry(tq->tq_thread_list.next,
+ taskq_thread_t, tqt_thread_list);
+ thread = tqt->tqt_thread;
+ spin_unlock_irqrestore(&tq->tq_lock, flags);
+
+ kthread_stop(thread);
+
+ spin_lock_irqsave_nested(&tq->tq_lock, flags,
+ tq->tq_lock_class);
+ }
+
+ while (!list_empty(&tq->tq_free_list)) {
+ t = list_entry(tq->tq_free_list.next, taskq_ent_t, tqent_list);
+
+ ASSERT(!(t->tqent_flags & TQENT_FLAG_PREALLOC));
+
+ list_del_init(&t->tqent_list);
+ task_free(tq, t);
+ }
+
+ ASSERT0(tq->tq_nthreads);
+ ASSERT0(tq->tq_nalloc);
+ ASSERT0(tq->tq_nspawn);
+ ASSERT(list_empty(&tq->tq_thread_list));
+ ASSERT(list_empty(&tq->tq_active_list));
+ ASSERT(list_empty(&tq->tq_free_list));
+ ASSERT(list_empty(&tq->tq_pend_list));
+ ASSERT(list_empty(&tq->tq_prio_list));
+ ASSERT(list_empty(&tq->tq_delay_list));
+
+ spin_unlock_irqrestore(&tq->tq_lock, flags);
+
+ strfree(tq->tq_name);
+ kmem_free(tq, sizeof (taskq_t));
+}
+EXPORT_SYMBOL(taskq_destroy);
+
+static unsigned int spl_taskq_kick = 0;
+
+/*
+ * 2.6.36 API Change
+ * module_param_cb is introduced to take kernel_param_ops and
+ * module_param_call is marked as obsolete. Also set and get operations
+ * were changed to take a 'const struct kernel_param *'.
+ */
+static int
+#ifdef module_param_cb
+param_set_taskq_kick(const char *val, const struct kernel_param *kp)
+#else
+param_set_taskq_kick(const char *val, struct kernel_param *kp)
+#endif
+{
+ int ret;
+ taskq_t *tq;
+ taskq_ent_t *t;
+ unsigned long flags;
+
+ ret = param_set_uint(val, kp);
+ if (ret < 0 || !spl_taskq_kick)
+ return (ret);
+ /* reset value */
+ spl_taskq_kick = 0;
+
+ down_read(&tq_list_sem);
+ list_for_each_entry(tq, &tq_list, tq_taskqs) {
+ spin_lock_irqsave_nested(&tq->tq_lock, flags,
+ tq->tq_lock_class);
+ /* Check if the first pending task is older than 5 seconds */
+ t = taskq_next_ent(tq);
+ if (t && time_after(jiffies, t->tqent_birth + 5*HZ)) {
+ (void) taskq_thread_spawn(tq);
+ printk(KERN_INFO "spl: Kicked taskq %s/%d\n",
+ tq->tq_name, tq->tq_instance);
+ }
+ spin_unlock_irqrestore(&tq->tq_lock, flags);
+ }
+ up_read(&tq_list_sem);
+ return (ret);
+}
+
+#ifdef module_param_cb
+static const struct kernel_param_ops param_ops_taskq_kick = {
+ .set = param_set_taskq_kick,
+ .get = param_get_uint,
+};
+module_param_cb(spl_taskq_kick, &param_ops_taskq_kick, &spl_taskq_kick, 0644);
+#else
+module_param_call(spl_taskq_kick, param_set_taskq_kick, param_get_uint,
+ &spl_taskq_kick, 0644);
+#endif
+MODULE_PARM_DESC(spl_taskq_kick,
+ "Write nonzero to kick stuck taskqs to spawn more threads");
+
+int
+spl_taskq_init(void)
+{
+ tsd_create(&taskq_tsd, NULL);
+
+ system_taskq = taskq_create("spl_system_taskq", MAX(boot_ncpus, 64),
+ maxclsyspri, boot_ncpus, INT_MAX, TASKQ_PREPOPULATE|TASKQ_DYNAMIC);
+ if (system_taskq == NULL)
+ return (1);
+
+ system_delay_taskq = taskq_create("spl_delay_taskq", MAX(boot_ncpus, 4),
+ maxclsyspri, boot_ncpus, INT_MAX, TASKQ_PREPOPULATE|TASKQ_DYNAMIC);
+ if (system_delay_taskq == NULL) {
+ taskq_destroy(system_taskq);
+ return (1);
+ }
+
+ dynamic_taskq = taskq_create("spl_dynamic_taskq", 1,
+ maxclsyspri, boot_ncpus, INT_MAX, TASKQ_PREPOPULATE);
+ if (dynamic_taskq == NULL) {
+ taskq_destroy(system_taskq);
+ taskq_destroy(system_delay_taskq);
+ return (1);
+ }
+
+ /*
+ * This is used to annotate tq_lock, so
+ * taskq_dispatch -> taskq_thread_spawn -> taskq_dispatch
+ * does not trigger a lockdep warning re: possible recursive locking
+ */
+ dynamic_taskq->tq_lock_class = TQ_LOCK_DYNAMIC;
+
+ return (0);
+}
+
+void
+spl_taskq_fini(void)
+{
+ taskq_destroy(dynamic_taskq);
+ dynamic_taskq = NULL;
+
+ taskq_destroy(system_delay_taskq);
+ system_delay_taskq = NULL;
+
+ taskq_destroy(system_taskq);
+ system_taskq = NULL;
+
+ tsd_destroy(&taskq_tsd);
+}
diff --git a/module/spl/spl-thread.c b/module/spl/spl-thread.c
new file mode 100644
index 000000000..d441ad65f
--- /dev/null
+++ b/module/spl/spl-thread.c
@@ -0,0 +1,160 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Solaris Porting Layer (SPL) Thread Implementation.
+ */
+
+#include <sys/thread.h>
+#include <sys/kmem.h>
+#include <sys/tsd.h>
+
+/*
+ * Thread interfaces
+ */
+typedef struct thread_priv_s {
+ unsigned long tp_magic; /* Magic */
+ int tp_name_size; /* Name size */
+ char *tp_name; /* Name (without _thread suffix) */
+ void (*tp_func)(void *); /* Registered function */
+ void *tp_args; /* Args to be passed to function */
+ size_t tp_len; /* Len to be passed to function */
+ int tp_state; /* State to start thread at */
+ pri_t tp_pri; /* Priority to start thread at */
+} thread_priv_t;
+
+static int
+thread_generic_wrapper(void *arg)
+{
+ thread_priv_t *tp = (thread_priv_t *)arg;
+ void (*func)(void *);
+ void *args;
+
+ ASSERT(tp->tp_magic == TP_MAGIC);
+ func = tp->tp_func;
+ args = tp->tp_args;
+ set_current_state(tp->tp_state);
+ set_user_nice((kthread_t *)current, PRIO_TO_NICE(tp->tp_pri));
+ kmem_free(tp->tp_name, tp->tp_name_size);
+ kmem_free(tp, sizeof (thread_priv_t));
+
+ if (func)
+ func(args);
+
+ return (0);
+}
+
+void
+__thread_exit(void)
+{
+ tsd_exit();
+ complete_and_exit(NULL, 0);
+ /* Unreachable */
+}
+EXPORT_SYMBOL(__thread_exit);
+
+/*
+ * thread_create() may block forever if it cannot create a thread or
+ * allocate memory. This is preferable to returning a NULL which Solaris
+ * style callers likely never check for... since it can't fail.
+ */
+kthread_t *
+__thread_create(caddr_t stk, size_t stksize, thread_func_t func,
+ const char *name, void *args, size_t len, proc_t *pp, int state, pri_t pri)
+{
+ thread_priv_t *tp;
+ struct task_struct *tsk;
+ char *p;
+
+ /* Option pp is simply ignored */
+ /* Variable stack size unsupported */
+ ASSERT(stk == NULL);
+
+ tp = kmem_alloc(sizeof (thread_priv_t), KM_PUSHPAGE);
+ if (tp == NULL)
+ return (NULL);
+
+ tp->tp_magic = TP_MAGIC;
+ tp->tp_name_size = strlen(name) + 1;
+
+ tp->tp_name = kmem_alloc(tp->tp_name_size, KM_PUSHPAGE);
+ if (tp->tp_name == NULL) {
+ kmem_free(tp, sizeof (thread_priv_t));
+ return (NULL);
+ }
+
+ strncpy(tp->tp_name, name, tp->tp_name_size);
+
+ /*
+ * Strip trailing "_thread" from passed name which will be the func
+ * name since the exposed API has no parameter for passing a name.
+ */
+ p = strstr(tp->tp_name, "_thread");
+ if (p)
+ p[0] = '\0';
+
+ tp->tp_func = func;
+ tp->tp_args = args;
+ tp->tp_len = len;
+ tp->tp_state = state;
+ tp->tp_pri = pri;
+
+ tsk = spl_kthread_create(thread_generic_wrapper, (void *)tp,
+ "%s", tp->tp_name);
+ if (IS_ERR(tsk))
+ return (NULL);
+
+ wake_up_process(tsk);
+ return ((kthread_t *)tsk);
+}
+EXPORT_SYMBOL(__thread_create);
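+
+/*
+ * Illustrative sketch of the Solaris-style call this implements, via
+ * the thread_create() macro from sys/thread.h; my_thread_func and
+ * my_arg are hypothetical:
+ *
+ *	kthread_t *t = thread_create(NULL, 0, my_thread_func, my_arg,
+ *	    0, &p0, TS_RUN, minclsyspri);
+ *
+ * On Linux the stack argument must be NULL and the proc argument is
+ * simply ignored.
+ */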
+
+/*
+ * spl_kthread_create - Wrapper providing pre-3.13 semantics for
+ * kthread_create() in which it is not killable and less likely
+ * to return -ENOMEM.
+ */
+struct task_struct *
+spl_kthread_create(int (*func)(void *), void *data, const char namefmt[], ...)
+{
+ struct task_struct *tsk;
+ va_list args;
+ char name[TASK_COMM_LEN];
+
+ va_start(args, namefmt);
+ vsnprintf(name, sizeof (name), namefmt, args);
+ va_end(args);
+ do {
+ tsk = kthread_create(func, data, "%s", name);
+ if (IS_ERR(tsk)) {
+ if (signal_pending(current)) {
+ clear_thread_flag(TIF_SIGPENDING);
+ continue;
+ }
+ if (PTR_ERR(tsk) == -ENOMEM)
+ continue;
+ return (NULL);
+ } else
+ return (tsk);
+ } while (1);
+}
+EXPORT_SYMBOL(spl_kthread_create);
diff --git a/module/spl/spl-tsd.c b/module/spl/spl-tsd.c
new file mode 100644
index 000000000..4c800292a
--- /dev/null
+++ b/module/spl/spl-tsd.c
@@ -0,0 +1,720 @@
+/*
+ * Copyright (C) 2010 Lawrence Livermore National Security, LLC.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ *
+ *
+ * Solaris Porting Layer (SPL) Thread Specific Data Implementation.
+ *
+ * Thread specific data is implemented using a hash table. This avoids
+ * the need to add a member to the task structure and allows maximum
+ * portability between kernels. The implementation has been optimized
+ * to keep the tsd_set() and tsd_get() times as small as possible.
+ *
+ * The majority of the entries in the hash table are for specific tsd
+ * entries. These entries are hashed by the product of their key and
+ * pid because by design the key and pid are guaranteed to be unique.
+ * Their product also has the desirable property that it will be uniformly
+ * distributed over the hash bins provided neither the pid nor key is zero.
+ * Under Linux a pid of zero is never assigned to a user thread, and this
+ * implementation is careful never to assign a zero key. By default the
+ * hash table is sized to 512 bins which is expected to be sufficient for
+ * light to moderate usage of thread specific data.
+ *
+ * The hash table contains two additional types of entries. The first
+ * type is called a 'key' entry and it is added to the hash during
+ * tsd_create(). It is used to store the address of the destructor function
+ * and it is used as an anchor point. All tsd entries which use the same
+ * key will be linked to this entry. This is used during tsd_destroy() to
+ * quickly call the destructor function for all tsd associated with the key.
+ * The 'key' entry may be looked up with tsd_hash_search() by passing the
+ * key you wish to look up and the DTOR_PID constant as the pid.
+ *
+ * The second type of entry is called a 'pid' entry and it is added to the
+ * hash the first time a process sets a key. The 'pid' entry is also used
+ * as an anchor and all tsd for the process will be linked to it. This
+ * list is used during tsd_exit() to ensure all registered destructors
+ * are run for the process. The 'pid' entry may be looked up with
+ * tsd_hash_search() by passing the PID_KEY constant as the key, and
+ * the process pid. Note that tsd_exit() is called by thread_exit()
+ * so if you're using the Solaris thread API you should not need to call
+ * tsd_exit() directly.
+ *
+ */
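+
+/*
+ * Illustrative sketch of the exported lifecycle; my_key, my_dtor, and
+ * my_data are hypothetical:
+ *
+ *	static uint_t my_key = 0;
+ *
+ *	tsd_create(&my_key, my_dtor);
+ *	VERIFY0(tsd_set(my_key, my_data));
+ *	ASSERT3P(tsd_get(my_key), ==, my_data);
+ *	tsd_destroy(&my_key);
+ */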
+
+#include <sys/kmem.h>
+#include <sys/thread.h>
+#include <sys/tsd.h>
+#include <linux/hash.h>
+
+typedef struct tsd_hash_bin {
+ spinlock_t hb_lock;
+ struct hlist_head hb_head;
+} tsd_hash_bin_t;
+
+typedef struct tsd_hash_table {
+ spinlock_t ht_lock;
+ uint_t ht_bits;
+ uint_t ht_key;
+ tsd_hash_bin_t *ht_bins;
+} tsd_hash_table_t;
+
+typedef struct tsd_hash_entry {
+ uint_t he_key;
+ pid_t he_pid;
+ dtor_func_t he_dtor;
+ void *he_value;
+ struct hlist_node he_list;
+ struct list_head he_key_list;
+ struct list_head he_pid_list;
+} tsd_hash_entry_t;
+
+static tsd_hash_table_t *tsd_hash_table = NULL;
+
+/*
+ * tsd_hash_search - searches hash table for tsd_hash_entry
+ * @table: hash table
+ * @key: search key
+ * @pid: search pid
+ */
+static tsd_hash_entry_t *
+tsd_hash_search(tsd_hash_table_t *table, uint_t key, pid_t pid)
+{
+ struct hlist_node *node;
+ tsd_hash_entry_t *entry;
+ tsd_hash_bin_t *bin;
+ ulong_t hash;
+
+ hash = hash_long((ulong_t)key * (ulong_t)pid, table->ht_bits);
+ bin = &table->ht_bins[hash];
+ spin_lock(&bin->hb_lock);
+ hlist_for_each(node, &bin->hb_head) {
+ entry = list_entry(node, tsd_hash_entry_t, he_list);
+ if ((entry->he_key == key) && (entry->he_pid == pid)) {
+ spin_unlock(&bin->hb_lock);
+ return (entry);
+ }
+ }
+
+ spin_unlock(&bin->hb_lock);
+ return (NULL);
+}
+
+/*
+ * tsd_hash_dtor - call the destructor and free all entries on the list
+ * @work: list of hash entries
+ *
+ * For a list of entries which have all already been removed from the
+ * hash call their registered destructor then free the associated memory.
+ */
+static void
+tsd_hash_dtor(struct hlist_head *work)
+{
+ tsd_hash_entry_t *entry;
+
+ while (!hlist_empty(work)) {
+ entry = hlist_entry(work->first, tsd_hash_entry_t, he_list);
+ hlist_del(&entry->he_list);
+
+ if (entry->he_dtor && entry->he_pid != DTOR_PID)
+ entry->he_dtor(entry->he_value);
+
+ kmem_free(entry, sizeof (tsd_hash_entry_t));
+ }
+}
+
+/*
+ * tsd_hash_add - adds an entry to hash table
+ * @table: hash table
+ * @key: search key
+ * @pid: search pid
+ *
+ * The caller is responsible for ensuring the unique key/pid pair does
+ * not already exist in the hash table. This is possible because all
+ * entries are thread specific, thus a concurrent thread will never
+ * attempt to add this key/pid. Because multiple bins must be checked to
+ * add links to the dtor and pid entries, the entire table is locked.
+ */
+static int
+tsd_hash_add(tsd_hash_table_t *table, uint_t key, pid_t pid, void *value)
+{
+ tsd_hash_entry_t *entry, *dtor_entry, *pid_entry;
+ tsd_hash_bin_t *bin;
+ ulong_t hash;
+ int rc = 0;
+
+ ASSERT3P(tsd_hash_search(table, key, pid), ==, NULL);
+
+ /* New entry allocate structure, set value, and add to hash */
+ entry = kmem_alloc(sizeof (tsd_hash_entry_t), KM_PUSHPAGE);
+ if (entry == NULL)
+ return (ENOMEM);
+
+ entry->he_key = key;
+ entry->he_pid = pid;
+ entry->he_value = value;
+ INIT_HLIST_NODE(&entry->he_list);
+ INIT_LIST_HEAD(&entry->he_key_list);
+ INIT_LIST_HEAD(&entry->he_pid_list);
+
+ spin_lock(&table->ht_lock);
+
+ /* Destructor entry must exist for all valid keys */
+ dtor_entry = tsd_hash_search(table, entry->he_key, DTOR_PID);
+ ASSERT3P(dtor_entry, !=, NULL);
+ entry->he_dtor = dtor_entry->he_dtor;
+
+ /* Process entry must exist for all valid processes */
+ pid_entry = tsd_hash_search(table, PID_KEY, entry->he_pid);
+ ASSERT3P(pid_entry, !=, NULL);
+
+ hash = hash_long((ulong_t)key * (ulong_t)pid, table->ht_bits);
+ bin = &table->ht_bins[hash];
+ spin_lock(&bin->hb_lock);
+
+ /* Add to the hash, key, and pid lists */
+ hlist_add_head(&entry->he_list, &bin->hb_head);
+ list_add(&entry->he_key_list, &dtor_entry->he_key_list);
+ list_add(&entry->he_pid_list, &pid_entry->he_pid_list);
+
+ spin_unlock(&bin->hb_lock);
+ spin_unlock(&table->ht_lock);
+
+ return (rc);
+}
+
+/*
+ * tsd_hash_add_key - adds a destructor entry to the hash table
+ * @table: hash table
+ * @keyp: search key
+ * @dtor: key destructor
+ *
+ * For every unique key there is a single entry in the hash which is used
+ * as an anchor. All other thread specific entries for this key are linked
+ * to this anchor via the 'he_key_list' list head. On return, *keyp
+ * will be set to the next available key for the hash table.
+ */
+static int
+tsd_hash_add_key(tsd_hash_table_t *table, uint_t *keyp, dtor_func_t dtor)
+{
+ tsd_hash_entry_t *tmp_entry, *entry;
+ tsd_hash_bin_t *bin;
+ ulong_t hash;
+ int keys_checked = 0;
+
+ ASSERT3P(table, !=, NULL);
+
+ /* Allocate entry to be used as a destructor for this key */
+ entry = kmem_alloc(sizeof (tsd_hash_entry_t), KM_PUSHPAGE);
+ if (entry == NULL)
+ return (ENOMEM);
+
+ /* Determine next available key value */
+ spin_lock(&table->ht_lock);
+ do {
+ /* Limited to TSD_KEYS_MAX concurrent unique keys */
+ if (table->ht_key++ > TSD_KEYS_MAX)
+ table->ht_key = 1;
+
+ /* Ensure failure when all TSD_KEYS_MAX keys are in use */
+ if (keys_checked++ >= TSD_KEYS_MAX) {
+ spin_unlock(&table->ht_lock);
+ return (ENOENT);
+ }
+
+ tmp_entry = tsd_hash_search(table, table->ht_key, DTOR_PID);
+ } while (tmp_entry);
+
+ /* Add destructor entry in to hash table */
+ entry->he_key = *keyp = table->ht_key;
+ entry->he_pid = DTOR_PID;
+ entry->he_dtor = dtor;
+ entry->he_value = NULL;
+ INIT_HLIST_NODE(&entry->he_list);
+ INIT_LIST_HEAD(&entry->he_key_list);
+ INIT_LIST_HEAD(&entry->he_pid_list);
+
+ hash = hash_long((ulong_t)*keyp * (ulong_t)DTOR_PID, table->ht_bits);
+ bin = &table->ht_bins[hash];
+ spin_lock(&bin->hb_lock);
+
+ hlist_add_head(&entry->he_list, &bin->hb_head);
+
+ spin_unlock(&bin->hb_lock);
+ spin_unlock(&table->ht_lock);
+
+ return (0);
+}
+
+/*
+ * tsd_hash_add_pid - adds a process entry to the hash table
+ * @table: hash table
+ * @pid: search pid
+ *
+ * For every process there is a single entry in the hash which is used
+ * as an anchor. All other thread specific entries for this process are
+ * linked to this anchor via the 'he_pid_list' list head.
+ */
+static int
+tsd_hash_add_pid(tsd_hash_table_t *table, pid_t pid)
+{
+ tsd_hash_entry_t *entry;
+ tsd_hash_bin_t *bin;
+ ulong_t hash;
+
+ /* Allocate entry to be used as the process reference */
+ entry = kmem_alloc(sizeof (tsd_hash_entry_t), KM_PUSHPAGE);
+ if (entry == NULL)
+ return (ENOMEM);
+
+ spin_lock(&table->ht_lock);
+ entry->he_key = PID_KEY;
+ entry->he_pid = pid;
+ entry->he_dtor = NULL;
+ entry->he_value = NULL;
+ INIT_HLIST_NODE(&entry->he_list);
+ INIT_LIST_HEAD(&entry->he_key_list);
+ INIT_LIST_HEAD(&entry->he_pid_list);
+
+ hash = hash_long((ulong_t)PID_KEY * (ulong_t)pid, table->ht_bits);
+ bin = &table->ht_bins[hash];
+ spin_lock(&bin->hb_lock);
+
+ hlist_add_head(&entry->he_list, &bin->hb_head);
+
+ spin_unlock(&bin->hb_lock);
+ spin_unlock(&table->ht_lock);
+
+ return (0);
+}
+
+/*
+ * tsd_hash_del - delete an entry from hash table, key, and pid lists
+ * @table: hash table
+ * @key: search key
+ * @pid: search pid
+ */
+static void
+tsd_hash_del(tsd_hash_table_t *table, tsd_hash_entry_t *entry)
+{
+ hlist_del(&entry->he_list);
+ list_del_init(&entry->he_key_list);
+ list_del_init(&entry->he_pid_list);
+}
+
+/*
+ * tsd_hash_table_init - allocate a hash table
+ * @bits: hash table size
+ *
+ * A hash table with 2^bits bins will be created. It may not be resized
+ * after the fact and must be freed with tsd_hash_table_fini().
+ */
+static tsd_hash_table_t *
+tsd_hash_table_init(uint_t bits)
+{
+ tsd_hash_table_t *table;
+ int hash, size = (1 << bits);
+
+ table = kmem_zalloc(sizeof (tsd_hash_table_t), KM_SLEEP);
+ if (table == NULL)
+ return (NULL);
+
+ table->ht_bins = kmem_zalloc(sizeof (tsd_hash_bin_t) * size, KM_SLEEP);
+ if (table->ht_bins == NULL) {
+ kmem_free(table, sizeof (tsd_hash_table_t));
+ return (NULL);
+ }
+
+ for (hash = 0; hash < size; hash++) {
+ spin_lock_init(&table->ht_bins[hash].hb_lock);
+ INIT_HLIST_HEAD(&table->ht_bins[hash].hb_head);
+ }
+
+ spin_lock_init(&table->ht_lock);
+ table->ht_bits = bits;
+ table->ht_key = 1;
+
+ return (table);
+}
+
+/*
+ * tsd_hash_table_fini - free a hash table
+ * @table: hash table
+ *
+ * Free a hash table allocated by tsd_hash_table_init(). If the hash
+ * table is not empty this function will call the proper destructor for
+ * all remaining entries before freeing the memory used by those entries.
+ */
+static void
+tsd_hash_table_fini(tsd_hash_table_t *table)
+{
+ HLIST_HEAD(work);
+ tsd_hash_bin_t *bin;
+ tsd_hash_entry_t *entry;
+ int size, i;
+
+ ASSERT3P(table, !=, NULL);
+ spin_lock(&table->ht_lock);
+ for (i = 0, size = (1 << table->ht_bits); i < size; i++) {
+ bin = &table->ht_bins[i];
+ spin_lock(&bin->hb_lock);
+ while (!hlist_empty(&bin->hb_head)) {
+ entry = hlist_entry(bin->hb_head.first,
+ tsd_hash_entry_t, he_list);
+ tsd_hash_del(table, entry);
+ hlist_add_head(&entry->he_list, &work);
+ }
+ spin_unlock(&bin->hb_lock);
+ }
+ spin_unlock(&table->ht_lock);
+
+ tsd_hash_dtor(&work);
+ kmem_free(table->ht_bins, sizeof (tsd_hash_bin_t)*(1<<table->ht_bits));
+ kmem_free(table, sizeof (tsd_hash_table_t));
+}
+
+/*
+ * tsd_remove_entry - remove a tsd entry for this thread
+ * @entry: entry to remove
+ *
+ * Remove the thread specific data @entry for this thread.
+ * If this is the last entry for this thread, also remove the PID entry.
+ */
+static void
+tsd_remove_entry(tsd_hash_entry_t *entry)
+{
+ HLIST_HEAD(work);
+ tsd_hash_table_t *table;
+ tsd_hash_entry_t *pid_entry;
+ tsd_hash_bin_t *pid_entry_bin, *entry_bin;
+ ulong_t hash;
+
+ table = tsd_hash_table;
+ ASSERT3P(table, !=, NULL);
+ ASSERT3P(entry, !=, NULL);
+
+ spin_lock(&table->ht_lock);
+
+ hash = hash_long((ulong_t)entry->he_key *
+ (ulong_t)entry->he_pid, table->ht_bits);
+ entry_bin = &table->ht_bins[hash];
+
+ /* save the possible pid_entry */
+ pid_entry = list_entry(entry->he_pid_list.next, tsd_hash_entry_t,
+ he_pid_list);
+
+ /* remove entry */
+ spin_lock(&entry_bin->hb_lock);
+ tsd_hash_del(table, entry);
+ hlist_add_head(&entry->he_list, &work);
+ spin_unlock(&entry_bin->hb_lock);
+
+ /* if the saved entry is indeed the pid entry, remove it if it's empty */
+ if (pid_entry->he_key == PID_KEY &&
+ list_empty(&pid_entry->he_pid_list)) {
+ hash = hash_long((ulong_t)pid_entry->he_key *
+ (ulong_t)pid_entry->he_pid, table->ht_bits);
+ pid_entry_bin = &table->ht_bins[hash];
+
+ spin_lock(&pid_entry_bin->hb_lock);
+ tsd_hash_del(table, pid_entry);
+ hlist_add_head(&pid_entry->he_list, &work);
+ spin_unlock(&pid_entry_bin->hb_lock);
+ }
+
+ spin_unlock(&table->ht_lock);
+
+ tsd_hash_dtor(&work);
+}
+
+/*
+ * tsd_set - set thread specific data
+ * @key: lookup key
+ * @value: value to set
+ *
+ * Caller must prevent racing tsd_create() or tsd_destroy(); it is
+ * protected from racing tsd_get() or tsd_set() because the data is
+ * thread specific.
+ * This function has been optimized to be fast for the update case.
+ * When setting the tsd initially it will be slower due to additional
+ * required locking and potential memory allocations.
+ */
+int
+tsd_set(uint_t key, void *value)
+{
+ tsd_hash_table_t *table;
+ tsd_hash_entry_t *entry;
+ pid_t pid;
+ int rc;
+ /* mark remove if value is NULL */
+ boolean_t remove = (value == NULL);
+
+ table = tsd_hash_table;
+ pid = curthread->pid;
+ ASSERT3P(table, !=, NULL);
+
+ if ((key == 0) || (key > TSD_KEYS_MAX))
+ return (EINVAL);
+
+ /* Entry already exists in the hash table, update its value */
+ entry = tsd_hash_search(table, key, pid);
+ if (entry) {
+ entry->he_value = value;
+ /* remove the entry */
+ if (remove)
+ tsd_remove_entry(entry);
+ return (0);
+ }
+
+ /* don't create entry if value is NULL */
+ if (remove)
+ return (0);
+
+ /* Add a process entry to the hash if one does not yet exist */
+ entry = tsd_hash_search(table, PID_KEY, pid);
+ if (entry == NULL) {
+ rc = tsd_hash_add_pid(table, pid);
+ if (rc)
+ return (rc);
+ }
+
+ rc = tsd_hash_add(table, key, pid, value);
+ return (rc);
+}
+EXPORT_SYMBOL(tsd_set);
+
+/*
+ * tsd_get - get thread specific data
+ * @key: lookup key
+ *
+ * Caller must prevent racing tsd_create() or tsd_destroy(). This
+ * implementation is designed to be fast and scalable; it does not
+ * lock the entire table, only a single hash bin.
+ */
+void *
+tsd_get(uint_t key)
+{
+ tsd_hash_entry_t *entry;
+
+ ASSERT3P(tsd_hash_table, !=, NULL);
+
+ if ((key == 0) || (key > TSD_KEYS_MAX))
+ return (NULL);
+
+ entry = tsd_hash_search(tsd_hash_table, key, curthread->pid);
+ if (entry == NULL)
+ return (NULL);
+
+ return (entry->he_value);
+}
+EXPORT_SYMBOL(tsd_get);
+
+/*
+ * tsd_get_by_thread - get thread specific data for specified thread
+ * @key: lookup key
+ * @thread: thread to lookup
+ *
+ * Caller must prevent racing tsd_create() or tsd_destroy(). This
+ * implementation is designed to be fast and scalable; it does not
+ * lock the entire table, only a single hash bin.
+ */
+void *
+tsd_get_by_thread(uint_t key, kthread_t *thread)
+{
+ tsd_hash_entry_t *entry;
+
+ ASSERT3P(tsd_hash_table, !=, NULL);
+
+ if ((key == 0) || (key > TSD_KEYS_MAX))
+ return (NULL);
+
+ entry = tsd_hash_search(tsd_hash_table, key, thread->pid);
+ if (entry == NULL)
+ return (NULL);
+
+ return (entry->he_value);
+}
+EXPORT_SYMBOL(tsd_get_by_thread);
+
+/*
+ * tsd_create - create thread specific data key
+ * @keyp: lookup key address
+ * @dtor: destructor called during tsd_destroy() or tsd_exit()
+ *
+ * Provided key must be set to 0 or it is assumed to be already in use.
+ * The dtor is allowed to be NULL in which case no additional cleanup
+ * for the data is performed during tsd_destroy() or tsd_exit().
+ *
+ * Caller must prevent racing tsd_set() or tsd_get(), this function is
+ * safe from racing tsd_create(), tsd_destroy(), and tsd_exit().
+ */
+void
+tsd_create(uint_t *keyp, dtor_func_t dtor)
+{
+ ASSERT3P(keyp, !=, NULL);
+ if (*keyp)
+ return;
+
+ (void) tsd_hash_add_key(tsd_hash_table, keyp, dtor);
+}
+EXPORT_SYMBOL(tsd_create);
+
+/*
+ * tsd_destroy - destroy thread specific data
+ * @keyp: lookup key address
+ *
+ * Destroys the thread specific data on all threads which use this key.
+ *
+ * Caller must prevent racing tsd_set() or tsd_get(), this function is
+ * safe from racing tsd_create(), tsd_destroy(), and tsd_exit().
+ */
+void
+tsd_destroy(uint_t *keyp)
+{
+ HLIST_HEAD(work);
+ tsd_hash_table_t *table;
+ tsd_hash_entry_t *dtor_entry, *entry;
+ tsd_hash_bin_t *dtor_entry_bin, *entry_bin;
+ ulong_t hash;
+
+ table = tsd_hash_table;
+ ASSERT3P(table, !=, NULL);
+
+ spin_lock(&table->ht_lock);
+ dtor_entry = tsd_hash_search(table, *keyp, DTOR_PID);
+ if (dtor_entry == NULL) {
+ spin_unlock(&table->ht_lock);
+ return;
+ }
+
+ /*
+ * All threads which use this key must be linked off of the
+ * DTOR_PID entry. They are removed from the hash table and
+ * linked in to a private working list to be destroyed.
+ */
+ while (!list_empty(&dtor_entry->he_key_list)) {
+ entry = list_entry(dtor_entry->he_key_list.next,
+ tsd_hash_entry_t, he_key_list);
+ ASSERT3U(dtor_entry->he_key, ==, entry->he_key);
+ ASSERT3P(dtor_entry->he_dtor, ==, entry->he_dtor);
+
+ hash = hash_long((ulong_t)entry->he_key *
+ (ulong_t)entry->he_pid, table->ht_bits);
+ entry_bin = &table->ht_bins[hash];
+
+ spin_lock(&entry_bin->hb_lock);
+ tsd_hash_del(table, entry);
+ hlist_add_head(&entry->he_list, &work);
+ spin_unlock(&entry_bin->hb_lock);
+ }
+
+ hash = hash_long((ulong_t)dtor_entry->he_key *
+ (ulong_t)dtor_entry->he_pid, table->ht_bits);
+ dtor_entry_bin = &table->ht_bins[hash];
+
+ spin_lock(&dtor_entry_bin->hb_lock);
+ tsd_hash_del(table, dtor_entry);
+ hlist_add_head(&dtor_entry->he_list, &work);
+ spin_unlock(&dtor_entry_bin->hb_lock);
+ spin_unlock(&table->ht_lock);
+
+ tsd_hash_dtor(&work);
+ *keyp = 0;
+}
+EXPORT_SYMBOL(tsd_destroy);
+
+/*
+ * tsd_exit - destroys all thread specific data for this thread
+ *
+ * Destroys all the thread specific data for this thread.
+ *
+ * Caller must prevent racing tsd_set() or tsd_get(), this function is
+ * safe from racing tsd_create(), tsd_destroy(), and tsd_exit().
+ */
+void
+tsd_exit(void)
+{
+ HLIST_HEAD(work);
+ tsd_hash_table_t *table;
+ tsd_hash_entry_t *pid_entry, *entry;
+ tsd_hash_bin_t *pid_entry_bin, *entry_bin;
+ ulong_t hash;
+
+ table = tsd_hash_table;
+ ASSERT3P(table, !=, NULL);
+
+ spin_lock(&table->ht_lock);
+ pid_entry = tsd_hash_search(table, PID_KEY, curthread->pid);
+ if (pid_entry == NULL) {
+ spin_unlock(&table->ht_lock);
+ return;
+ }
+
+ /*
+ * All keys associated with this pid must be linked off of the
+ * PID_KEY entry. They are removed from the hash table and
+ * linked in to a private working list to be destroyed.
+ */
+
+ while (!list_empty(&pid_entry->he_pid_list)) {
+ entry = list_entry(pid_entry->he_pid_list.next,
+ tsd_hash_entry_t, he_pid_list);
+ ASSERT3U(pid_entry->he_pid, ==, entry->he_pid);
+
+ hash = hash_long((ulong_t)entry->he_key *
+ (ulong_t)entry->he_pid, table->ht_bits);
+ entry_bin = &table->ht_bins[hash];
+
+ spin_lock(&entry_bin->hb_lock);
+ tsd_hash_del(table, entry);
+ hlist_add_head(&entry->he_list, &work);
+ spin_unlock(&entry_bin->hb_lock);
+ }
+
+ hash = hash_long((ulong_t)pid_entry->he_key *
+ (ulong_t)pid_entry->he_pid, table->ht_bits);
+ pid_entry_bin = &table->ht_bins[hash];
+
+ spin_lock(&pid_entry_bin->hb_lock);
+ tsd_hash_del(table, pid_entry);
+ hlist_add_head(&pid_entry->he_list, &work);
+ spin_unlock(&pid_entry_bin->hb_lock);
+ spin_unlock(&table->ht_lock);
+
+ tsd_hash_dtor(&work);
+}
+EXPORT_SYMBOL(tsd_exit);
+
+int
+spl_tsd_init(void)
+{
+ tsd_hash_table = tsd_hash_table_init(TSD_HASH_TABLE_BITS_DEFAULT);
+ if (tsd_hash_table == NULL)
+ return (1);
+
+ return (0);
+}
+
+void
+spl_tsd_fini(void)
+{
+ tsd_hash_table_fini(tsd_hash_table);
+ tsd_hash_table = NULL;
+}
diff --git a/module/spl/spl-vmem.c b/module/spl/spl-vmem.c
new file mode 100644
index 000000000..e1a84a911
--- /dev/null
+++ b/module/spl/spl-vmem.c
@@ -0,0 +1,135 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <sys/debug.h>
+#include <sys/vmem.h>
+#include <sys/kmem_cache.h>
+#include <sys/shrinker.h>
+#include <linux/module.h>
+
+vmem_t *heap_arena = NULL;
+EXPORT_SYMBOL(heap_arena);
+
+vmem_t *zio_alloc_arena = NULL;
+EXPORT_SYMBOL(zio_alloc_arena);
+
+vmem_t *zio_arena = NULL;
+EXPORT_SYMBOL(zio_arena);
+
+#define VMEM_FLOOR_SIZE (4 * 1024 * 1024) /* 4MB floor */
+
+/*
+ * Return approximate virtual memory usage based on these assumptions:
+ *
+ * 1) The major SPL consumer of virtual memory is the kmem cache.
+ * 2) Memory allocated with vmem_alloc() is short lived and can be ignored.
+ * 3) Allow a 4MB floor as a generous pad given normal consumption.
+ * 4) The spl_kmem_cache_sem only contends with cache create/destroy.
+ */
+size_t
+vmem_size(vmem_t *vmp, int typemask)
+{
+ spl_kmem_cache_t *skc;
+ size_t alloc = VMEM_FLOOR_SIZE;
+
+ if ((typemask & VMEM_ALLOC) && (typemask & VMEM_FREE))
+ return (VMALLOC_TOTAL);
+
+ down_read(&spl_kmem_cache_sem);
+ list_for_each_entry(skc, &spl_kmem_cache_list, skc_list) {
+ if (skc->skc_flags & KMC_VMEM)
+ alloc += skc->skc_slab_size * skc->skc_slab_total;
+ }
+ up_read(&spl_kmem_cache_sem);
+
+ if (typemask & VMEM_ALLOC)
+ return (MIN(alloc, VMALLOC_TOTAL));
+ else if (typemask & VMEM_FREE)
+ return (MAX(VMALLOC_TOTAL - alloc, 0));
+ else
+ return (0);
+}
+EXPORT_SYMBOL(vmem_size);
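+
+/*
+ * Illustrative sketch: callers typically probe the arena like this,
+ * where the vmp argument is unused by this implementation:
+ *
+ *	size_t used = vmem_size(NULL, VMEM_ALLOC);
+ *	size_t free = vmem_size(NULL, VMEM_FREE);
+ *	size_t total = vmem_size(NULL, VMEM_ALLOC | VMEM_FREE);
+ */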
+
+/*
+ * Public vmem_alloc(), vmem_zalloc() and vmem_free() interfaces.
+ */
+void *
+spl_vmem_alloc(size_t size, int flags, const char *func, int line)
+{
+ ASSERT0(flags & ~KM_PUBLIC_MASK);
+
+ flags |= KM_VMEM;
+
+#if !defined(DEBUG_KMEM)
+ return (spl_kmem_alloc_impl(size, flags, NUMA_NO_NODE));
+#elif !defined(DEBUG_KMEM_TRACKING)
+ return (spl_kmem_alloc_debug(size, flags, NUMA_NO_NODE));
+#else
+ return (spl_kmem_alloc_track(size, flags, func, line, NUMA_NO_NODE));
+#endif
+}
+EXPORT_SYMBOL(spl_vmem_alloc);
+
+void *
+spl_vmem_zalloc(size_t size, int flags, const char *func, int line)
+{
+ ASSERT0(flags & ~KM_PUBLIC_MASK);
+
+ flags |= (KM_VMEM | KM_ZERO);
+
+#if !defined(DEBUG_KMEM)
+ return (spl_kmem_alloc_impl(size, flags, NUMA_NO_NODE));
+#elif !defined(DEBUG_KMEM_TRACKING)
+ return (spl_kmem_alloc_debug(size, flags, NUMA_NO_NODE));
+#else
+ return (spl_kmem_alloc_track(size, flags, func, line, NUMA_NO_NODE));
+#endif
+}
+EXPORT_SYMBOL(spl_vmem_zalloc);
+
+void
+spl_vmem_free(const void *buf, size_t size)
+{
+#if !defined(DEBUG_KMEM)
+ return (spl_kmem_free_impl(buf, size));
+#elif !defined(DEBUG_KMEM_TRACKING)
+ return (spl_kmem_free_debug(buf, size));
+#else
+ return (spl_kmem_free_track(buf, size));
+#endif
+}
+EXPORT_SYMBOL(spl_vmem_free);
+
+int
+spl_vmem_init(void)
+{
+ return (0);
+}
+
+void
+spl_vmem_fini(void)
+{
+}
diff --git a/module/spl/spl-vnode.c b/module/spl/spl-vnode.c
new file mode 100644
index 000000000..28ce21276
--- /dev/null
+++ b/module/spl/spl-vnode.c
@@ -0,0 +1,779 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Solaris Porting Layer (SPL) Vnode Implementation.
+ */
+
+#include <sys/cred.h>
+#include <sys/vnode.h>
+#include <sys/kmem_cache.h>
+#include <linux/falloc.h>
+#include <linux/fs.h>
+#include <linux/uaccess.h>
+#ifdef HAVE_FDTABLE_HEADER
+#include <linux/fdtable.h>
+#endif
+
+vnode_t *rootdir = (vnode_t *)0xabcd1234;
+EXPORT_SYMBOL(rootdir);
+
+static spl_kmem_cache_t *vn_cache;
+static spl_kmem_cache_t *vn_file_cache;
+
+static DEFINE_SPINLOCK(vn_file_lock);
+static LIST_HEAD(vn_file_list);
+
+static int
+spl_filp_fallocate(struct file *fp, int mode, loff_t offset, loff_t len)
+{
+ int error = -EOPNOTSUPP;
+
+#ifdef HAVE_FILE_FALLOCATE
+ if (fp->f_op->fallocate)
+ error = fp->f_op->fallocate(fp, mode, offset, len);
+#else
+#ifdef HAVE_INODE_FALLOCATE
+ if (fp->f_dentry && fp->f_dentry->d_inode &&
+ fp->f_dentry->d_inode->i_op->fallocate)
+ error = fp->f_dentry->d_inode->i_op->fallocate(
+ fp->f_dentry->d_inode, mode, offset, len);
+#endif /* HAVE_INODE_FALLOCATE */
+#endif /* HAVE_FILE_FALLOCATE */
+
+ return (error);
+}
+
+static int
+spl_filp_fsync(struct file *fp, int sync)
+{
+#ifdef HAVE_2ARGS_VFS_FSYNC
+ return (vfs_fsync(fp, sync));
+#else
+ return (vfs_fsync(fp, (fp)->f_dentry, sync));
+#endif /* HAVE_2ARGS_VFS_FSYNC */
+}
+
+static ssize_t
+spl_kernel_write(struct file *file, const void *buf, size_t count, loff_t *pos)
+{
+#if defined(HAVE_KERNEL_WRITE_PPOS)
+ return (kernel_write(file, buf, count, pos));
+#else
+ mm_segment_t saved_fs;
+ ssize_t ret;
+
+ saved_fs = get_fs();
+ set_fs(get_ds());
+
+ ret = vfs_write(file, (__force const char __user *)buf, count, pos);
+
+ set_fs(saved_fs);
+
+ return (ret);
+#endif
+}
+
+static ssize_t
+spl_kernel_read(struct file *file, void *buf, size_t count, loff_t *pos)
+{
+#if defined(HAVE_KERNEL_READ_PPOS)
+ return (kernel_read(file, buf, count, pos));
+#else
+ mm_segment_t saved_fs;
+ ssize_t ret;
+
+ saved_fs = get_fs();
+ set_fs(get_ds());
+
+ ret = vfs_read(file, (void __user *)buf, count, pos);
+
+ set_fs(saved_fs);
+
+ return (ret);
+#endif
+}
+
+vtype_t
+vn_mode_to_vtype(mode_t mode)
+{
+ if (S_ISREG(mode))
+ return (VREG);
+
+ if (S_ISDIR(mode))
+ return (VDIR);
+
+ if (S_ISCHR(mode))
+ return (VCHR);
+
+ if (S_ISBLK(mode))
+ return (VBLK);
+
+ if (S_ISFIFO(mode))
+ return (VFIFO);
+
+ if (S_ISLNK(mode))
+ return (VLNK);
+
+ if (S_ISSOCK(mode))
+ return (VSOCK);
+
+ return (VNON);
+} /* vn_mode_to_vtype() */
+EXPORT_SYMBOL(vn_mode_to_vtype);
+
+mode_t
+vn_vtype_to_mode(vtype_t vtype)
+{
+ if (vtype == VREG)
+ return (S_IFREG);
+
+ if (vtype == VDIR)
+ return (S_IFDIR);
+
+ if (vtype == VCHR)
+ return (S_IFCHR);
+
+ if (vtype == VBLK)
+ return (S_IFBLK);
+
+ if (vtype == VFIFO)
+ return (S_IFIFO);
+
+ if (vtype == VLNK)
+ return (S_IFLNK);
+
+ if (vtype == VSOCK)
+ return (S_IFSOCK);
+
+ return (VNON);
+} /* vn_vtype_to_mode() */
+EXPORT_SYMBOL(vn_vtype_to_mode);
+
+vnode_t *
+vn_alloc(int flag)
+{
+ vnode_t *vp;
+
+ vp = kmem_cache_alloc(vn_cache, flag);
+ if (vp != NULL) {
+ vp->v_file = NULL;
+ vp->v_type = 0;
+ }
+
+ return (vp);
+} /* vn_alloc() */
+EXPORT_SYMBOL(vn_alloc);
+
+void
+vn_free(vnode_t *vp)
+{
+ kmem_cache_free(vn_cache, vp);
+} /* vn_free() */
+EXPORT_SYMBOL(vn_free);
+
+int
+vn_open(const char *path, uio_seg_t seg, int flags, int mode, vnode_t **vpp,
+ int x1, void *x2)
+{
+ struct file *fp;
+ struct kstat stat;
+ int rc, saved_umask = 0;
+ gfp_t saved_gfp;
+ vnode_t *vp;
+
+ ASSERT(flags & (FWRITE | FREAD));
+ ASSERT(seg == UIO_SYSSPACE);
+ ASSERT(vpp);
+ *vpp = NULL;
+
+ if (!(flags & FCREAT) && (flags & FWRITE))
+ flags |= FEXCL;
+
+ /*
+ * Note for filp_open() the two low bits must be remapped to mean:
+ * 01 - read-only -> 00 read-only
+ * 10 - write-only -> 01 write-only
+ * 11 - read-write -> 10 read-write
+ */
+ flags--;
+
+ if (flags & FCREAT)
+ saved_umask = xchg(&current->fs->umask, 0);
+
+ fp = filp_open(path, flags, mode);
+
+ if (flags & FCREAT)
+ (void) xchg(&current->fs->umask, saved_umask);
+
+ if (IS_ERR(fp))
+ return (-PTR_ERR(fp));
+
+#if defined(HAVE_4ARGS_VFS_GETATTR)
+ rc = vfs_getattr(&fp->f_path, &stat, STATX_TYPE, AT_STATX_SYNC_AS_STAT);
+#elif defined(HAVE_2ARGS_VFS_GETATTR)
+ rc = vfs_getattr(&fp->f_path, &stat);
+#else
+ rc = vfs_getattr(fp->f_path.mnt, fp->f_dentry, &stat);
+#endif
+ if (rc) {
+ filp_close(fp, 0);
+ return (-rc);
+ }
+
+ vp = vn_alloc(KM_SLEEP);
+ if (!vp) {
+ filp_close(fp, 0);
+ return (ENOMEM);
+ }
+
+ saved_gfp = mapping_gfp_mask(fp->f_mapping);
+ mapping_set_gfp_mask(fp->f_mapping, saved_gfp & ~(__GFP_IO|__GFP_FS));
+
+ mutex_enter(&vp->v_lock);
+ vp->v_type = vn_mode_to_vtype(stat.mode);
+ vp->v_file = fp;
+ vp->v_gfp_mask = saved_gfp;
+ *vpp = vp;
+ mutex_exit(&vp->v_lock);
+
+ return (0);
+} /* vn_open() */
+EXPORT_SYMBOL(vn_open);
+
+int
+vn_openat(const char *path, uio_seg_t seg, int flags, int mode,
+ vnode_t **vpp, int x1, void *x2, vnode_t *vp, int fd)
+{
+ char *realpath;
+ int len, rc;
+
+ ASSERT(vp == rootdir);
+
+ len = strlen(path) + 2;
+ realpath = kmalloc(len, kmem_flags_convert(KM_SLEEP));
+ if (!realpath)
+ return (ENOMEM);
+
+ (void) snprintf(realpath, len, "/%s", path);
+ rc = vn_open(realpath, seg, flags, mode, vpp, x1, x2);
+ kfree(realpath);
+
+ return (rc);
+} /* vn_openat() */
+EXPORT_SYMBOL(vn_openat);
+
+int
+vn_rdwr(uio_rw_t uio, vnode_t *vp, void *addr, ssize_t len, offset_t off,
+ uio_seg_t seg, int ioflag, rlim64_t x2, void *x3, ssize_t *residp)
+{
+ struct file *fp = vp->v_file;
+ loff_t offset = off;
+ int rc;
+
+ ASSERT(uio == UIO_WRITE || uio == UIO_READ);
+ ASSERT(seg == UIO_SYSSPACE);
+ ASSERT((ioflag & ~FAPPEND) == 0);
+
+ if (ioflag & FAPPEND)
+ offset = fp->f_pos;
+
+ if (uio & UIO_WRITE)
+ rc = spl_kernel_write(fp, addr, len, &offset);
+ else
+ rc = spl_kernel_read(fp, addr, len, &offset);
+
+ fp->f_pos = offset;
+
+ if (rc < 0)
+ return (-rc);
+
+ if (residp) {
+ *residp = len - rc;
+ } else {
+ if (rc != len)
+ return (EIO);
+ }
+
+ return (0);
+} /* vn_rdwr() */
+EXPORT_SYMBOL(vn_rdwr);
+
+int
+vn_close(vnode_t *vp, int flags, int x1, int x2, void *x3, void *x4)
+{
+ int rc;
+
+ ASSERT(vp);
+ ASSERT(vp->v_file);
+
+ mapping_set_gfp_mask(vp->v_file->f_mapping, vp->v_gfp_mask);
+ rc = filp_close(vp->v_file, 0);
+ vn_free(vp);
+
+ return (-rc);
+} /* vn_close() */
+EXPORT_SYMBOL(vn_close);
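+
+/*
+ * Illustrative sketch of the open/read/close cycle these interfaces
+ * provide; the path and buffer are hypothetical:
+ *
+ *	vnode_t *vp;
+ *	char buf[512];
+ *	ssize_t resid;
+ *
+ *	error = vn_open("/etc/hostid", UIO_SYSSPACE, FREAD, 0644,
+ *	    &vp, 0, NULL);
+ *	if (error == 0) {
+ *		error = vn_rdwr(UIO_READ, vp, buf, sizeof (buf), 0,
+ *		    UIO_SYSSPACE, 0, RLIM64_INFINITY, NULL, &resid);
+ *		(void) vn_close(vp, 0, 0, 0, NULL, NULL);
+ *	}
+ */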
+
+/*
+ * vn_seek() does not actually seek; it only performs bounds checking on
+ * the proposed seek. We perform minimal checking and allow vn_rdwr() to
+ * catch anything more serious.
+ */
+int
+vn_seek(vnode_t *vp, offset_t ooff, offset_t *noffp, void *ct)
+{
+ return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0);
+}
+EXPORT_SYMBOL(vn_seek);
+
+int
+vn_getattr(vnode_t *vp, vattr_t *vap, int flags, void *x3, void *x4)
+{
+ struct file *fp;
+ struct kstat stat;
+ int rc;
+
+ ASSERT(vp);
+ ASSERT(vp->v_file);
+ ASSERT(vap);
+
+ fp = vp->v_file;
+
+#if defined(HAVE_4ARGS_VFS_GETATTR)
+ rc = vfs_getattr(&fp->f_path, &stat, STATX_BASIC_STATS,
+ AT_STATX_SYNC_AS_STAT);
+#elif defined(HAVE_2ARGS_VFS_GETATTR)
+ rc = vfs_getattr(&fp->f_path, &stat);
+#else
+ rc = vfs_getattr(fp->f_path.mnt, fp->f_dentry, &stat);
+#endif
+ if (rc)
+ return (-rc);
+
+ vap->va_type = vn_mode_to_vtype(stat.mode);
+ vap->va_mode = stat.mode;
+ vap->va_uid = KUID_TO_SUID(stat.uid);
+ vap->va_gid = KGID_TO_SGID(stat.gid);
+ vap->va_fsid = 0;
+ vap->va_nodeid = stat.ino;
+ vap->va_nlink = stat.nlink;
+ vap->va_size = stat.size;
+ vap->va_blksize = stat.blksize;
+ vap->va_atime = stat.atime;
+ vap->va_mtime = stat.mtime;
+ vap->va_ctime = stat.ctime;
+ vap->va_rdev = stat.rdev;
+ vap->va_nblocks = stat.blocks;
+
+ return (0);
+}
+EXPORT_SYMBOL(vn_getattr);
+
+int
+vn_fsync(vnode_t *vp, int flags, void *x3, void *x4)
+{
+ int datasync = 0;
+ int error;
+ int fstrans;
+
+ ASSERT(vp);
+ ASSERT(vp->v_file);
+
+ if (flags & FDSYNC)
+ datasync = 1;
+
+ /*
+	 * May enter XFS, which generates a warning when PF_FSTRANS is set.
+	 * To avoid this the flag is cleared across the fsync call and then
+	 * reset.
+ */
+ fstrans = __spl_pf_fstrans_check();
+ if (fstrans)
+ current->flags &= ~(__SPL_PF_FSTRANS);
+
+ error = -spl_filp_fsync(vp->v_file, datasync);
+ if (fstrans)
+ current->flags |= __SPL_PF_FSTRANS;
+
+ return (error);
+} /* vn_fsync() */
+EXPORT_SYMBOL(vn_fsync);
+
+int
+vn_space(vnode_t *vp, int cmd, struct flock *bfp, int flag,
+    offset_t offset, void *x6, void *x7)
+{
+ int error = EOPNOTSUPP;
+#ifdef FALLOC_FL_PUNCH_HOLE
+ int fstrans;
+#endif
+
+ if (cmd != F_FREESP || bfp->l_whence != 0)
+ return (EOPNOTSUPP);
+
+ ASSERT(vp);
+ ASSERT(vp->v_file);
+ ASSERT(bfp->l_start >= 0 && bfp->l_len > 0);
+
+#ifdef FALLOC_FL_PUNCH_HOLE
+ /*
+	 * May enter XFS, which generates a warning when PF_FSTRANS is set.
+	 * To avoid this the flag is cleared across the fallocate() call and
+	 * then reset.
+ */
+ fstrans = __spl_pf_fstrans_check();
+ if (fstrans)
+ current->flags &= ~(__SPL_PF_FSTRANS);
+
+ /*
+	 * When supported by the underlying file system, preferentially
+	 * use the fallocate() callback to punch a hole in the file and
+	 * free the requested range.
+ */
+ error = -spl_filp_fallocate(vp->v_file,
+ FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE,
+ bfp->l_start, bfp->l_len);
+
+ if (fstrans)
+ current->flags |= __SPL_PF_FSTRANS;
+
+ if (error == 0)
+ return (0);
+#endif
+
+#ifdef HAVE_INODE_TRUNCATE_RANGE
+ if (vp->v_file->f_dentry && vp->v_file->f_dentry->d_inode &&
+ vp->v_file->f_dentry->d_inode->i_op &&
+ vp->v_file->f_dentry->d_inode->i_op->truncate_range) {
+ off_t end = bfp->l_start + bfp->l_len;
+ /*
+ * Judging from the code in shmem_truncate_range(),
+ * it seems the kernel expects the end offset to be
+ * inclusive and aligned to the end of a page.
+ */
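+		/*
+		 * Worked example, assuming 4096-byte pages: for
+		 * l_start = 0 and l_len = 6000, end starts at 6000, is
+		 * rounded down to 4096, then decremented to 4095, so
+		 * exactly the first whole page [0, 4095] is truncated.
+		 */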
+ if (end % PAGE_SIZE != 0) {
+ end &= ~(off_t)(PAGE_SIZE - 1);
+ if (end <= bfp->l_start)
+ return (0);
+ }
+ --end;
+
+ vp->v_file->f_dentry->d_inode->i_op->truncate_range(
+ vp->v_file->f_dentry->d_inode, bfp->l_start, end);
+
+ return (0);
+ }
+#endif
+
+ return (error);
+}
+EXPORT_SYMBOL(vn_space);
+
+/* Function must be called while holding the vn_file_lock */
+static file_t *
+file_find(int fd, struct task_struct *task)
+{
+ file_t *fp;
+
+ list_for_each_entry(fp, &vn_file_list, f_list) {
+ if (fd == fp->f_fd && fp->f_task == task) {
+ ASSERT(atomic_read(&fp->f_ref) != 0);
+ return (fp);
+ }
+ }
+
+ return (NULL);
+} /* file_find() */
+
+file_t *
+vn_getf(int fd)
+{
+ struct kstat stat;
+ struct file *lfp;
+ file_t *fp;
+ vnode_t *vp;
+ int rc = 0;
+
+ if (fd < 0)
+ return (NULL);
+
+	/* Already open; just take an extra reference */
+ spin_lock(&vn_file_lock);
+
+ fp = file_find(fd, current);
+ if (fp) {
+ lfp = fget(fd);
+ fput(fp->f_file);
+ /*
+ * areleasef() can cause us to see a stale reference when
+ * userspace has reused a file descriptor before areleasef()
+ * has run. fput() the stale reference and replace it. We
+ * retain the original reference count such that the concurrent
+ * areleasef() will decrement its reference and terminate.
+ */
+ if (lfp != fp->f_file) {
+ fp->f_file = lfp;
+ fp->f_vnode->v_file = lfp;
+ }
+ atomic_inc(&fp->f_ref);
+ spin_unlock(&vn_file_lock);
+ return (fp);
+ }
+
+ spin_unlock(&vn_file_lock);
+
+	/* File was not yet open; create and set up the tracking object */
+ fp = kmem_cache_alloc(vn_file_cache, KM_SLEEP);
+ if (fp == NULL)
+ goto out;
+
+ mutex_enter(&fp->f_lock);
+
+ fp->f_fd = fd;
+ fp->f_task = current;
+ fp->f_offset = 0;
+ atomic_inc(&fp->f_ref);
+
+ lfp = fget(fd);
+ if (lfp == NULL)
+ goto out_mutex;
+
+ vp = vn_alloc(KM_SLEEP);
+ if (vp == NULL)
+ goto out_fget;
+
+#if defined(HAVE_4ARGS_VFS_GETATTR)
+ rc = vfs_getattr(&lfp->f_path, &stat, STATX_TYPE,
+ AT_STATX_SYNC_AS_STAT);
+#elif defined(HAVE_2ARGS_VFS_GETATTR)
+ rc = vfs_getattr(&lfp->f_path, &stat);
+#else
+ rc = vfs_getattr(lfp->f_path.mnt, lfp->f_dentry, &stat);
+#endif
+ if (rc)
+ goto out_vnode;
+
+ mutex_enter(&vp->v_lock);
+ vp->v_type = vn_mode_to_vtype(stat.mode);
+ vp->v_file = lfp;
+ mutex_exit(&vp->v_lock);
+
+ fp->f_vnode = vp;
+ fp->f_file = lfp;
+
+ /* Put it on the tracking list */
+ spin_lock(&vn_file_lock);
+ list_add(&fp->f_list, &vn_file_list);
+ spin_unlock(&vn_file_lock);
+
+ mutex_exit(&fp->f_lock);
+ return (fp);
+
+out_vnode:
+ vn_free(vp);
+out_fget:
+ fput(lfp);
+out_mutex:
+ mutex_exit(&fp->f_lock);
+ kmem_cache_free(vn_file_cache, fp);
+out:
+ return (NULL);
+} /* vn_getf() */
+EXPORT_SYMBOL(getf);
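+
+/*
+ * Illustrative caller pattern (hypothetical): take a tracked reference
+ * on a user file descriptor, perform I/O through the wrapped vnode,
+ * then drop the reference so the file_t can be reclaimed:
+ *
+ *    file_t *fp = getf(fd);
+ *    if (fp == NULL)
+ *            return (EBADF);
+ *    rc = vn_rdwr(UIO_READ, fp->f_vnode, buf, len, fp->f_offset,
+ *        UIO_SYSSPACE, 0, RLIM64_INFINITY, NULL, NULL);
+ *    releasef(fd);
+ */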
+
+static void
+releasef_locked(file_t *fp)
+{
+ ASSERT(fp->f_file);
+ ASSERT(fp->f_vnode);
+
+	/* Unlinked from the list with no refs remaining; safe to free */
+ fput(fp->f_file);
+ vn_free(fp->f_vnode);
+
+ kmem_cache_free(vn_file_cache, fp);
+}
+
+void
+vn_releasef(int fd)
+{
+ areleasef(fd, P_FINFO(current));
+}
+EXPORT_SYMBOL(releasef);
+
+void
+vn_areleasef(int fd, uf_info_t *fip)
+{
+ file_t *fp;
+ struct task_struct *task = (struct task_struct *)fip;
+
+ if (fd < 0)
+ return;
+
+ spin_lock(&vn_file_lock);
+ fp = file_find(fd, task);
+ if (fp) {
+ atomic_dec(&fp->f_ref);
+ if (atomic_read(&fp->f_ref) > 0) {
+ spin_unlock(&vn_file_lock);
+ return;
+ }
+
+ list_del(&fp->f_list);
+ releasef_locked(fp);
+ }
+ spin_unlock(&vn_file_lock);
+} /* vn_areleasef() */
+EXPORT_SYMBOL(areleasef);
+
+static void
+#ifdef HAVE_SET_FS_PWD_WITH_CONST
+vn_set_fs_pwd(struct fs_struct *fs, const struct path *path)
+#else
+vn_set_fs_pwd(struct fs_struct *fs, struct path *path)
+#endif /* HAVE_SET_FS_PWD_WITH_CONST */
+{
+ struct path old_pwd;
+
+#ifdef HAVE_FS_STRUCT_SPINLOCK
+ spin_lock(&fs->lock);
+ old_pwd = fs->pwd;
+ fs->pwd = *path;
+ path_get(path);
+ spin_unlock(&fs->lock);
+#else
+ write_lock(&fs->lock);
+ old_pwd = fs->pwd;
+ fs->pwd = *path;
+ path_get(path);
+ write_unlock(&fs->lock);
+#endif /* HAVE_FS_STRUCT_SPINLOCK */
+
+ if (old_pwd.dentry)
+ path_put(&old_pwd);
+}
+
+int
+vn_set_pwd(const char *filename)
+{
+ struct path path;
+ mm_segment_t saved_fs;
+ int rc;
+
+ /*
+	 * user_path_dir() and __user_walk() both expect 'filename' to be
+	 * a user space address, so we must briefly raise the address limit
+	 * with set_fs(get_ds()) to ensure strncpy_from_user() does not
+	 * fail with -EFAULT.
+ */
+ saved_fs = get_fs();
+ set_fs(get_ds());
+
+ rc = user_path_dir(filename, &path);
+ if (rc)
+ goto out;
+
+ rc = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_ACCESS);
+ if (rc)
+ goto dput_and_out;
+
+ vn_set_fs_pwd(current->fs, &path);
+
+dput_and_out:
+ path_put(&path);
+out:
+ set_fs(saved_fs);
+
+ return (-rc);
+} /* vn_set_pwd() */
+EXPORT_SYMBOL(vn_set_pwd);
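+
+/*
+ * Usage sketch: module initialization code typically pins the working
+ * directory to the root of the namespace, e.g. (void) vn_set_pwd("/"),
+ * so later relative path lookups resolve predictably.
+ */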
+
+static int
+vn_cache_constructor(void *buf, void *cdrarg, int kmflags)
+{
+ struct vnode *vp = buf;
+
+ mutex_init(&vp->v_lock, NULL, MUTEX_DEFAULT, NULL);
+
+ return (0);
+} /* vn_cache_constructor() */
+
+static void
+vn_cache_destructor(void *buf, void *cdrarg)
+{
+ struct vnode *vp = buf;
+
+ mutex_destroy(&vp->v_lock);
+} /* vn_cache_destructor() */
+
+static int
+vn_file_cache_constructor(void *buf, void *cdrarg, int kmflags)
+{
+ file_t *fp = buf;
+
+ atomic_set(&fp->f_ref, 0);
+ mutex_init(&fp->f_lock, NULL, MUTEX_DEFAULT, NULL);
+ INIT_LIST_HEAD(&fp->f_list);
+
+ return (0);
+} /* vn_file_cache_constructor() */
+
+static void
+vn_file_cache_destructor(void *buf, void *cdrarg)
+{
+ file_t *fp = buf;
+
+ mutex_destroy(&fp->f_lock);
+} /* vn_file_cache_destructor() */
+
+int
+spl_vn_init(void)
+{
+ vn_cache = kmem_cache_create("spl_vn_cache",
+ sizeof (struct vnode), 64, vn_cache_constructor,
+ vn_cache_destructor, NULL, NULL, NULL, 0);
+
+ vn_file_cache = kmem_cache_create("spl_vn_file_cache",
+ sizeof (file_t), 64, vn_file_cache_constructor,
+ vn_file_cache_destructor, NULL, NULL, NULL, 0);
+
+ return (0);
+} /* spl_vn_init() */
+
+void
+spl_vn_fini(void)
+{
+ file_t *fp, *next_fp;
+ int leaked = 0;
+
+ spin_lock(&vn_file_lock);
+
+ list_for_each_entry_safe(fp, next_fp, &vn_file_list, f_list) {
+ list_del(&fp->f_list);
+ releasef_locked(fp);
+ leaked++;
+ }
+
+ spin_unlock(&vn_file_lock);
+
+ if (leaked > 0)
+ printk(KERN_WARNING "WARNING: %d vnode files leaked\n", leaked);
+
+ kmem_cache_destroy(vn_file_cache);
+ kmem_cache_destroy(vn_cache);
+} /* spl_vn_fini() */
diff --git a/module/spl/spl-xdr.c b/module/spl/spl-xdr.c
new file mode 100644
index 000000000..2cc3e2a03
--- /dev/null
+++ b/module/spl/spl-xdr.c
@@ -0,0 +1,515 @@
+/*
+ * Copyright (c) 2008-2010 Sun Microsystems, Inc.
+ * Written by Ricardo Correia <Ricardo.M.Correia@Sun.COM>
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Solaris Porting Layer (SPL) XDR Implementation.
+ */
+
+#include <linux/string.h>
+#include <sys/kmem.h>
+#include <sys/debug.h>
+#include <sys/types.h>
+#include <sys/sysmacros.h>
+#include <rpc/xdr.h>
+
+/*
+ * SPL's XDR mem implementation.
+ *
+ * This is used by libnvpair to serialize/deserialize the name-value pair data
+ * structures into byte arrays in a well-defined and portable manner.
+ *
+ * These data structures are used by the DMU/ZFS to flexibly manipulate various
+ * information in memory and later serialize it/deserialize it to disk.
+ * Examples of usages include the pool configuration, lists of pool and dataset
+ * properties, etc.
+ *
+ * Reference documentation for the XDR representation and XDR operations can be
+ * found in RFC 1832 and xdr(3), respectively.
+ *
+ * === Implementation shortcomings ===
+ *
+ * It is assumed that the following C types have the following sizes:
+ *
+ * char/unsigned char: 1 byte
+ * short/unsigned short: 2 bytes
+ * int/unsigned int: 4 bytes
+ * longlong_t/u_longlong_t: 8 bytes
+ *
+ * The C standard allows these types to be larger (and in the case of ints,
+ * shorter), so if that is the case on some compiler/architecture, the build
+ * will fail (on purpose).
+ *
+ * If someone wants to fix the code to work properly in such environments, then:
+ *
+ * 1) Preconditions should be added to xdrmem_enc functions to make sure the
+ * caller doesn't pass arguments which exceed the expected range.
+ * 2) Functions which take signed integers should be changed to properly do
+ * sign extension.
+ * 3) For ints with less than 32 bits, well.. I suspect you'll have bigger
+ * problems than this implementation.
+ *
+ * It is also assumed that:
+ *
+ * 1) Chars have 8 bits.
+ * 2) We can always do 32-bit-aligned int memory accesses and byte-aligned
+ * memcpy, memset and memcmp.
+ * 3) Arrays passed to xdr_array() are packed and the compiler/architecture
+ * supports element-sized-aligned memory accesses.
+ * 4) Negative integers are natively stored in two's complement binary
+ * representation.
+ *
+ * No checks are done for the 4 assumptions above, though.
+ *
+ * === Caller expectations ===
+ *
+ * Existing documentation does not describe the semantics of XDR operations very
+ * well. Therefore, some assumptions about failure semantics will be made and
+ * will be described below:
+ *
+ * 1) If any encoding operation fails (e.g., due to lack of buffer space),
+ * the stream should be considered valid only up to the encoding operation
+ * previous to the one that first failed. However, the stream size as returned
+ * by xdr_control() cannot be considered to be strictly correct (it may be
+ * bigger).
+ *
+ * Putting it another way, if there is an encoding failure it's undefined
+ * whether anything is added to the stream in that operation and therefore
+ * neither xdr_control() nor future encoding operations on the same stream can
+ * be relied upon to produce correct results.
+ *
+ * 2) If a decoding operation fails, it's undefined whether anything will be
+ * decoded into passed buffers/pointers during that operation, or what the
+ * values on those buffers will look like.
+ *
+ * Future decoding operations on the same stream will also have similar
+ * undefined behavior.
+ *
+ * 3) When the first decoding operation fails it is OK to trust the results of
+ * previous decoding operations on the same stream, as long as the caller
+ * expects a failure to be possible (e.g. due to end-of-stream).
+ *
+ * However, this is highly discouraged because the caller should know the
+ * stream size and should be coded to expect any decoding failure to be data
+ * corruption due to hardware, accidental or even malicious causes, which should
+ * be handled gracefully in all cases.
+ *
+ * In very rare situations where there are strong reasons to believe the data
+ * can be trusted to be valid and non-tampered with, then the caller may assume
+ * a decoding failure to be a bug (e.g. due to mismatched data types) and may
+ * fail non-gracefully.
+ *
+ * 4) Non-zero padding bytes will cause the decoding operation to fail.
+ *
+ * 5) Zero bytes on string types will also cause the decoding operation to fail.
+ *
+ * 6) It is assumed that either the pointer to the stream buffer given by the
+ * caller is 32-bit aligned or the architecture supports non-32-bit-aligned int
+ * memory accesses.
+ *
+ * 7) The stream buffer and encoding/decoding buffers/ptrs should not overlap.
+ *
+ * 8) If a caller passes pointers to non-kernel memory (e.g., pointers to user
+ * space or MMIO space), the computer may explode.
+ */
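+
+/*
+ * Illustrative round trip (values hypothetical): encode an unsigned int
+ * into a buffer, then decode it back with a second stream. The xdr_*()
+ * convenience wrappers in <rpc/xdr.h> are assumed to dispatch through
+ * x_ops, which is shown explicitly here:
+ *
+ *    char buf[64];
+ *    XDR xenc, xdec;
+ *    unsigned in = 42, out = 0;
+ *
+ *    xdrmem_create(&xenc, buf, sizeof (buf), XDR_ENCODE);
+ *    if (!xenc.x_ops->xdr_u_int(&xenc, &in))
+ *            return (EINVAL);
+ *
+ *    xdrmem_create(&xdec, buf, sizeof (buf), XDR_DECODE);
+ *    if (!xdec.x_ops->xdr_u_int(&xdec, &out))
+ *            return (EINVAL);
+ *    ASSERT3U(out, ==, 42);
+ */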
+
+static struct xdr_ops xdrmem_encode_ops;
+static struct xdr_ops xdrmem_decode_ops;
+
+typedef int bool_t;
+
+void
+xdrmem_create(XDR *xdrs, const caddr_t addr, const uint_t size,
+ const enum xdr_op op)
+{
+ switch (op) {
+ case XDR_ENCODE:
+ xdrs->x_ops = &xdrmem_encode_ops;
+ break;
+ case XDR_DECODE:
+ xdrs->x_ops = &xdrmem_decode_ops;
+ break;
+ default:
+ xdrs->x_ops = NULL; /* Let the caller know we failed */
+ return;
+ }
+
+ xdrs->x_op = op;
+ xdrs->x_addr = addr;
+ xdrs->x_addr_end = addr + size;
+
+ if (xdrs->x_addr_end < xdrs->x_addr) {
+ xdrs->x_ops = NULL;
+ }
+}
+EXPORT_SYMBOL(xdrmem_create);
+
+static bool_t
+xdrmem_control(XDR *xdrs, int req, void *info)
+{
+ struct xdr_bytesrec *rec = (struct xdr_bytesrec *)info;
+
+ if (req != XDR_GET_BYTES_AVAIL)
+ return (FALSE);
+
+ rec->xc_is_last_record = TRUE; /* always TRUE in xdrmem streams */
+ rec->xc_num_avail = xdrs->x_addr_end - xdrs->x_addr;
+
+ return (TRUE);
+}
+
+static bool_t
+xdrmem_enc_bytes(XDR *xdrs, caddr_t cp, const uint_t cnt)
+{
+ uint_t size = roundup(cnt, 4);
+ uint_t pad;
+
+ if (size < cnt)
+ return (FALSE); /* Integer overflow */
+
+ if (xdrs->x_addr > xdrs->x_addr_end)
+ return (FALSE);
+
+ if (xdrs->x_addr_end - xdrs->x_addr < size)
+ return (FALSE);
+
+ memcpy(xdrs->x_addr, cp, cnt);
+
+ xdrs->x_addr += cnt;
+
+ pad = size - cnt;
+ if (pad > 0) {
+ memset(xdrs->x_addr, 0, pad);
+ xdrs->x_addr += pad;
+ }
+
+ return (TRUE);
+}
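+
+/*
+ * Worked example of the padding rule above: encoding cnt = 5 opaque
+ * bytes consumes roundup(5, 4) = 8 stream bytes, the last 3 of which
+ * are written as zero (and required to be zero again on decode).
+ */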
+
+static bool_t
+xdrmem_dec_bytes(XDR *xdrs, caddr_t cp, const uint_t cnt)
+{
+ static uint32_t zero = 0;
+ uint_t size = roundup(cnt, 4);
+ uint_t pad;
+
+ if (size < cnt)
+ return (FALSE); /* Integer overflow */
+
+ if (xdrs->x_addr > xdrs->x_addr_end)
+ return (FALSE);
+
+ if (xdrs->x_addr_end - xdrs->x_addr < size)
+ return (FALSE);
+
+ memcpy(cp, xdrs->x_addr, cnt);
+ xdrs->x_addr += cnt;
+
+ pad = size - cnt;
+ if (pad > 0) {
+ /* An inverted memchr() would be useful here... */
+ if (memcmp(&zero, xdrs->x_addr, pad) != 0)
+ return (FALSE);
+
+ xdrs->x_addr += pad;
+ }
+
+ return (TRUE);
+}
+
+static bool_t
+xdrmem_enc_uint32(XDR *xdrs, uint32_t val)
+{
+ if (xdrs->x_addr + sizeof (uint32_t) > xdrs->x_addr_end)
+ return (FALSE);
+
+ *((uint32_t *)xdrs->x_addr) = cpu_to_be32(val);
+
+ xdrs->x_addr += sizeof (uint32_t);
+
+ return (TRUE);
+}
+
+static bool_t
+xdrmem_dec_uint32(XDR *xdrs, uint32_t *val)
+{
+ if (xdrs->x_addr + sizeof (uint32_t) > xdrs->x_addr_end)
+ return (FALSE);
+
+ *val = be32_to_cpu(*((uint32_t *)xdrs->x_addr));
+
+ xdrs->x_addr += sizeof (uint32_t);
+
+ return (TRUE);
+}
+
+static bool_t
+xdrmem_enc_char(XDR *xdrs, char *cp)
+{
+ uint32_t val;
+
+ BUILD_BUG_ON(sizeof (char) != 1);
+ val = *((unsigned char *) cp);
+
+ return (xdrmem_enc_uint32(xdrs, val));
+}
+
+static bool_t
+xdrmem_dec_char(XDR *xdrs, char *cp)
+{
+ uint32_t val;
+
+ BUILD_BUG_ON(sizeof (char) != 1);
+
+ if (!xdrmem_dec_uint32(xdrs, &val))
+ return (FALSE);
+
+ /*
+ * If any of the 3 other bytes are non-zero then val will be greater
+ * than 0xff and we fail because according to the RFC, this block does
+ * not have a char encoded in it.
+ */
+ if (val > 0xff)
+ return (FALSE);
+
+ *((unsigned char *) cp) = val;
+
+ return (TRUE);
+}
+
+static bool_t
+xdrmem_enc_ushort(XDR *xdrs, unsigned short *usp)
+{
+ BUILD_BUG_ON(sizeof (unsigned short) != 2);
+
+ return (xdrmem_enc_uint32(xdrs, *usp));
+}
+
+static bool_t
+xdrmem_dec_ushort(XDR *xdrs, unsigned short *usp)
+{
+ uint32_t val;
+
+ BUILD_BUG_ON(sizeof (unsigned short) != 2);
+
+ if (!xdrmem_dec_uint32(xdrs, &val))
+ return (FALSE);
+
+ /*
+	 * Short ints are not in the RFC, but we assume the same logic as
+	 * in xdrmem_dec_char().
+ */
+ if (val > 0xffff)
+ return (FALSE);
+
+ *usp = val;
+
+ return (TRUE);
+}
+
+static bool_t
+xdrmem_enc_uint(XDR *xdrs, unsigned *up)
+{
+ BUILD_BUG_ON(sizeof (unsigned) != 4);
+
+ return (xdrmem_enc_uint32(xdrs, *up));
+}
+
+static bool_t
+xdrmem_dec_uint(XDR *xdrs, unsigned *up)
+{
+ BUILD_BUG_ON(sizeof (unsigned) != 4);
+
+ return (xdrmem_dec_uint32(xdrs, (uint32_t *)up));
+}
+
+static bool_t
+xdrmem_enc_ulonglong(XDR *xdrs, u_longlong_t *ullp)
+{
+ BUILD_BUG_ON(sizeof (u_longlong_t) != 8);
+
+ if (!xdrmem_enc_uint32(xdrs, *ullp >> 32))
+ return (FALSE);
+
+ return (xdrmem_enc_uint32(xdrs, *ullp & 0xffffffff));
+}
+
+static bool_t
+xdrmem_dec_ulonglong(XDR *xdrs, u_longlong_t *ullp)
+{
+ uint32_t low, high;
+
+ BUILD_BUG_ON(sizeof (u_longlong_t) != 8);
+
+ if (!xdrmem_dec_uint32(xdrs, &high))
+ return (FALSE);
+ if (!xdrmem_dec_uint32(xdrs, &low))
+ return (FALSE);
+
+ *ullp = ((u_longlong_t)high << 32) | low;
+
+ return (TRUE);
+}
+
+static bool_t
+xdr_enc_array(XDR *xdrs, caddr_t *arrp, uint_t *sizep, const uint_t maxsize,
+ const uint_t elsize, const xdrproc_t elproc)
+{
+ uint_t i;
+ caddr_t addr = *arrp;
+
+ if (*sizep > maxsize || *sizep > UINT_MAX / elsize)
+ return (FALSE);
+
+ if (!xdrmem_enc_uint(xdrs, sizep))
+ return (FALSE);
+
+ for (i = 0; i < *sizep; i++) {
+ if (!elproc(xdrs, addr))
+ return (FALSE);
+ addr += elsize;
+ }
+
+ return (TRUE);
+}
+
+static bool_t
+xdr_dec_array(XDR *xdrs, caddr_t *arrp, uint_t *sizep, const uint_t maxsize,
+ const uint_t elsize, const xdrproc_t elproc)
+{
+ uint_t i, size;
+ bool_t alloc = FALSE;
+ caddr_t addr;
+
+ if (!xdrmem_dec_uint(xdrs, sizep))
+ return (FALSE);
+
+ size = *sizep;
+
+ if (size > maxsize || size > UINT_MAX / elsize)
+ return (FALSE);
+
+ /*
+ * The Solaris man page says: "If *arrp is NULL when decoding,
+ * xdr_array() allocates memory and *arrp points to it".
+ */
+ if (*arrp == NULL) {
+ BUILD_BUG_ON(sizeof (uint_t) > sizeof (size_t));
+
+ *arrp = kmem_alloc(size * elsize, KM_NOSLEEP);
+ if (*arrp == NULL)
+ return (FALSE);
+
+ alloc = TRUE;
+ }
+
+ addr = *arrp;
+
+ for (i = 0; i < size; i++) {
+ if (!elproc(xdrs, addr)) {
+ if (alloc)
+ kmem_free(*arrp, size * elsize);
+ return (FALSE);
+ }
+ addr += elsize;
+ }
+
+ return (TRUE);
+}
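+
+/*
+ * Illustrative decode (hypothetical values): with *arrp preset to NULL
+ * the routine above allocates the array, which the caller must free:
+ *
+ *    unsigned *vals = NULL;
+ *    uint_t nvals = 0;
+ *
+ *    if (!xdrs->x_ops->xdr_array(xdrs, (caddr_t *)&vals, &nvals, 16,
+ *        sizeof (unsigned), (xdrproc_t)xdrs->x_ops->xdr_u_int))
+ *            return (EINVAL);
+ *    ...
+ *    kmem_free(vals, nvals * sizeof (unsigned));
+ */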
+
+static bool_t
+xdr_enc_string(XDR *xdrs, char **sp, const uint_t maxsize)
+{
+ size_t slen = strlen(*sp);
+ uint_t len;
+
+ if (slen > maxsize)
+ return (FALSE);
+
+ len = slen;
+
+ if (!xdrmem_enc_uint(xdrs, &len))
+ return (FALSE);
+
+ return (xdrmem_enc_bytes(xdrs, *sp, len));
+}
+
+static bool_t
+xdr_dec_string(XDR *xdrs, char **sp, const uint_t maxsize)
+{
+ uint_t size;
+ bool_t alloc = FALSE;
+
+ if (!xdrmem_dec_uint(xdrs, &size))
+ return (FALSE);
+
+ if (size > maxsize || size > UINT_MAX - 1)
+ return (FALSE);
+
+ /*
+ * Solaris man page: "If *sp is NULL when decoding, xdr_string()
+ * allocates memory and *sp points to it".
+ */
+ if (*sp == NULL) {
+ BUILD_BUG_ON(sizeof (uint_t) > sizeof (size_t));
+
+ *sp = kmem_alloc(size + 1, KM_NOSLEEP);
+ if (*sp == NULL)
+ return (FALSE);
+
+ alloc = TRUE;
+ }
+
+ if (!xdrmem_dec_bytes(xdrs, *sp, size))
+ goto fail;
+
+ if (memchr(*sp, 0, size) != NULL)
+ goto fail;
+
+ (*sp)[size] = '\0';
+
+ return (TRUE);
+
+fail:
+ if (alloc)
+ kmem_free(*sp, size + 1);
+
+ return (FALSE);
+}
+
+static struct xdr_ops xdrmem_encode_ops = {
+ .xdr_control = xdrmem_control,
+ .xdr_char = xdrmem_enc_char,
+ .xdr_u_short = xdrmem_enc_ushort,
+ .xdr_u_int = xdrmem_enc_uint,
+ .xdr_u_longlong_t = xdrmem_enc_ulonglong,
+ .xdr_opaque = xdrmem_enc_bytes,
+ .xdr_string = xdr_enc_string,
+ .xdr_array = xdr_enc_array
+};
+
+static struct xdr_ops xdrmem_decode_ops = {
+ .xdr_control = xdrmem_control,
+ .xdr_char = xdrmem_dec_char,
+ .xdr_u_short = xdrmem_dec_ushort,
+ .xdr_u_int = xdrmem_dec_uint,
+ .xdr_u_longlong_t = xdrmem_dec_ulonglong,
+ .xdr_opaque = xdrmem_dec_bytes,
+ .xdr_string = xdr_dec_string,
+ .xdr_array = xdr_dec_array
+};
diff --git a/module/spl/spl-zlib.c b/module/spl/spl-zlib.c
new file mode 100644
index 000000000..229e6a44b
--- /dev/null
+++ b/module/spl/spl-zlib.c
@@ -0,0 +1,217 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ *
+ *
+ * z_compress_level/z_uncompress are nearly identical copies of the
+ * compress2/uncompress functions provided by the official zlib package
+ * available at http://zlib.net/. The only changes made were to slightly
+ * adapt the functions called to match the Linux kernel implementation
+ * of zlib. The full zlib license follows:
+ *
+ * zlib.h -- interface of the 'zlib' general purpose compression library
+ * version 1.2.5, April 19th, 2010
+ *
+ * Copyright (C) 1995-2010 Jean-loup Gailly and Mark Adler
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ *
+ * Jean-loup Gailly
+ * Mark Adler
+ */
+
+#include <sys/kmem.h>
+#include <sys/kmem_cache.h>
+#include <sys/zmod.h>
+
+static spl_kmem_cache_t *zlib_workspace_cache;
+
+/*
+ * A kmem_cache is used for the zlib workspaces to avoid having to vmalloc
+ * and vfree for every call. Using a kmem_cache also has the advantage of
+ * improving the odds that the memory used will be local to this cpu.
+ * To further improve things it might be wise to create a dedicated per-cpu
+ * workspace for use. This would take some additional care because we then
+ * must disable preemption around the critical section, and verify that
+ * zlib_deflate* and zlib_inflate* never internally call schedule().
+ */
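+
+/*
+ * A minimal sketch of that per-cpu idea (not implemented here; names
+ * are hypothetical). Preemption must remain disabled for the whole
+ * compress/decompress call, which is only safe if the zlib routines
+ * never sleep:
+ *
+ *    static DEFINE_PER_CPU(void *, zlib_pcpu_workspace);
+ *
+ *    void *ws = get_cpu_var(zlib_pcpu_workspace);
+ *    ... zlib_deflate()/zlib_inflate() using 'ws' ...
+ *    put_cpu_var(zlib_pcpu_workspace);
+ */
+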
+static void *
+zlib_workspace_alloc(int flags)
+{
+ return (kmem_cache_alloc(zlib_workspace_cache, flags & ~(__GFP_FS)));
+}
+
+static void
+zlib_workspace_free(void *workspace)
+{
+ kmem_cache_free(zlib_workspace_cache, workspace);
+}
+
+/*
+ * Compresses the source buffer into the destination buffer. The level
+ * parameter has the same meaning as in deflateInit. sourceLen is the byte
+ * length of the source buffer. Upon entry, destLen is the total size of the
+ * destination buffer, which must be at least 0.1% larger than sourceLen plus
+ * 12 bytes. Upon exit, destLen is the actual size of the compressed buffer.
+ *
+ * z_compress_level() returns Z_OK on success, Z_MEM_ERROR if there was
+ * not enough memory, Z_BUF_ERROR if there was not enough room in the
+ * output buffer, or Z_STREAM_ERROR if the level parameter is invalid.
+ */
+int
+z_compress_level(void *dest, size_t *destLen, const void *source,
+ size_t sourceLen, int level)
+{
+ z_stream stream;
+ int err;
+
+ stream.next_in = (Byte *)source;
+ stream.avail_in = (uInt)sourceLen;
+ stream.next_out = dest;
+ stream.avail_out = (uInt)*destLen;
+
+ if ((size_t)stream.avail_out != *destLen)
+ return (Z_BUF_ERROR);
+
+ stream.workspace = zlib_workspace_alloc(KM_SLEEP);
+ if (!stream.workspace)
+ return (Z_MEM_ERROR);
+
+ err = zlib_deflateInit(&stream, level);
+ if (err != Z_OK) {
+ zlib_workspace_free(stream.workspace);
+ return (err);
+ }
+
+ err = zlib_deflate(&stream, Z_FINISH);
+ if (err != Z_STREAM_END) {
+ zlib_deflateEnd(&stream);
+ zlib_workspace_free(stream.workspace);
+ return (err == Z_OK ? Z_BUF_ERROR : err);
+ }
+ *destLen = stream.total_out;
+
+ err = zlib_deflateEnd(&stream);
+ zlib_workspace_free(stream.workspace);
+
+ return (err);
+}
+EXPORT_SYMBOL(z_compress_level);
+
+/*
+ * Decompresses the source buffer into the destination buffer. sourceLen is
+ * the byte length of the source buffer. Upon entry, destLen is the total
+ * size of the destination buffer, which must be large enough to hold the
+ * entire uncompressed data. (The size of the uncompressed data must have
+ * been saved previously by the compressor and transmitted to the decompressor
+ * by some mechanism outside the scope of this compression library.)
+ * Upon exit, destLen is the actual size of the uncompressed data.
+ * This function can be used to decompress a whole file at once if the
+ * input file is mmap'ed.
+ *
+ * z_uncompress() returns Z_OK on success, Z_MEM_ERROR if there was not
+ * enough memory, Z_BUF_ERROR if there was not enough room in the output
+ * buffer, or Z_DATA_ERROR if the input data was corrupted.
+ */
+int
+z_uncompress(void *dest, size_t *destLen, const void *source, size_t sourceLen)
+{
+ z_stream stream;
+ int err;
+
+ stream.next_in = (Byte *)source;
+ stream.avail_in = (uInt)sourceLen;
+ stream.next_out = dest;
+ stream.avail_out = (uInt)*destLen;
+
+ if ((size_t)stream.avail_out != *destLen)
+ return (Z_BUF_ERROR);
+
+ stream.workspace = zlib_workspace_alloc(KM_SLEEP);
+ if (!stream.workspace)
+ return (Z_MEM_ERROR);
+
+ err = zlib_inflateInit(&stream);
+ if (err != Z_OK) {
+ zlib_workspace_free(stream.workspace);
+ return (err);
+ }
+
+ err = zlib_inflate(&stream, Z_FINISH);
+ if (err != Z_STREAM_END) {
+ zlib_inflateEnd(&stream);
+ zlib_workspace_free(stream.workspace);
+
+ if (err == Z_NEED_DICT ||
+ (err == Z_BUF_ERROR && stream.avail_in == 0))
+ return (Z_DATA_ERROR);
+
+ return (err);
+ }
+ *destLen = stream.total_out;
+
+ err = zlib_inflateEnd(&stream);
+ zlib_workspace_free(stream.workspace);
+
+ return (err);
+}
+EXPORT_SYMBOL(z_uncompress);
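+
+/*
+ * Illustrative round trip (buffer names and sizes hypothetical):
+ * compress 'src' into 'dst' and expand it back into 'out':
+ *
+ *    size_t dlen = dst_size, olen = out_size;
+ *
+ *    if (z_compress_level(dst, &dlen, src, src_size,
+ *        Z_DEFAULT_COMPRESSION) != Z_OK)
+ *            return (EIO);
+ *    if (z_uncompress(out, &olen, dst, dlen) != Z_OK)
+ *            return (EIO);
+ *
+ * On success, dlen holds the compressed size and olen the expanded
+ * size, which must equal the original src_size.
+ */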
+
+int
+spl_zlib_init(void)
+{
+ int size;
+
+ size = MAX(spl_zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL),
+ zlib_inflate_workspacesize());
+
+ zlib_workspace_cache = kmem_cache_create(
+ "spl_zlib_workspace_cache",
+ size, 0, NULL, NULL, NULL, NULL, NULL,
+ KMC_VMEM | KMC_NOEMERGENCY);
+ if (!zlib_workspace_cache)
+ return (1);
+
+ return (0);
+}
+
+void
+spl_zlib_fini(void)
+{
+ kmem_cache_destroy(zlib_workspace_cache);
+ zlib_workspace_cache = NULL;
+}