98 files changed, 16031 insertions, 0 deletions
diff --git a/config/kernel-ctl-table-name.m4 b/config/kernel-ctl-table-name.m4 new file mode 100644 index 000000000..8dd2e77cb --- /dev/null +++ b/config/kernel-ctl-table-name.m4 @@ -0,0 +1,18 @@ +dnl # +dnl # 2.6.33 API change, +dnl # Removed .ctl_name from struct ctl_table. +dnl # +AC_DEFUN([SPL_AC_CTL_NAME], [ + AC_MSG_CHECKING([whether struct ctl_table has ctl_name]) + SPL_LINUX_TRY_COMPILE([ + #include <linux/sysctl.h> + ],[ + struct ctl_table ctl __attribute__ ((unused)); + ctl.ctl_name = 0; + ],[ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_CTL_NAME, 1, [struct ctl_table has ctl_name]) + ],[ + AC_MSG_RESULT(no) + ]) +]) diff --git a/config/kernel-fallocate-pax.m4 b/config/kernel-fallocate-pax.m4 new file mode 100644 index 000000000..ac75a4c8e --- /dev/null +++ b/config/kernel-fallocate-pax.m4 @@ -0,0 +1,19 @@ +dnl # +dnl # PaX Linux 2.6.38 - 3.x API +dnl # +AC_DEFUN([SPL_AC_PAX_KERNEL_FILE_FALLOCATE], [ + AC_MSG_CHECKING([whether fops->fallocate() exists]) + SPL_LINUX_TRY_COMPILE([ + #include <linux/fs.h> + ],[ + long (*fallocate) (struct file *, int, loff_t, loff_t) = NULL; + struct file_operations_no_const fops __attribute__ ((unused)) = { + .fallocate = fallocate, + }; + ],[ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_FILE_FALLOCATE, 1, [fops->fallocate() exists]) + ],[ + AC_MSG_RESULT(no) + ]) +]) diff --git a/config/kernel-group-info.m4 b/config/kernel-group-info.m4 new file mode 100644 index 000000000..4db2bba5c --- /dev/null +++ b/config/kernel-group-info.m4 @@ -0,0 +1,21 @@ +dnl # +dnl # 4.9 API change +dnl # group_info changed from 2d array via >blocks to 1d array via ->gid +dnl # +AC_DEFUN([SPL_AC_GROUP_INFO_GID], [ + AC_MSG_CHECKING([whether group_info->gid exists]) + tmp_flags="$EXTRA_KCFLAGS" + EXTRA_KCFLAGS="-Werror" + SPL_LINUX_TRY_COMPILE([ + #include <linux/cred.h> + ],[ + struct group_info *gi = groups_alloc(1); + gi->gid[0] = KGIDT_INIT(0); + ],[ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_GROUP_INFO_GID, 1, [group_info->gid exists]) + ],[ + AC_MSG_RESULT(no) + ]) + EXTRA_KCFLAGS="$tmp_flags" +]) diff --git a/config/kernel-inode-lock.m4 b/config/kernel-inode-lock.m4 new file mode 100644 index 000000000..2cc06a5ec --- /dev/null +++ b/config/kernel-inode-lock.m4 @@ -0,0 +1,23 @@ +dnl # +dnl # 4.7 API change +dnl # i_mutex is changed to i_rwsem. Instead of directly using +dnl # i_mutex/i_rwsem, we should use inode_lock() and inode_lock_shared() +dnl # We test inode_lock_shared because inode_lock is introduced earlier. +dnl # +AC_DEFUN([SPL_AC_INODE_LOCK], [ + AC_MSG_CHECKING([whether inode_lock_shared() exists]) + tmp_flags="$EXTRA_KCFLAGS" + EXTRA_KCFLAGS="-Werror" + SPL_LINUX_TRY_COMPILE([ + #include <linux/fs.h> + ],[ + struct inode *inode = NULL; + inode_lock_shared(inode); + ],[ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_INODE_LOCK_SHARED, 1, [yes]) + ],[ + AC_MSG_RESULT(no) + ]) + EXTRA_KCFLAGS="$tmp_flags" +]) diff --git a/config/kernel-kmem-cache.m4 b/config/kernel-kmem-cache.m4 new file mode 100644 index 000000000..50a7fdb4b --- /dev/null +++ b/config/kernel-kmem-cache.m4 @@ -0,0 +1,72 @@ +dnl # +dnl # 2.6.35 API change, +dnl # The cachep->gfpflags member was renamed cachep->allocflags. These are +dnl # private allocation flags which are applied when allocating a new slab +dnl # in kmem_getpages(). Unfortunately there is no public API for setting +dnl # non-default flags. 
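A minimal sketch, assuming the usual compat pattern, of how a consumer might use the HAVE_KMEM_CACHE_ALLOCFLAGS / HAVE_KMEM_CACHE_GFPFLAGS results defined by the probe that follows; the helper name is hypothetical and this is not code from this commit.

/*
 * Hypothetical helper: set a private slab allocation flag regardless of
 * which member name the running kernel uses (there is no public API).
 */
#include <linux/slab.h>

static inline void
spl_cache_set_allocflags(struct kmem_cache *skc, gfp_t flags)
{
#if defined(HAVE_KMEM_CACHE_ALLOCFLAGS)
	skc->allocflags |= flags;	/* member name on 2.6.35 and newer */
#elif defined(HAVE_KMEM_CACHE_GFPFLAGS)
	skc->gfpflags |= flags;		/* member name on older kernels */
#endif
}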
+dnl # +AC_DEFUN([SPL_AC_KMEM_CACHE_ALLOCFLAGS], [ + AC_MSG_CHECKING([whether struct kmem_cache has allocflags]) + SPL_LINUX_TRY_COMPILE([ + #include <linux/slab.h> + ],[ + struct kmem_cache cachep __attribute__ ((unused)); + cachep.allocflags = GFP_KERNEL; + ],[ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_KMEM_CACHE_ALLOCFLAGS, 1, + [struct kmem_cache has allocflags]) + ],[ + AC_MSG_RESULT(no) + + AC_MSG_CHECKING([whether struct kmem_cache has gfpflags]) + SPL_LINUX_TRY_COMPILE([ + #include <linux/slab.h> + ],[ + struct kmem_cache cachep __attribute__ ((unused)); + cachep.gfpflags = GFP_KERNEL; + ],[ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_KMEM_CACHE_GFPFLAGS, 1, + [struct kmem_cache has gfpflags]) + ],[ + AC_MSG_RESULT(no) + ]) + ]) +]) + +dnl # +dnl # grsecurity API change, +dnl # kmem_cache_create() with SLAB_USERCOPY flag replaced by +dnl # kmem_cache_create_usercopy(). +dnl # +AC_DEFUN([SPL_AC_KMEM_CACHE_CREATE_USERCOPY], [ + AC_MSG_CHECKING([whether kmem_cache_create_usercopy() exists]) + tmp_flags="$EXTRA_KCFLAGS" + EXTRA_KCFLAGS="-Werror" + SPL_LINUX_TRY_COMPILE([ + #include <linux/slab.h> + static void ctor(void *foo) + { + // fake ctor + } + ],[ + struct kmem_cache *skc_linux_cache; + const char *name = "test"; + size_t size = 4096; + size_t align = 8; + unsigned long flags = 0; + size_t useroffset = 0; + size_t usersize = size - useroffset; + + skc_linux_cache = kmem_cache_create_usercopy( + name, size, align, flags, useroffset, usersize, ctor); + ],[ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_KMEM_CACHE_CREATE_USERCOPY, 1, + [kmem_cache_create_usercopy() exists]) + ],[ + AC_MSG_RESULT(no) + ]) + EXTRA_KCFLAGS="$tmp_flags" +]) diff --git a/config/kernel-kmem.m4 b/config/kernel-kmem.m4 new file mode 100644 index 000000000..cc055e530 --- /dev/null +++ b/config/kernel-kmem.m4 @@ -0,0 +1,58 @@ +dnl # +dnl # Enabled by default it provides a minimal level of memory tracking. +dnl # A total count of bytes allocated is kept for each alloc and free. +dnl # Then at module unload time a report to the console will be printed +dnl # if memory was leaked. +dnl # +AC_DEFUN([SPL_AC_DEBUG_KMEM], [ + AC_ARG_ENABLE([debug-kmem], + [AS_HELP_STRING([--enable-debug-kmem], + [Enable basic kmem accounting @<:@default=no@:>@])], + [], + [enable_debug_kmem=no]) + + AS_IF([test "x$enable_debug_kmem" = xyes], + [ + KERNELCPPFLAGS="${KERNELCPPFLAGS} -DDEBUG_KMEM" + DEBUG_KMEM="_with_debug_kmem" + AC_DEFINE([DEBUG_KMEM], [1], + [Define to 1 to enable basic kmem accounting]) + ], [ + DEBUG_KMEM="_without_debug_kmem" + ]) + + AC_SUBST(DEBUG_KMEM) + AC_MSG_CHECKING([whether basic kmem accounting is enabled]) + AC_MSG_RESULT([$enable_debug_kmem]) +]) + +dnl # +dnl # Disabled by default it provides detailed memory tracking. This +dnl # feature also requires --enable-debug-kmem to be set. When enabled +dnl # not only will total bytes be tracked but also the location of every +dnl # alloc and free. When the SPL module is unloaded a list of all leaked +dnl # addresses and where they were allocated will be dumped to the console. +dnl # Enabling this feature has a significant impact on performance but it +dnl # makes finding memory leaks pretty straight forward. 
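To make the description above concrete, here is a minimal hypothetical sketch, not the SPL implementation, of the kind of per-allocation bookkeeping detailed kmem tracking implies: every allocation is recorded with its size and origin, and whatever remains on the list at module unload is reported as a leak.

/* Hypothetical tracking record; all names are illustrative only. */
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct kmem_debug_rec {
	struct list_head kd_node;	/* linked into a global list */
	void *kd_addr;			/* address returned to the caller */
	size_t kd_size;			/* bytes allocated */
	const char *kd_func;		/* allocation site */
	int kd_line;
};

static LIST_HEAD(kmem_debug_list);
static DEFINE_SPINLOCK(kmem_debug_lock);

static void
kmem_debug_track(void *addr, size_t size, const char *func, int line)
{
	struct kmem_debug_rec *rec = kmalloc(sizeof (*rec), GFP_KERNEL);

	if (rec == NULL)
		return;

	rec->kd_addr = addr;
	rec->kd_size = size;
	rec->kd_func = func;
	rec->kd_line = line;

	spin_lock(&kmem_debug_lock);
	list_add(&rec->kd_node, &kmem_debug_list);
	spin_unlock(&kmem_debug_lock);
}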
+dnl # +AC_DEFUN([SPL_AC_DEBUG_KMEM_TRACKING], [ + AC_ARG_ENABLE([debug-kmem-tracking], + [AS_HELP_STRING([--enable-debug-kmem-tracking], + [Enable detailed kmem tracking @<:@default=no@:>@])], + [], + [enable_debug_kmem_tracking=no]) + + AS_IF([test "x$enable_debug_kmem_tracking" = xyes], + [ + KERNELCPPFLAGS="${KERNELCPPFLAGS} -DDEBUG_KMEM_TRACKING" + DEBUG_KMEM_TRACKING="_with_debug_kmem_tracking" + AC_DEFINE([DEBUG_KMEM_TRACKING], [1], + [Define to 1 to enable detailed kmem tracking]) + ], [ + DEBUG_KMEM_TRACKING="_without_debug_kmem_tracking" + ]) + + AC_SUBST(DEBUG_KMEM_TRACKING) + AC_MSG_CHECKING([whether detailed kmem tracking is enabled]) + AC_MSG_RESULT([$enable_debug_kmem_tracking]) +]) diff --git a/config/kernel-kuidgid.m4 b/config/kernel-kuidgid.m4 new file mode 100644 index 000000000..47d193783 --- /dev/null +++ b/config/kernel-kuidgid.m4 @@ -0,0 +1,28 @@ +dnl # +dnl # User namespaces, use kuid_t in place of uid_t +dnl # where available. Not strictly a user namespaces thing +dnl # but it should prevent surprises +dnl # +AC_DEFUN([SPL_AC_KUIDGID_T], [ + AC_MSG_CHECKING([whether kuid_t/kgid_t is available]) + SPL_LINUX_TRY_COMPILE([ + #include <linux/uidgid.h> + ], [ + kuid_t userid = KUIDT_INIT(0); + kgid_t groupid = KGIDT_INIT(0); + ],[ + SPL_LINUX_TRY_COMPILE([ + #include <linux/uidgid.h> + ], [ + kuid_t userid = 0; + kgid_t groupid = 0; + ],[ + AC_MSG_RESULT(yes; optional) + ],[ + AC_MSG_RESULT(yes; mandatory) + AC_DEFINE(HAVE_KUIDGID_T, 1, [kuid_t/kgid_t in use]) + ]) + ],[ + AC_MSG_RESULT(no) + ]) +]) diff --git a/config/kernel-pde-data.m4 b/config/kernel-pde-data.m4 new file mode 100644 index 000000000..6aa5765c3 --- /dev/null +++ b/config/kernel-pde-data.m4 @@ -0,0 +1,17 @@ +dnl # +dnl # 3.10 API change, +dnl # PDE is replaced by PDE_DATA +dnl # +AC_DEFUN([SPL_AC_PDE_DATA], [ + AC_MSG_CHECKING([whether PDE_DATA() is available]) + SPL_LINUX_TRY_COMPILE_SYMBOL([ + #include <linux/proc_fs.h> + ], [ + PDE_DATA(NULL); + ], [PDE_DATA], [], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_PDE_DATA, 1, [yes]) + ],[ + AC_MSG_RESULT(no) + ]) +]) diff --git a/config/kernel-rw.m4 b/config/kernel-rw.m4 new file mode 100644 index 000000000..23c14b70f --- /dev/null +++ b/config/kernel-rw.m4 @@ -0,0 +1,57 @@ +dnl # +dnl # 4.14 API change +dnl # kernel_write() which was introduced in 3.9 was updated to take +dnl # the offset as a pointer which is needed by vn_rdwr(). +dnl # +AC_DEFUN([SPL_AC_KERNEL_WRITE], [ + AC_MSG_CHECKING([whether kernel_write() takes loff_t pointer]) + tmp_flags="$EXTRA_KCFLAGS" + EXTRA_KCFLAGS="-Werror" + SPL_LINUX_TRY_COMPILE([ + #include <linux/fs.h> + ],[ + struct file *file = NULL; + const void *buf = NULL; + size_t count = 0; + loff_t *pos = NULL; + ssize_t ret; + + ret = kernel_write(file, buf, count, pos); + ],[ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_KERNEL_WRITE_PPOS, 1, + [kernel_write() take loff_t pointer]) + ],[ + AC_MSG_RESULT(no) + ]) + EXTRA_KCFLAGS="$tmp_flags" +]) + +dnl # +dnl # 4.14 API change +dnl # kernel_read() which has existed for forever was updated to take +dnl # the offset as a pointer which is needed by vn_rdwr(). 
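As a hedged illustration of why vn_rdwr() cares, assuming the usual compat-wrapper pattern rather than quoting this commit, the 4.14 change can be hidden behind a small shim: newer kernels take the offset by pointer and advance it, older kernels take it by value.

#include <linux/fs.h>

/* Hypothetical wrapper; HAVE_KERNEL_READ_PPOS comes from the probe below. */
static ssize_t
spl_kernel_read(struct file *fp, void *buf, size_t count, loff_t *pos)
{
#if defined(HAVE_KERNEL_READ_PPOS)
	return (kernel_read(fp, buf, count, pos));	/* 4.14 and newer */
#else
	ssize_t rc;

	rc = kernel_read(fp, *pos, buf, count);		/* older: offset by value */
	if (rc > 0)
		*pos += rc;
	return (rc);
#endif
}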
+dnl # +AC_DEFUN([SPL_AC_KERNEL_READ], [ + AC_MSG_CHECKING([whether kernel_read() takes loff_t pointer]) + tmp_flags="$EXTRA_KCFLAGS" + EXTRA_KCFLAGS="-Werror" + SPL_LINUX_TRY_COMPILE([ + #include <linux/fs.h> + ],[ + struct file *file = NULL; + void *buf = NULL; + size_t count = 0; + loff_t *pos = NULL; + ssize_t ret; + + ret = kernel_read(file, buf, count, pos); + ],[ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_KERNEL_READ_PPOS, 1, + [kernel_read() take loff_t pointer]) + ],[ + AC_MSG_RESULT(no) + ]) + EXTRA_KCFLAGS="$tmp_flags" +]) diff --git a/config/kernel-rwsem.m4 b/config/kernel-rwsem.m4 new file mode 100644 index 000000000..aee20ae90 --- /dev/null +++ b/config/kernel-rwsem.m4 @@ -0,0 +1,75 @@ +dnl # +dnl # 3.1 API Change +dnl # +dnl # The rw_semaphore.wait_lock member was changed from spinlock_t to +dnl # raw_spinlock_t at commit ddb6c9b58a19edcfac93ac670b066c836ff729f1. +dnl # +AC_DEFUN([SPL_AC_RWSEM_SPINLOCK_IS_RAW], [ + AC_MSG_CHECKING([whether struct rw_semaphore member wait_lock is raw]) + tmp_flags="$EXTRA_KCFLAGS" + EXTRA_KCFLAGS="-Werror" + SPL_LINUX_TRY_COMPILE([ + #include <linux/rwsem.h> + ],[ + struct rw_semaphore dummy_semaphore __attribute__ ((unused)); + raw_spinlock_t dummy_lock __attribute__ ((unused)) = + __RAW_SPIN_LOCK_INITIALIZER(dummy_lock); + dummy_semaphore.wait_lock = dummy_lock; + ],[ + AC_MSG_RESULT(yes) + AC_DEFINE(RWSEM_SPINLOCK_IS_RAW, 1, + [struct rw_semaphore member wait_lock is raw_spinlock_t]) + ],[ + AC_MSG_RESULT(no) + ]) + EXTRA_KCFLAGS="$tmp_flags" +]) + +dnl # +dnl # 3.16 API Change +dnl # +dnl # rwsem-spinlock "->activity" changed to "->count" +dnl # +AC_DEFUN([SPL_AC_RWSEM_ACTIVITY], [ + AC_MSG_CHECKING([whether struct rw_semaphore has member activity]) + tmp_flags="$EXTRA_KCFLAGS" + EXTRA_KCFLAGS="-Werror" + SPL_LINUX_TRY_COMPILE([ + #include <linux/rwsem.h> + ],[ + struct rw_semaphore dummy_semaphore __attribute__ ((unused)); + dummy_semaphore.activity = 0; + ],[ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_RWSEM_ACTIVITY, 1, + [struct rw_semaphore has member activity]) + ],[ + AC_MSG_RESULT(no) + ]) + EXTRA_KCFLAGS="$tmp_flags" +]) + +dnl # +dnl # 4.8 API Change +dnl # +dnl # rwsem "->count" changed to atomic_long_t type +dnl # +AC_DEFUN([SPL_AC_RWSEM_ATOMIC_LONG_COUNT], [ + AC_MSG_CHECKING( + [whether struct rw_semaphore has atomic_long_t member count]) + tmp_flags="$EXTRA_KCFLAGS" + EXTRA_KCFLAGS="-Werror" + SPL_LINUX_TRY_COMPILE([ + #include <linux/rwsem.h> + ],[ + DECLARE_RWSEM(dummy_semaphore); + (void) atomic_long_read(&dummy_semaphore.count); + ],[ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_RWSEM_ATOMIC_LONG_COUNT, 1, + [struct rw_semaphore has atomic_long_t member count]) + ],[ + AC_MSG_RESULT(no) + ]) + EXTRA_KCFLAGS="$tmp_flags" +]) diff --git a/config/kernel-sched.m4 b/config/kernel-sched.m4 new file mode 100644 index 000000000..5ae21676e --- /dev/null +++ b/config/kernel-sched.m4 @@ -0,0 +1,56 @@ +dnl # +dnl # 3.9 API change, +dnl # Moved things from linux/sched.h to linux/sched/rt.h +dnl # +AC_DEFUN([SPL_AC_SCHED_RT_HEADER], + [AC_MSG_CHECKING([whether header linux/sched/rt.h exists]) + SPL_LINUX_TRY_COMPILE([ + #include <linux/sched.h> + #include <linux/sched/rt.h> + ],[ + return 0; + ],[ + AC_DEFINE(HAVE_SCHED_RT_HEADER, 1, [linux/sched/rt.h exists]) + AC_MSG_RESULT(yes) + ],[ + AC_MSG_RESULT(no) + ]) +]) + +dnl # +dnl # 4.11 API change, +dnl # Moved things from linux/sched.h to linux/sched/signal.h +dnl # +AC_DEFUN([SPL_AC_SCHED_SIGNAL_HEADER], + [AC_MSG_CHECKING([whether header linux/sched/signal.h exists]) + SPL_LINUX_TRY_COMPILE([ + 
#include <linux/sched.h> + #include <linux/sched/signal.h> + ],[ + return 0; + ],[ + AC_DEFINE(HAVE_SCHED_SIGNAL_HEADER, 1, [linux/sched/signal.h exists]) + AC_MSG_RESULT(yes) + ],[ + AC_MSG_RESULT(no) + ]) +]) +dnl # +dnl # 3.19 API change +dnl # The io_schedule_timeout() function is present in all 2.6.32 kernels +dnl # but it was not exported until Linux 3.19. The RHEL 7.x kernels which +dnl # are based on a 3.10 kernel do export this symbol. +dnl # +AC_DEFUN([SPL_AC_IO_SCHEDULE_TIMEOUT], [ + AC_MSG_CHECKING([whether io_schedule_timeout() is available]) + SPL_LINUX_TRY_COMPILE_SYMBOL([ + #include <linux/sched.h> + ], [ + (void) io_schedule_timeout(1); + ], [io_schedule_timeout], [], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_IO_SCHEDULE_TIMEOUT, 1, [yes]) + ],[ + AC_MSG_RESULT(no) + ]) +]) diff --git a/config/kernel-set-fs-pwd.m4 b/config/kernel-set-fs-pwd.m4 new file mode 100644 index 000000000..849e7e6cb --- /dev/null +++ b/config/kernel-set-fs-pwd.m4 @@ -0,0 +1,39 @@ +dnl # +dnl # 3.9 API change +dnl # set_fs_pwd takes const struct path * +dnl # +AC_DEFUN([SPL_AC_SET_FS_PWD_WITH_CONST], + tmp_flags="$EXTRA_KCFLAGS" + EXTRA_KCFLAGS="-Werror" + [AC_MSG_CHECKING([whether set_fs_pwd() requires const struct path *]) + SPL_LINUX_TRY_COMPILE([ + #include <linux/spinlock.h> + #include <linux/fs_struct.h> + #include <linux/path.h> + void (*const set_fs_pwd_func) + (struct fs_struct *, const struct path *) + = set_fs_pwd; + ],[ + return 0; + ],[ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_SET_FS_PWD_WITH_CONST, 1, + [set_fs_pwd() needs const path *]) + ],[ + SPL_LINUX_TRY_COMPILE([ + #include <linux/spinlock.h> + #include <linux/fs_struct.h> + #include <linux/path.h> + void (*const set_fs_pwd_func) + (struct fs_struct *, struct path *) + = set_fs_pwd; + ],[ + return 0; + ],[ + AC_MSG_RESULT(no) + ],[ + AC_MSG_ERROR(unknown) + ]) + ]) + EXTRA_KCFLAGS="$tmp_flags" +]) diff --git a/config/kernel-shrinker.m4 b/config/kernel-shrinker.m4 new file mode 100644 index 000000000..6fc9b5422 --- /dev/null +++ b/config/kernel-shrinker.m4 @@ -0,0 +1,125 @@ +AC_DEFUN([SPL_AC_SHRINKER_CALLBACK],[ + tmp_flags="$EXTRA_KCFLAGS" + EXTRA_KCFLAGS="-Werror" + dnl # + dnl # 2.6.23 to 2.6.34 API change + dnl # ->shrink(int nr_to_scan, gfp_t gfp_mask) + dnl # + AC_MSG_CHECKING([whether old 2-argument shrinker exists]) + SPL_LINUX_TRY_COMPILE([ + #include <linux/mm.h> + + int shrinker_cb(int nr_to_scan, gfp_t gfp_mask); + ],[ + struct shrinker cache_shrinker = { + .shrink = shrinker_cb, + .seeks = DEFAULT_SEEKS, + }; + register_shrinker(&cache_shrinker); + ],[ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_2ARGS_OLD_SHRINKER_CALLBACK, 1, + [old shrinker callback wants 2 args]) + ],[ + AC_MSG_RESULT(no) + dnl # + dnl # 2.6.35 - 2.6.39 API change + dnl # ->shrink(struct shrinker *, + dnl # int nr_to_scan, gfp_t gfp_mask) + dnl # + AC_MSG_CHECKING([whether old 3-argument shrinker exists]) + SPL_LINUX_TRY_COMPILE([ + #include <linux/mm.h> + + int shrinker_cb(struct shrinker *, int nr_to_scan, + gfp_t gfp_mask); + ],[ + struct shrinker cache_shrinker = { + .shrink = shrinker_cb, + .seeks = DEFAULT_SEEKS, + }; + register_shrinker(&cache_shrinker); + ],[ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_3ARGS_SHRINKER_CALLBACK, 1, + [old shrinker callback wants 3 args]) + ],[ + AC_MSG_RESULT(no) + dnl # + dnl # 3.0 - 3.11 API change + dnl # ->shrink(struct shrinker *, + dnl # struct shrink_control *sc) + dnl # + AC_MSG_CHECKING( + [whether new 2-argument shrinker exists]) + SPL_LINUX_TRY_COMPILE([ + #include <linux/mm.h> + + int shrinker_cb(struct shrinker 
*, + struct shrink_control *sc); + ],[ + struct shrinker cache_shrinker = { + .shrink = shrinker_cb, + .seeks = DEFAULT_SEEKS, + }; + register_shrinker(&cache_shrinker); + ],[ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_2ARGS_NEW_SHRINKER_CALLBACK, 1, + [new shrinker callback wants 2 args]) + ],[ + AC_MSG_RESULT(no) + dnl # + dnl # 3.12 API change, + dnl # ->shrink() is logically split in to + dnl # ->count_objects() and ->scan_objects() + dnl # + AC_MSG_CHECKING( + [whether ->count_objects callback exists]) + SPL_LINUX_TRY_COMPILE([ + #include <linux/mm.h> + + unsigned long shrinker_cb( + struct shrinker *, + struct shrink_control *sc); + ],[ + struct shrinker cache_shrinker = { + .count_objects = shrinker_cb, + .scan_objects = shrinker_cb, + .seeks = DEFAULT_SEEKS, + }; + register_shrinker(&cache_shrinker); + ],[ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_SPLIT_SHRINKER_CALLBACK, + 1, [->count_objects exists]) + ],[ + AC_MSG_ERROR(error) + ]) + ]) + ]) + ]) + EXTRA_KCFLAGS="$tmp_flags" +]) + +dnl # +dnl # 2.6.39 API change, +dnl # Shrinker adjust to use common shrink_control structure. +dnl # +AC_DEFUN([SPL_AC_SHRINK_CONTROL_STRUCT], [ + AC_MSG_CHECKING([whether struct shrink_control exists]) + SPL_LINUX_TRY_COMPILE([ + #include <linux/mm.h> + ],[ + struct shrink_control sc __attribute__ ((unused)); + + sc.nr_to_scan = 0; + sc.gfp_mask = GFP_KERNEL; + ],[ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_SHRINK_CONTROL_STRUCT, 1, + [struct shrink_control exists]) + ],[ + AC_MSG_RESULT(no) + ]) +]) diff --git a/config/kernel-spinlock.m4 b/config/kernel-spinlock.m4 new file mode 100644 index 000000000..136262d0e --- /dev/null +++ b/config/kernel-spinlock.m4 @@ -0,0 +1,24 @@ +dnl # +dnl # 2.6.36 API change, +dnl # The 'struct fs_struct->lock' was changed from a rwlock_t to +dnl # a spinlock_t to improve the fastpath performance. +dnl # +AC_DEFUN([SPL_AC_FS_STRUCT_SPINLOCK], [ + AC_MSG_CHECKING([whether struct fs_struct uses spinlock_t]) + tmp_flags="$EXTRA_KCFLAGS" + EXTRA_KCFLAGS="-Werror" + SPL_LINUX_TRY_COMPILE([ + #include <linux/sched.h> + #include <linux/fs_struct.h> + ],[ + static struct fs_struct fs; + spin_lock_init(&fs.lock); + ],[ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_FS_STRUCT_SPINLOCK, 1, + [struct fs_struct uses spinlock_t]) + ],[ + AC_MSG_RESULT(no) + ]) + EXTRA_KCFLAGS="$tmp_flags" +]) diff --git a/config/kernel-timer.m4 b/config/kernel-timer.m4 new file mode 100644 index 000000000..93b5158b9 --- /dev/null +++ b/config/kernel-timer.m4 @@ -0,0 +1,32 @@ +dnl # +dnl # 4.15 API change +dnl # https://lkml.org/lkml/2017/11/25/90 +dnl # Check if timer_list.func get passed a timer_list or an unsigned long +dnl # (older kernels). Also sanity check the from_timer() and timer_setup() +dnl # macros are available as well, since they will be used in the same newer +dnl # kernels that support the new timer_list.func signature. 
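A short sketch, assuming standard kernel idioms rather than code from this commit (struct my_task is hypothetical), of the two callback conventions the probe below distinguishes: on 4.15+ the callback receives the timer_list itself and recovers its container with from_timer(); older kernels pass an opaque unsigned long.

#include <linux/timer.h>

struct my_task {
	struct timer_list mt_timer;	/* embedded timer */
	int mt_id;
};

#if defined(HAVE_KERNEL_TIMER_FUNCTION_TIMER_LIST)
static void
my_task_expire(struct timer_list *tl)
{
	struct my_task *t = from_timer(t, tl, mt_timer);

	(void) t->mt_id;		/* act on the containing object */
}
#else
static void
my_task_expire(unsigned long data)
{
	struct my_task *t = (struct my_task *)data;

	(void) t->mt_id;
}
#endif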
+dnl # +AC_DEFUN([SPL_AC_KERNEL_TIMER_FUNCTION_TIMER_LIST], [ + AC_MSG_CHECKING([whether timer_list.function gets a timer_list]) + tmp_flags="$EXTRA_KCFLAGS" + EXTRA_KCFLAGS="-Werror" + SPL_LINUX_TRY_COMPILE([ + #include <linux/timer.h> + void task_expire(struct timer_list *tl) {} + ],[ + #ifndef from_timer + #error "No from_timer() macro" + #endif + + struct timer_list timer; + timer.function = task_expire; + timer_setup(&timer, NULL, 0); + ],[ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_KERNEL_TIMER_FUNCTION_TIMER_LIST, 1, + [timer_list.function gets a timer_list]) + ],[ + AC_MSG_RESULT(no) + ]) + EXTRA_KCFLAGS="$tmp_flags" +]) diff --git a/config/kernel-trim-unused-symbols.m4 b/config/kernel-trim-unused-symbols.m4 new file mode 100644 index 000000000..d1ac2f3c8 --- /dev/null +++ b/config/kernel-trim-unused-symbols.m4 @@ -0,0 +1,19 @@ +dnl # +dnl # config trim unused symbols, +dnl # Verify the kernel has CONFIG_TRIM_UNUSED_KSYMS DISABLED. +dnl # +AC_DEFUN([SPL_AC_CONFIG_TRIM_UNUSED_KSYMS], [ + AC_MSG_CHECKING([whether CONFIG_TRIM_UNUSED_KSYM is disabled]) + SPL_LINUX_TRY_COMPILE([ + #if defined(CONFIG_TRIM_UNUSED_KSYMS) + #error CONFIG_TRIM_UNUSED_KSYMS not defined + #endif + ],[ ],[ + AC_MSG_RESULT([yes]) + ],[ + AC_MSG_RESULT([no]) + AC_MSG_ERROR([ + *** This kernel has unused symbols trimming enabled, please disable. + *** Rebuild the kernel with CONFIG_TRIM_UNUSED_KSYMS=n set.]) + ]) +]) diff --git a/config/kernel-urange-sleep.m4 b/config/kernel-urange-sleep.m4 new file mode 100644 index 000000000..85beca6dd --- /dev/null +++ b/config/kernel-urange-sleep.m4 @@ -0,0 +1,21 @@ +dnl # +dnl # 2.6.36 API compatibility. +dnl # Added usleep_range timer. +dnl # usleep_range is a finer precision implementation of msleep +dnl # designed to be a drop-in replacement for udelay where a precise +dnl # sleep / busy-wait is unnecessary. +dnl # +AC_DEFUN([SPL_AC_USLEEP_RANGE], [ + AC_MSG_CHECKING([whether usleep_range() is available]) + SPL_LINUX_TRY_COMPILE([ + #include <linux/delay.h> + ],[ + usleep_range(0, 0); + ],[ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_USLEEP_RANGE, 1, + [usleep_range is available]) + ],[ + AC_MSG_RESULT(no) + ]) +]) diff --git a/config/kernel-vfs-fsync.m4 b/config/kernel-vfs-fsync.m4 new file mode 100644 index 000000000..3c42bf1a0 --- /dev/null +++ b/config/kernel-vfs-fsync.m4 @@ -0,0 +1,17 @@ +dnl # +dnl # 2.6.35 API change, +dnl # Unused 'struct dentry *' removed from vfs_fsync() prototype. 
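A hedged compat sketch (the wrapper name is hypothetical, not from this commit) of the two prototypes the probe below tells apart:

#include <linux/fs.h>

static int
spl_fsync(struct file *fp, int datasync)
{
#if defined(HAVE_2ARGS_VFS_FSYNC)
	return (vfs_fsync(fp, datasync));			/* 2.6.35 and newer */
#else
	return (vfs_fsync(fp, fp->f_dentry, datasync));		/* older 3-argument form */
#endif
}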
+dnl # +AC_DEFUN([SPL_AC_2ARGS_VFS_FSYNC], [ + AC_MSG_CHECKING([whether vfs_fsync() wants 2 args]) + SPL_LINUX_TRY_COMPILE([ + #include <linux/fs.h> + ],[ + vfs_fsync(NULL, 0); + ],[ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_2ARGS_VFS_FSYNC, 1, [vfs_fsync() wants 2 args]) + ],[ + AC_MSG_RESULT(no) + ]) +]) diff --git a/config/kernel-vfs-getattr.m4 b/config/kernel-vfs-getattr.m4 new file mode 100644 index 000000000..7772cb514 --- /dev/null +++ b/config/kernel-vfs-getattr.m4 @@ -0,0 +1,62 @@ +dnl # +dnl # 4.11 API, a528d35e@torvalds/linux +dnl # vfs_getattr(const struct path *p, struct kstat *s, u32 m, unsigned int f) +dnl # +AC_DEFUN([SPL_AC_4ARGS_VFS_GETATTR], [ + AC_MSG_CHECKING([whether vfs_getattr() wants 4 args]) + SPL_LINUX_TRY_COMPILE([ + #include <linux/fs.h> + ],[ + vfs_getattr((const struct path *)NULL, + (struct kstat *)NULL, + (u32)0, + (unsigned int)0); + ],[ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_4ARGS_VFS_GETATTR, 1, + [vfs_getattr wants 4 args]) + ],[ + AC_MSG_RESULT(no) + ]) +]) + +dnl # +dnl # 3.9 API +dnl # vfs_getattr(struct path *p, struct kstat *s) +dnl # +AC_DEFUN([SPL_AC_2ARGS_VFS_GETATTR], [ + AC_MSG_CHECKING([whether vfs_getattr() wants 2 args]) + SPL_LINUX_TRY_COMPILE([ + #include <linux/fs.h> + ],[ + vfs_getattr((struct path *) NULL, + (struct kstat *)NULL); + ],[ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_2ARGS_VFS_GETATTR, 1, + [vfs_getattr wants 2 args]) + ],[ + AC_MSG_RESULT(no) + ]) +]) + +dnl # +dnl # <3.9 API +dnl # vfs_getattr(struct vfsmount *v, struct dentry *d, struct kstat *k) +dnl # +AC_DEFUN([SPL_AC_3ARGS_VFS_GETATTR], [ + AC_MSG_CHECKING([whether vfs_getattr() wants 3 args]) + SPL_LINUX_TRY_COMPILE([ + #include <linux/fs.h> + ],[ + vfs_getattr((struct vfsmount *)NULL, + (struct dentry *)NULL, + (struct kstat *)NULL); + ],[ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_3ARGS_VFS_GETATTR, 1, + [vfs_getattr wants 3 args]) + ],[ + AC_MSG_RESULT(no) + ]) +]) diff --git a/config/kernel-wait.m4 b/config/kernel-wait.m4 new file mode 100644 index 000000000..5f718a160 --- /dev/null +++ b/config/kernel-wait.m4 @@ -0,0 +1,76 @@ +dnl # +dnl # 3.17 API change, +dnl # wait_on_bit() no longer requires an action argument. The former +dnl # "wait_on_bit" interface required an 'action' function to be provided +dnl # which does the actual waiting. There were over 20 such functions in the +dnl # kernel, many of them identical, though most cases can be satisfied by one +dnl # of just two functions: one which uses io_schedule() and one which just +dnl # uses schedule(). This API change was made to consolidate all of those +dnl # redundant wait functions. +dnl # +AC_DEFUN([SPL_AC_WAIT_ON_BIT], [ + AC_MSG_CHECKING([whether wait_on_bit() takes an action]) + SPL_LINUX_TRY_COMPILE([ + #include <linux/wait.h> + ],[ + int (*action)(void *) = NULL; + wait_on_bit(NULL, 0, action, 0); + ],[ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_WAIT_ON_BIT_ACTION, 1, [yes]) + ],[ + AC_MSG_RESULT(no) + ]) +]) +dnl # +dnl # 4.13 API change +dnl # Renamed struct wait_queue -> struct wait_queue_entry. 
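The practical consequence, sketched under the assumption of the usual typedef shim (the spl_ names are illustrative; the same pattern appears verbatim inside the SPL_AC_WAIT_QUEUE_HEAD_ENTRY probe further down):

#include <linux/wait.h>

#if defined(HAVE_WAIT_QUEUE_ENTRY_T)
typedef wait_queue_head_t	spl_wait_queue_head_t;	/* 4.13 and newer */
typedef wait_queue_entry_t	spl_wait_queue_entry_t;
#else
typedef wait_queue_head_t	spl_wait_queue_head_t;
typedef wait_queue_t		spl_wait_queue_entry_t;	/* pre-4.13 name */
#endif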
+dnl # +AC_DEFUN([SPL_AC_WAIT_QUEUE_ENTRY_T], [ + AC_MSG_CHECKING([whether wait_queue_entry_t exists]) + SPL_LINUX_TRY_COMPILE([ + #include <linux/wait.h> + ],[ + wait_queue_entry_t *entry __attribute__ ((unused)); + ],[ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_WAIT_QUEUE_ENTRY_T, 1, + [wait_queue_entry_t exists]) + ],[ + AC_MSG_RESULT(no) + ]) +]) + +dnl # +dnl # 4.13 API change +dnl # Renamed wait_queue_head::task_list -> wait_queue_head::head +dnl # Renamed wait_queue_entry::task_list -> wait_queue_entry::entry +dnl # +AC_DEFUN([SPL_AC_WAIT_QUEUE_HEAD_ENTRY], [ + AC_MSG_CHECKING([whether wq_head->head and wq_entry->entry exist]) + SPL_LINUX_TRY_COMPILE([ + #include <linux/wait.h> + + #ifdef HAVE_WAIT_QUEUE_ENTRY_T + typedef wait_queue_head_t spl_wait_queue_head_t; + typedef wait_queue_entry_t spl_wait_queue_entry_t; + #else + typedef wait_queue_head_t spl_wait_queue_head_t; + typedef wait_queue_t spl_wait_queue_entry_t; + #endif + ],[ + spl_wait_queue_head_t wq_head; + spl_wait_queue_entry_t wq_entry; + struct list_head *head __attribute__ ((unused)); + struct list_head *entry __attribute__ ((unused)); + + head = &wq_head.head; + entry = &wq_entry.entry; + ],[ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_WAIT_QUEUE_HEAD_ENTRY, 1, + [wq_head->head and wq_entry->entry exist]) + ],[ + AC_MSG_RESULT(no) + ]) +]) diff --git a/config/kernel-zlib.m4 b/config/kernel-zlib.m4 new file mode 100644 index 000000000..bb236466a --- /dev/null +++ b/config/kernel-zlib.m4 @@ -0,0 +1,63 @@ +dnl # +dnl # zlib inflate compat, +dnl # Verify the kernel has CONFIG_ZLIB_INFLATE support enabled. +dnl # +AC_DEFUN([SPL_AC_CONFIG_ZLIB_INFLATE], [ + AC_MSG_CHECKING([whether CONFIG_ZLIB_INFLATE is defined]) + SPL_LINUX_TRY_COMPILE([ + #if !defined(CONFIG_ZLIB_INFLATE) && \ + !defined(CONFIG_ZLIB_INFLATE_MODULE) + #error CONFIG_ZLIB_INFLATE not defined + #endif + ],[ ],[ + AC_MSG_RESULT([yes]) + ],[ + AC_MSG_RESULT([no]) + AC_MSG_ERROR([ + *** This kernel does not include the required zlib inflate support. + *** Rebuild the kernel with CONFIG_ZLIB_INFLATE=y|m set.]) + ]) +]) + +dnl # +dnl # zlib deflate compat, +dnl # Verify the kernel has CONFIG_ZLIB_DEFLATE support enabled. +dnl # +AC_DEFUN([SPL_AC_CONFIG_ZLIB_DEFLATE], [ + AC_MSG_CHECKING([whether CONFIG_ZLIB_DEFLATE is defined]) + SPL_LINUX_TRY_COMPILE([ + #if !defined(CONFIG_ZLIB_DEFLATE) && \ + !defined(CONFIG_ZLIB_DEFLATE_MODULE) + #error CONFIG_ZLIB_DEFLATE not defined + #endif + ],[ ],[ + AC_MSG_RESULT([yes]) + ],[ + AC_MSG_RESULT([no]) + AC_MSG_ERROR([ + *** This kernel does not include the required zlib deflate support. + *** Rebuild the kernel with CONFIG_ZLIB_DEFLATE=y|m set.]) + ]) +]) + +dnl # +dnl # 2.6.39 API compat, +dnl # The function zlib_deflate_workspacesize() now take 2 arguments. +dnl # This was done to avoid always having to allocate the maximum size +dnl # workspace (268K). The caller can now specific the windowBits and +dnl # memLevel compression parameters to get a smaller workspace. 
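A minimal usage sketch, assuming vmalloc()-backed allocation and a hypothetical helper name, of the two calling conventions the probe below distinguishes:

#include <linux/vmalloc.h>
#include <linux/zlib.h>

static void *
spl_zlib_workspace_alloc(void)
{
#if defined(HAVE_2ARGS_ZLIB_DEFLATE_WORKSPACESIZE)
	/* 2.6.39 and newer: size the workspace for the chosen parameters. */
	return (vmalloc(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL)));
#else
	/* Older kernels always report the worst-case (~268K) size. */
	return (vmalloc(zlib_deflate_workspacesize()));
#endif
}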
+dnl # +AC_DEFUN([SPL_AC_2ARGS_ZLIB_DEFLATE_WORKSPACESIZE], + [AC_MSG_CHECKING([whether zlib_deflate_workspacesize() wants 2 args]) + SPL_LINUX_TRY_COMPILE([ + #include <linux/zlib.h> + ],[ + return zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL); + ],[ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_2ARGS_ZLIB_DEFLATE_WORKSPACESIZE, 1, + [zlib_deflate_workspacesize() wants 2 args]) + ],[ + AC_MSG_RESULT(no) + ]) +]) diff --git a/include/spl/rpc/xdr.h b/include/spl/rpc/xdr.h new file mode 100644 index 000000000..0b39b46cf --- /dev/null +++ b/include/spl/rpc/xdr.h @@ -0,0 +1,156 @@ +/* + * Copyright (c) 2008 Sun Microsystems, Inc. + * Written by Ricardo Correia <[email protected]> + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see <http://zfsonlinux.org/>. + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef _SPL_RPC_XDR_H +#define _SPL_RPC_XDR_H + +#include <sys/types.h> + +typedef int bool_t; + +/* + * XDR enums and types. + */ +enum xdr_op { + XDR_ENCODE, + XDR_DECODE +}; + +struct xdr_ops; + +typedef struct { + struct xdr_ops *x_ops; /* Let caller know xdrmem_create() succeeds */ + caddr_t x_addr; /* Current buffer addr */ + caddr_t x_addr_end; /* End of the buffer */ + enum xdr_op x_op; /* Stream direction */ +} XDR; + +typedef bool_t (*xdrproc_t)(XDR *xdrs, void *ptr); + +struct xdr_ops { + bool_t (*xdr_control)(XDR *, int, void *); + + bool_t (*xdr_char)(XDR *, char *); + bool_t (*xdr_u_short)(XDR *, unsigned short *); + bool_t (*xdr_u_int)(XDR *, unsigned *); + bool_t (*xdr_u_longlong_t)(XDR *, u_longlong_t *); + + bool_t (*xdr_opaque)(XDR *, caddr_t, const uint_t); + bool_t (*xdr_string)(XDR *, char **, const uint_t); + bool_t (*xdr_array)(XDR *, caddr_t *, uint_t *, const uint_t, + const uint_t, const xdrproc_t); +}; + +/* + * XDR control operator. + */ +#define XDR_GET_BYTES_AVAIL 1 + +struct xdr_bytesrec { + bool_t xc_is_last_record; + size_t xc_num_avail; +}; + +/* + * XDR functions. + */ +void xdrmem_create(XDR *xdrs, const caddr_t addr, const uint_t size, + const enum xdr_op op); + +/* Currently not needed. If needed later, we'll add it to struct xdr_ops */ +#define xdr_destroy(xdrs) ((void) 0) + +#define xdr_control(xdrs, req, info) \ + (xdrs)->x_ops->xdr_control((xdrs), (req), (info)) + +/* + * For precaution, the following are defined as static inlines instead of macros + * to get some amount of type safety. + * + * Also, macros wouldn't work in the case where typecasting is done, because it + * must be possible to reference the functions' addresses by these names. 
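A hypothetical usage sketch of the interface declared in this header (assuming it is reachable as <rpc/xdr.h> in the SPL build): encode an int into a memory buffer, then decode it back.

#include <rpc/xdr.h>

static int
xdr_roundtrip_example(void)
{
	char buf[16];
	int out = 42, in = 0;
	XDR xs;

	xdrmem_create(&xs, buf, sizeof (buf), XDR_ENCODE);
	if (!xdr_int(&xs, &out))
		return (-1);
	xdr_destroy(&xs);

	xdrmem_create(&xs, buf, sizeof (buf), XDR_DECODE);
	if (!xdr_int(&xs, &in))
		return (-1);
	xdr_destroy(&xs);

	return (in == 42 ? 0 : -1);
}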
+ */ +static inline bool_t xdr_char(XDR *xdrs, char *cp) +{ + return (xdrs->x_ops->xdr_char(xdrs, cp)); +} + +static inline bool_t xdr_u_short(XDR *xdrs, unsigned short *usp) +{ + return (xdrs->x_ops->xdr_u_short(xdrs, usp)); +} + +static inline bool_t xdr_short(XDR *xdrs, short *sp) +{ + BUILD_BUG_ON(sizeof (short) != 2); + return (xdrs->x_ops->xdr_u_short(xdrs, (unsigned short *) sp)); +} + +static inline bool_t xdr_u_int(XDR *xdrs, unsigned *up) +{ + return (xdrs->x_ops->xdr_u_int(xdrs, up)); +} + +static inline bool_t xdr_int(XDR *xdrs, int *ip) +{ + BUILD_BUG_ON(sizeof (int) != 4); + return (xdrs->x_ops->xdr_u_int(xdrs, (unsigned *)ip)); +} + +static inline bool_t xdr_u_longlong_t(XDR *xdrs, u_longlong_t *ullp) +{ + return (xdrs->x_ops->xdr_u_longlong_t(xdrs, ullp)); +} + +static inline bool_t xdr_longlong_t(XDR *xdrs, longlong_t *llp) +{ + BUILD_BUG_ON(sizeof (longlong_t) != 8); + return (xdrs->x_ops->xdr_u_longlong_t(xdrs, (u_longlong_t *)llp)); +} + +/* + * Fixed-length opaque data. + */ +static inline bool_t xdr_opaque(XDR *xdrs, caddr_t cp, const uint_t cnt) +{ + return (xdrs->x_ops->xdr_opaque(xdrs, cp, cnt)); +} + +/* + * Variable-length string. + * The *sp buffer must have (maxsize + 1) bytes. + */ +static inline bool_t xdr_string(XDR *xdrs, char **sp, const uint_t maxsize) +{ + return (xdrs->x_ops->xdr_string(xdrs, sp, maxsize)); +} + +/* + * Variable-length arrays. + */ +static inline bool_t xdr_array(XDR *xdrs, caddr_t *arrp, uint_t *sizep, + const uint_t maxsize, const uint_t elsize, const xdrproc_t elproc) +{ + return xdrs->x_ops->xdr_array(xdrs, arrp, sizep, maxsize, elsize, + elproc); +} + +#endif /* SPL_RPC_XDR_H */ diff --git a/include/spl/sys/acl.h b/include/spl/sys/acl.h new file mode 100644 index 000000000..9fc79c025 --- /dev/null +++ b/include/spl/sys/acl.h @@ -0,0 +1,119 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf <[email protected]>. + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see <http://zfsonlinux.org/>. + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef _SPL_ACL_H +#define _SPL_ACL_H + +#include <sys/types.h> + +typedef struct ace { + uid_t a_who; + uint32_t a_access_mask; + uint16_t a_flags; + uint16_t a_type; +} ace_t; + +typedef struct ace_object { + uid_t a_who; /* uid or gid */ + uint32_t a_access_mask; /* read,write,... 
*/ + uint16_t a_flags; /* see below */ + uint16_t a_type; /* allow or deny */ + uint8_t a_obj_type[16]; /* obj type */ + uint8_t a_inherit_obj_type[16]; /* inherit obj */ +} ace_object_t; + +#define MAX_ACL_ENTRIES 1024 + +#define ACE_READ_DATA 0x00000001 +#define ACE_LIST_DIRECTORY 0x00000001 +#define ACE_WRITE_DATA 0x00000002 +#define ACE_ADD_FILE 0x00000002 +#define ACE_APPEND_DATA 0x00000004 +#define ACE_ADD_SUBDIRECTORY 0x00000004 +#define ACE_READ_NAMED_ATTRS 0x00000008 +#define ACE_WRITE_NAMED_ATTRS 0x00000010 +#define ACE_EXECUTE 0x00000020 +#define ACE_DELETE_CHILD 0x00000040 +#define ACE_READ_ATTRIBUTES 0x00000080 +#define ACE_WRITE_ATTRIBUTES 0x00000100 +#define ACE_DELETE 0x00010000 +#define ACE_READ_ACL 0x00020000 +#define ACE_WRITE_ACL 0x00040000 +#define ACE_WRITE_OWNER 0x00080000 +#define ACE_SYNCHRONIZE 0x00100000 + +#define ACE_FILE_INHERIT_ACE 0x0001 +#define ACE_DIRECTORY_INHERIT_ACE 0x0002 +#define ACE_NO_PROPAGATE_INHERIT_ACE 0x0004 +#define ACE_INHERIT_ONLY_ACE 0x0008 +#define ACE_SUCCESSFUL_ACCESS_ACE_FLAG 0x0010 +#define ACE_FAILED_ACCESS_ACE_FLAG 0x0020 +#define ACE_IDENTIFIER_GROUP 0x0040 +#define ACE_INHERITED_ACE 0x0080 +#define ACE_OWNER 0x1000 +#define ACE_GROUP 0x2000 +#define ACE_EVERYONE 0x4000 + +#define ACE_ACCESS_ALLOWED_ACE_TYPE 0x0000 +#define ACE_ACCESS_DENIED_ACE_TYPE 0x0001 +#define ACE_SYSTEM_AUDIT_ACE_TYPE 0x0002 +#define ACE_SYSTEM_ALARM_ACE_TYPE 0x0003 + +#define ACL_AUTO_INHERIT 0x0001 +#define ACL_PROTECTED 0x0002 +#define ACL_DEFAULTED 0x0004 +#define ACL_FLAGS_ALL (ACL_AUTO_INHERIT|ACL_PROTECTED|ACL_DEFAULTED) + +#define ACE_ACCESS_ALLOWED_COMPOUND_ACE_TYPE 0x04 +#define ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE 0x05 +#define ACE_ACCESS_DENIED_OBJECT_ACE_TYPE 0x06 +#define ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE 0x07 +#define ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE 0x08 +#define ACE_ACCESS_ALLOWED_CALLBACK_ACE_TYPE 0x09 +#define ACE_ACCESS_DENIED_CALLBACK_ACE_TYPE 0x0A +#define ACE_ACCESS_ALLOWED_CALLBACK_OBJECT_ACE_TYPE 0x0B +#define ACE_ACCESS_DENIED_CALLBACK_OBJECT_ACE_TYPE 0x0C +#define ACE_SYSTEM_AUDIT_CALLBACK_ACE_TYPE 0x0D +#define ACE_SYSTEM_ALARM_CALLBACK_ACE_TYPE 0x0E +#define ACE_SYSTEM_AUDIT_CALLBACK_OBJECT_ACE_TYPE 0x0F +#define ACE_SYSTEM_ALARM_CALLBACK_OBJECT_ACE_TYPE 0x10 + +#define ACE_ALL_TYPES 0x001F + +#define ACE_TYPE_FLAGS (ACE_OWNER|ACE_GROUP|ACE_EVERYONE|ACE_IDENTIFIER_GROUP) + +/* BEGIN CSTYLED */ +#define ACE_ALL_PERMS (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \ + ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_READ_NAMED_ATTRS| \ + ACE_WRITE_NAMED_ATTRS|ACE_EXECUTE|ACE_DELETE_CHILD|ACE_READ_ATTRIBUTES| \ + ACE_WRITE_ATTRIBUTES|ACE_DELETE|ACE_READ_ACL|ACE_WRITE_ACL| \ + ACE_WRITE_OWNER|ACE_SYNCHRONIZE) +/* END CSTYLED */ + +#define VSA_ACE 0x0010 +#define VSA_ACECNT 0x0020 +#define VSA_ACE_ALLTYPES 0x0040 +#define VSA_ACE_ACLFLAGS 0x0080 + +#endif /* _SPL_ACL_H */ diff --git a/include/spl/sys/atomic.h b/include/spl/sys/atomic.h new file mode 100644 index 000000000..51b547923 --- /dev/null +++ b/include/spl/sys/atomic.h @@ -0,0 +1,79 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf <[email protected]>. + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see <http://zfsonlinux.org/>. 
+ * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef _SPL_ATOMIC_H +#define _SPL_ATOMIC_H + +#include <linux/module.h> +#include <linux/spinlock.h> +#include <sys/types.h> + +/* + * Map the atomic_* functions to the Linux counterparts. This relies on the + * fact that the atomic types are internally really a uint32 or uint64. If + * this were to change an alternate approach would be needed. + * + * N.B. Due to the limitations of the original API atomicity is not strictly + * preserved when using the 64-bit functions on a 32-bit system. In order + * to support this all consumers would need to be updated to use the Linux + * provided atomic_t and atomic64_t types. + */ +#define atomic_inc_32(v) atomic_inc((atomic_t *)(v)) +#define atomic_dec_32(v) atomic_dec((atomic_t *)(v)) +#define atomic_add_32(v, i) atomic_add((i), (atomic_t *)(v)) +#define atomic_sub_32(v, i) atomic_sub((i), (atomic_t *)(v)) +#define atomic_inc_32_nv(v) atomic_inc_return((atomic_t *)(v)) +#define atomic_dec_32_nv(v) atomic_dec_return((atomic_t *)(v)) +#define atomic_add_32_nv(v, i) atomic_add_return((i), (atomic_t *)(v)) +#define atomic_sub_32_nv(v, i) atomic_sub_return((i), (atomic_t *)(v)) +#define atomic_cas_32(v, x, y) atomic_cmpxchg((atomic_t *)(v), x, y) +#define atomic_swap_32(v, x) atomic_xchg((atomic_t *)(v), x) +#define atomic_inc_64(v) atomic64_inc((atomic64_t *)(v)) +#define atomic_dec_64(v) atomic64_dec((atomic64_t *)(v)) +#define atomic_add_64(v, i) atomic64_add((i), (atomic64_t *)(v)) +#define atomic_sub_64(v, i) atomic64_sub((i), (atomic64_t *)(v)) +#define atomic_inc_64_nv(v) atomic64_inc_return((atomic64_t *)(v)) +#define atomic_dec_64_nv(v) atomic64_dec_return((atomic64_t *)(v)) +#define atomic_add_64_nv(v, i) atomic64_add_return((i), (atomic64_t *)(v)) +#define atomic_sub_64_nv(v, i) atomic64_sub_return((i), (atomic64_t *)(v)) +#define atomic_cas_64(v, x, y) atomic64_cmpxchg((atomic64_t *)(v), x, y) +#define atomic_swap_64(v, x) atomic64_xchg((atomic64_t *)(v), x) + +#ifdef _LP64 +static __inline__ void * +atomic_cas_ptr(volatile void *target, void *cmp, void *newval) +{ + return ((void *)atomic_cas_64((volatile uint64_t *)target, + (uint64_t)cmp, (uint64_t)newval)); +} +#else /* _LP64 */ +static __inline__ void * +atomic_cas_ptr(volatile void *target, void *cmp, void *newval) +{ + return ((void *)atomic_cas_32((volatile uint32_t *)target, + (uint32_t)cmp, (uint32_t)newval)); +} +#endif /* _LP64 */ + +#endif /* _SPL_ATOMIC_H */ diff --git a/include/spl/sys/byteorder.h b/include/spl/sys/byteorder.h new file mode 100644 index 000000000..477707996 --- /dev/null +++ b/include/spl/sys/byteorder.h @@ -0,0 +1,78 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf <[email protected]>. 
+ * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see <http://zfsonlinux.org/>. + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef _SPL_BYTEORDER_H +#define _SPL_BYTEORDER_H + +#include <asm/byteorder.h> +#include <sys/isa_defs.h> + +#define BSWAP_8(x) ((x) & 0xff) +#define BSWAP_16(x) ((BSWAP_8(x) << 8) | BSWAP_8((x) >> 8)) +#define BSWAP_32(x) ((BSWAP_16(x) << 16) | BSWAP_16((x) >> 16)) +#define BSWAP_64(x) ((BSWAP_32(x) << 32) | BSWAP_32((x) >> 32)) + +#define LE_16(x) cpu_to_le16(x) +#define LE_32(x) cpu_to_le32(x) +#define LE_64(x) cpu_to_le64(x) +#define BE_16(x) cpu_to_be16(x) +#define BE_32(x) cpu_to_be32(x) +#define BE_64(x) cpu_to_be64(x) + +#define BE_IN8(xa) \ + *((uint8_t *)(xa)) + +#define BE_IN16(xa) \ + (((uint16_t)BE_IN8(xa) << 8) | BE_IN8((uint8_t *)(xa)+1)) + +#define BE_IN32(xa) \ + (((uint32_t)BE_IN16(xa) << 16) | BE_IN16((uint8_t *)(xa)+2)) + +#ifdef _BIG_ENDIAN +static __inline__ uint64_t +htonll(uint64_t n) +{ + return (n); +} + +static __inline__ uint64_t +ntohll(uint64_t n) +{ + return (n); +} +#else +static __inline__ uint64_t +htonll(uint64_t n) +{ + return ((((uint64_t)htonl(n)) << 32) + htonl(n >> 32)); +} + +static __inline__ uint64_t +ntohll(uint64_t n) +{ + return ((((uint64_t)ntohl(n)) << 32) + ntohl(n >> 32)); +} +#endif + +#endif /* SPL_BYTEORDER_H */ diff --git a/include/spl/sys/callb.h b/include/spl/sys/callb.h new file mode 100644 index 000000000..f1826bfd3 --- /dev/null +++ b/include/spl/sys/callb.h @@ -0,0 +1,54 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf <[email protected]>. + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see <http://zfsonlinux.org/>. + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#ifndef _SPL_CALLB_H +#define _SPL_CALLB_H + +#include <linux/module.h> +#include <sys/mutex.h> + +#define CALLB_CPR_ASSERT(cp) ASSERT(MUTEX_HELD((cp)->cc_lockp)); + +typedef struct callb_cpr { + kmutex_t *cc_lockp; +} callb_cpr_t; + +#define CALLB_CPR_INIT(cp, lockp, func, name) { \ + (cp)->cc_lockp = lockp; \ +} + +#define CALLB_CPR_SAFE_BEGIN(cp) { \ + CALLB_CPR_ASSERT(cp); \ +} + +#define CALLB_CPR_SAFE_END(cp, lockp) { \ + CALLB_CPR_ASSERT(cp); \ +} + +#define CALLB_CPR_EXIT(cp) { \ + ASSERT(MUTEX_HELD((cp)->cc_lockp)); \ + mutex_exit((cp)->cc_lockp); \ +} + +#endif /* _SPL_CALLB_H */ diff --git a/include/spl/sys/callo.h b/include/spl/sys/callo.h new file mode 100644 index 000000000..c43ac92e7 --- /dev/null +++ b/include/spl/sys/callo.h @@ -0,0 +1,52 @@ +/* + * Copyright (C) 2007-2013 Lawrence Livermore National Security, LLC. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf <[email protected]>. + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see <http://zfsonlinux.org/>. + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef _SPL_CALLO_H +#define _SPL_CALLO_H + +/* + * Callout flags: + * + * CALLOUT_FLAG_ROUNDUP + * Roundup the expiration time to the next resolution boundary. + * If this flag is not specified, the expiration time is rounded down. + * CALLOUT_FLAG_ABSOLUTE + * Normally, the expiration passed to the timeout API functions is an + * expiration interval. If this flag is specified, then it is + * interpreted as the expiration time itself. + * CALLOUT_FLAG_HRESTIME + * Normally, callouts are not affected by changes to system time + * (hrestime). This flag is used to create a callout that is affected + * by system time. If system time changes, these timers must be + * handled in a special way (see callout.c). These are used by condition + * variables and LWP timers that need this behavior. + * CALLOUT_FLAG_32BIT + * Legacy interfaces timeout() and realtime_timeout() pass this flag + * to timeout_generic() to indicate that a 32-bit ID should be allocated. + */ +#define CALLOUT_FLAG_ROUNDUP 0x1 +#define CALLOUT_FLAG_ABSOLUTE 0x2 +#define CALLOUT_FLAG_HRESTIME 0x4 +#define CALLOUT_FLAG_32BIT 0x8 + +#endif /* _SPL_CALLB_H */ diff --git a/include/spl/sys/cmn_err.h b/include/spl/sys/cmn_err.h new file mode 100644 index 000000000..be57358b0 --- /dev/null +++ b/include/spl/sys/cmn_err.h @@ -0,0 +1,42 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf <[email protected]>. + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see <http://zfsonlinux.org/>. 
+ * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef _SPL_CMN_ERR_H +#define _SPL_CMN_ERR_H + +#include <stdarg.h> + +#define CE_CONT 0 /* continuation */ +#define CE_NOTE 1 /* notice */ +#define CE_WARN 2 /* warning */ +#define CE_PANIC 3 /* panic */ +#define CE_IGNORE 4 /* print nothing */ + +extern void cmn_err(int, const char *, ...); +extern void vcmn_err(int, const char *, va_list); +extern void vpanic(const char *, va_list); + +#define fm_panic panic + +#endif /* SPL_CMN_ERR_H */ diff --git a/include/spl/sys/condvar.h b/include/spl/sys/condvar.h new file mode 100644 index 000000000..1d47cdd96 --- /dev/null +++ b/include/spl/sys/condvar.h @@ -0,0 +1,80 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf <[email protected]>. + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see <http://zfsonlinux.org/>. + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef _SPL_CONDVAR_H +#define _SPL_CONDVAR_H + +#include <linux/module.h> +#include <sys/kmem.h> +#include <sys/mutex.h> +#include <sys/callo.h> +#include <sys/wait.h> + +/* + * The kcondvar_t struct is protected by mutex taken externally before + * calling any of the wait/signal funs, and passed into the wait funs. 
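A hedged usage sketch of the contract described above (names are illustrative, not from this commit): the caller owns a kmutex_t, holds it around the predicate, and passes it into the wait functions.

#include <sys/condvar.h>
#include <sys/mutex.h>

static kmutex_t example_lock;
static kcondvar_t example_cv;
static int example_ready;

static void
example_init(void)
{
	mutex_init(&example_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&example_cv, NULL, CV_DEFAULT, NULL);
}

static void
example_wait_for_ready(void)
{
	mutex_enter(&example_lock);
	while (!example_ready)
		cv_wait(&example_cv, &example_lock);	/* drops and retakes the lock */
	mutex_exit(&example_lock);
}

static void
example_set_ready(void)
{
	mutex_enter(&example_lock);
	example_ready = 1;
	cv_broadcast(&example_cv);
	mutex_exit(&example_lock);
}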
+ */ +#define CV_MAGIC 0x346545f4 +#define CV_DESTROY 0x346545f5 + +typedef struct { + int cv_magic; + spl_wait_queue_head_t cv_event; + spl_wait_queue_head_t cv_destroy; + atomic_t cv_refs; + atomic_t cv_waiters; + kmutex_t *cv_mutex; +} kcondvar_t; + +typedef enum { CV_DEFAULT = 0, CV_DRIVER } kcv_type_t; + +extern void __cv_init(kcondvar_t *, char *, kcv_type_t, void *); +extern void __cv_destroy(kcondvar_t *); +extern void __cv_wait(kcondvar_t *, kmutex_t *); +extern void __cv_wait_io(kcondvar_t *, kmutex_t *); +extern void __cv_wait_sig(kcondvar_t *, kmutex_t *); +extern clock_t __cv_timedwait(kcondvar_t *, kmutex_t *, clock_t); +extern clock_t __cv_timedwait_io(kcondvar_t *, kmutex_t *, clock_t); +extern clock_t __cv_timedwait_sig(kcondvar_t *, kmutex_t *, clock_t); +extern clock_t cv_timedwait_hires(kcondvar_t *, kmutex_t *, hrtime_t, + hrtime_t res, int flag); +extern clock_t cv_timedwait_sig_hires(kcondvar_t *, kmutex_t *, hrtime_t, + hrtime_t res, int flag); +extern void __cv_signal(kcondvar_t *); +extern void __cv_broadcast(kcondvar_t *c); + +#define cv_init(cvp, name, type, arg) __cv_init(cvp, name, type, arg) +#define cv_destroy(cvp) __cv_destroy(cvp) +#define cv_wait(cvp, mp) __cv_wait(cvp, mp) +#define cv_wait_io(cvp, mp) __cv_wait_io(cvp, mp) +#define cv_wait_sig(cvp, mp) __cv_wait_sig(cvp, mp) +#define cv_wait_interruptible(cvp, mp) cv_wait_sig(cvp, mp) +#define cv_timedwait(cvp, mp, t) __cv_timedwait(cvp, mp, t) +#define cv_timedwait_io(cvp, mp, t) __cv_timedwait_io(cvp, mp, t) +#define cv_timedwait_sig(cvp, mp, t) __cv_timedwait_sig(cvp, mp, t) +#define cv_timedwait_interruptible(cvp, mp, t) cv_timedwait_sig(cvp, mp, t) +#define cv_signal(cvp) __cv_signal(cvp) +#define cv_broadcast(cvp) __cv_broadcast(cvp) + +#endif /* _SPL_CONDVAR_H */ diff --git a/include/spl/sys/console.h b/include/spl/sys/console.h new file mode 100644 index 000000000..3469cb762 --- /dev/null +++ b/include/spl/sys/console.h @@ -0,0 +1,44 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf <[email protected]>. + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see <http://zfsonlinux.org/>. + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef _SPL_CONSOLE_H +#define _SPL_CONSOLE_H + +void +console_vprintf(const char *fmt, va_list args) +{ + vprintk(fmt, args); +} + +void +console_printf(const char *fmt, ...) 
+{ + va_list args; + + va_start(args, fmt); + console_vprintf(fmt, args); + va_end(args); +} + +#endif /* _SPL_CONSOLE_H */ diff --git a/include/spl/sys/cred.h b/include/spl/sys/cred.h new file mode 100644 index 000000000..fd063399b --- /dev/null +++ b/include/spl/sys/cred.h @@ -0,0 +1,75 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf <[email protected]>. + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see <http://zfsonlinux.org/>. + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef _SPL_CRED_H +#define _SPL_CRED_H + +#include <linux/module.h> +#include <linux/cred.h> +#include <sys/types.h> +#include <sys/vfs.h> + +typedef struct cred cred_t; + +#define kcred ((cred_t *)(init_task.cred)) +#define CRED() ((cred_t *)current_cred()) + +/* Linux 4.9 API change, GROUP_AT was removed */ +#ifndef GROUP_AT +#define GROUP_AT(gi, i) ((gi)->gid[i]) +#endif + +#ifdef HAVE_KUIDGID_T + +#define KUID_TO_SUID(x) (__kuid_val(x)) +#define KGID_TO_SGID(x) (__kgid_val(x)) +#define SUID_TO_KUID(x) (KUIDT_INIT(x)) +#define SGID_TO_KGID(x) (KGIDT_INIT(x)) +#define KGIDP_TO_SGIDP(x) (&(x)->val) + +#else /* HAVE_KUIDGID_T */ + +#define KUID_TO_SUID(x) (x) +#define KGID_TO_SGID(x) (x) +#define SUID_TO_KUID(x) (x) +#define SGID_TO_KGID(x) (x) +#define KGIDP_TO_SGIDP(x) (x) + +#endif /* HAVE_KUIDGID_T */ + +extern void crhold(cred_t *cr); +extern void crfree(cred_t *cr); +extern uid_t crgetuid(const cred_t *cr); +extern uid_t crgetruid(const cred_t *cr); +extern uid_t crgetsuid(const cred_t *cr); +extern uid_t crgetfsuid(const cred_t *cr); +extern gid_t crgetgid(const cred_t *cr); +extern gid_t crgetrgid(const cred_t *cr); +extern gid_t crgetsgid(const cred_t *cr); +extern gid_t crgetfsgid(const cred_t *cr); +extern int crgetngroups(const cred_t *cr); +extern gid_t *crgetgroups(const cred_t *cr); +extern int groupmember(gid_t gid, const cred_t *cr); + +#endif /* _SPL_CRED_H */ diff --git a/include/spl/sys/ctype.h b/include/spl/sys/ctype.h new file mode 100644 index 000000000..18beb1daa --- /dev/null +++ b/include/spl/sys/ctype.h @@ -0,0 +1,30 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf <[email protected]>. + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see <http://zfsonlinux.org/>. 
+ * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef _SPL_CTYPE_H +#define _SPL_CTYPE_H + +#include <linux/ctype.h> + +#endif /* SPL_CTYPE_H */ diff --git a/include/spl/sys/debug.h b/include/spl/sys/debug.h new file mode 100644 index 000000000..a4a458066 --- /dev/null +++ b/include/spl/sys/debug.h @@ -0,0 +1,131 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf <[email protected]>. + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see <http://zfsonlinux.org/>. + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see <http://www.gnu.org/licenses/>. + */ + +/* + * Available Solaris debug functions. All of the ASSERT() macros will be + * compiled out when NDEBUG is defined, this is the default behavior for + * the SPL. To enable assertions use the --enable-debug with configure. + * The VERIFY() functions are never compiled out and cannot be disabled. + * + * PANIC() - Panic the node and print message. + * ASSERT() - Assert X is true, if not panic. + * ASSERTV() - Wraps a variable declaration which is only used by ASSERT(). + * ASSERT3B() - Assert boolean X OP Y is true, if not panic. + * ASSERT3S() - Assert signed X OP Y is true, if not panic. + * ASSERT3U() - Assert unsigned X OP Y is true, if not panic. + * ASSERT3P() - Assert pointer X OP Y is true, if not panic. + * ASSERT0() - Assert value is zero, if not panic. + * VERIFY() - Verify X is true, if not panic. + * VERIFY3B() - Verify boolean X OP Y is true, if not panic. + * VERIFY3S() - Verify signed X OP Y is true, if not panic. + * VERIFY3U() - Verify unsigned X OP Y is true, if not panic. + * VERIFY3P() - Verify pointer X OP Y is true, if not panic. + * VERIFY0() - Verify value is zero, if not panic. + */ + +#ifndef _SPL_DEBUG_H +#define _SPL_DEBUG_H + +/* + * Common DEBUG functionality. + */ +int spl_panic(const char *file, const char *func, int line, + const char *fmt, ...); +void spl_dumpstack(void); + +/* BEGIN CSTYLED */ +#define PANIC(fmt, a...) 
\ + spl_panic(__FILE__, __FUNCTION__, __LINE__, fmt, ## a) + +#define VERIFY(cond) \ + (void) (unlikely(!(cond)) && \ + spl_panic(__FILE__, __FUNCTION__, __LINE__, \ + "%s", "VERIFY(" #cond ") failed\n")) + +#define VERIFY3_IMPL(LEFT, OP, RIGHT, TYPE, FMT, CAST) do { \ + TYPE _verify3_left = (TYPE)(LEFT); \ + TYPE _verify3_right = (TYPE)(RIGHT); \ + if (!(_verify3_left OP _verify3_right)) \ + spl_panic(__FILE__, __FUNCTION__, __LINE__, \ + "VERIFY3(" #LEFT " " #OP " " #RIGHT ") " \ + "failed (" FMT " " #OP " " FMT ")\n", \ + CAST (_verify3_left), CAST (_verify3_right)); \ + } while (0) + +#define VERIFY3B(x,y,z) VERIFY3_IMPL(x, y, z, boolean_t, "%d", (boolean_t)) +#define VERIFY3S(x,y,z) VERIFY3_IMPL(x, y, z, int64_t, "%lld", (long long)) +#define VERIFY3U(x,y,z) VERIFY3_IMPL(x, y, z, uint64_t, "%llu", \ + (unsigned long long)) +#define VERIFY3P(x,y,z) VERIFY3_IMPL(x, y, z, uintptr_t, "%p", (void *)) +#define VERIFY0(x) VERIFY3_IMPL(0, ==, x, int64_t, "%lld", (long long)) + +#define CTASSERT_GLOBAL(x) _CTASSERT(x, __LINE__) +#define CTASSERT(x) { _CTASSERT(x, __LINE__); } +#define _CTASSERT(x, y) __CTASSERT(x, y) +#define __CTASSERT(x, y) \ + typedef char __attribute__ ((unused)) \ + __compile_time_assertion__ ## y[(x) ? 1 : -1] + +/* + * Debugging disabled (--disable-debug) + */ +#ifdef NDEBUG + +#define SPL_DEBUG_STR "" +#define ASSERT(x) ((void)0) +#define ASSERTV(x) +#define ASSERT3B(x,y,z) ((void)0) +#define ASSERT3S(x,y,z) ((void)0) +#define ASSERT3U(x,y,z) ((void)0) +#define ASSERT3P(x,y,z) ((void)0) +#define ASSERT0(x) ((void)0) +#define IMPLY(A, B) ((void)0) +#define EQUIV(A, B) ((void)0) + +/* + * Debugging enabled (--enable-debug) + */ +#else + +#define SPL_DEBUG_STR " (DEBUG mode)" +#define ASSERT(cond) VERIFY(cond) +#define ASSERTV(x) x +#define ASSERT3B(x,y,z) VERIFY3B(x, y, z) +#define ASSERT3S(x,y,z) VERIFY3S(x, y, z) +#define ASSERT3U(x,y,z) VERIFY3U(x, y, z) +#define ASSERT3P(x,y,z) VERIFY3P(x, y, z) +#define ASSERT0(x) VERIFY0(x) +#define IMPLY(A, B) \ + ((void)(((!(A)) || (B)) || \ + spl_panic(__FILE__, __FUNCTION__, __LINE__, \ + "(" #A ") implies (" #B ")"))) +#define EQUIV(A, B) \ + ((void)((!!(A) == !!(B)) || \ + spl_panic(__FILE__, __FUNCTION__, __LINE__, \ + "(" #A ") is equivalent to (" #B ")"))) +/* END CSTYLED */ + +#endif /* NDEBUG */ + +#endif /* SPL_DEBUG_H */ diff --git a/include/spl/sys/disp.h b/include/spl/sys/disp.h new file mode 100644 index 000000000..413b623c8 --- /dev/null +++ b/include/spl/sys/disp.h @@ -0,0 +1,34 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf <[email protected]>. + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see <http://zfsonlinux.org/>. + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#ifndef _SPL_DISP_H +#define _SPL_DISP_H + +#include <linux/preempt.h> + +#define kpreempt(unused) schedule() +#define kpreempt_disable() preempt_disable() +#define kpreempt_enable() preempt_enable() + +#endif /* SPL_DISP_H */ diff --git a/include/spl/sys/dkio.h b/include/spl/sys/dkio.h new file mode 100644 index 000000000..49f166a9c --- /dev/null +++ b/include/spl/sys/dkio.h @@ -0,0 +1,40 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf <[email protected]>. + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see <http://zfsonlinux.org/>. + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef _SPL_DKIO_H +#define _SPL_DKIO_H + +#define DFL_SZ(num_exts) \ + (sizeof (dkioc_free_list_t) + (num_exts - 1) * 16) + +#define DKIOC (0x04 << 8) +#define DKIOCFLUSHWRITECACHE (DKIOC|34) /* flush cache to phys medium */ + +/* + * ioctl to free space (e.g. SCSI UNMAP) off a disk. + * Pass a dkioc_free_list_t containing a list of extents to be freed. + */ +#define DKIOCFREE (DKIOC|50) + +#endif /* _SPL_DKIO_H */ diff --git a/include/spl/sys/dkioc_free_util.h b/include/spl/sys/dkioc_free_util.h new file mode 100644 index 000000000..d519b2f8e --- /dev/null +++ b/include/spl/sys/dkioc_free_util.h @@ -0,0 +1,58 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf <[email protected]>. + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see <http://zfsonlinux.org/>. + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef _SPL_DKIOC_UTIL_H +#define _SPL_DKIOC_UTIL_H + +#include <sys/dkio.h> + +typedef struct dkioc_free_list_ext_s { + uint64_t dfle_start; + uint64_t dfle_length; +} dkioc_free_list_ext_t; + +typedef struct dkioc_free_list_s { + uint64_t dfl_flags; + uint64_t dfl_num_exts; + int64_t dfl_offset; + + /* + * N.B. this is only an internal debugging API! 
This is only called + * from debug builds of sd for pre-release checking. Remove before GA! + */ + void (*dfl_ck_func)(uint64_t, uint64_t, void *); + void *dfl_ck_arg; + + dkioc_free_list_ext_t dfl_exts[1]; +} dkioc_free_list_t; + +static inline void dfl_free(dkioc_free_list_t *dfl) { + vmem_free(dfl, DFL_SZ(dfl->dfl_num_exts)); +} + +static inline dkioc_free_list_t *dfl_alloc(uint64_t dfl_num_exts, int flags) { + return (vmem_zalloc(DFL_SZ(dfl_num_exts), flags)); +} + +#endif /* _SPL_DKIOC_UTIL_H */ diff --git a/include/spl/sys/fcntl.h b/include/spl/sys/fcntl.h new file mode 100644 index 000000000..3faa5dad7 --- /dev/null +++ b/include/spl/sys/fcntl.h @@ -0,0 +1,37 @@ +/* + * Copyright (C) 2010 Lawrence Livermore National Security, LLC. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf <[email protected]>. + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see <http://zfsonlinux.org/>. + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef _SPL_FCNTL_H +#define _SPL_FCNTL_H + +#include <asm/fcntl.h> + +#define F_FREESP 11 + +#ifdef CONFIG_64BIT +typedef struct flock flock64_t; +#else +typedef struct flock64 flock64_t; +#endif /* CONFIG_64BIT */ + +#endif /* _SPL_FCNTL_H */ diff --git a/include/spl/sys/file.h b/include/spl/sys/file.h new file mode 100644 index 000000000..05dbc0814 --- /dev/null +++ b/include/spl/sys/file.h @@ -0,0 +1,52 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf <[email protected]>. + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see <http://zfsonlinux.org/>. + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#ifndef _SPL_FILE_H +#define _SPL_FILE_H + +#define FIGNORECASE 0x00080000 +#define FKIOCTL 0x80000000 +#define ED_CASE_CONFLICT 0x10 + +#ifdef HAVE_INODE_LOCK_SHARED +#define spl_inode_lock(ip) inode_lock(ip) +#define spl_inode_unlock(ip) inode_unlock(ip) +#define spl_inode_lock_shared(ip) inode_lock_shared(ip) +#define spl_inode_unlock_shared(ip) inode_unlock_shared(ip) +#define spl_inode_trylock(ip) inode_trylock(ip) +#define spl_inode_trylock_shared(ip) inode_trylock_shared(ip) +#define spl_inode_is_locked(ip) inode_is_locked(ip) +#define spl_inode_lock_nested(ip, s) inode_lock_nested(ip, s) +#else +#define spl_inode_lock(ip) mutex_lock(&(ip)->i_mutex) +#define spl_inode_unlock(ip) mutex_unlock(&(ip)->i_mutex) +#define spl_inode_lock_shared(ip) mutex_lock(&(ip)->i_mutex) +#define spl_inode_unlock_shared(ip) mutex_unlock(&(ip)->i_mutex) +#define spl_inode_trylock(ip) mutex_trylock(&(ip)->i_mutex) +#define spl_inode_trylock_shared(ip) mutex_trylock(&(ip)->i_mutex) +#define spl_inode_is_locked(ip) mutex_is_locked(&(ip)->i_mutex) +#define spl_inode_lock_nested(ip, s) mutex_lock_nested(&(ip)->i_mutex, s) +#endif + +#endif /* SPL_FILE_H */ diff --git a/include/spl/sys/inttypes.h b/include/spl/sys/inttypes.h new file mode 100644 index 000000000..92e76206b --- /dev/null +++ b/include/spl/sys/inttypes.h @@ -0,0 +1,28 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf <[email protected]>. + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see <http://zfsonlinux.org/>. + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef _SPL_INTTYPES_H +#define _SPL_INTTYPES_H + +#endif /* SPL_INTTYPES_H */ diff --git a/include/spl/sys/isa_defs.h b/include/spl/sys/isa_defs.h new file mode 100644 index 000000000..089ae0f85 --- /dev/null +++ b/include/spl/sys/isa_defs.h @@ -0,0 +1,229 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf <[email protected]>. + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see <http://zfsonlinux.org/>. + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef _SPL_ISA_DEFS_H +#define _SPL_ISA_DEFS_H + +/* x86_64 arch specific defines */ +#if defined(__x86_64) || defined(__x86_64__) + +#if !defined(__x86_64) +#define __x86_64 +#endif + +#if !defined(__amd64) +#define __amd64 +#endif + +#if !defined(__x86) +#define __x86 +#endif + +#if !defined(_LP64) +#define _LP64 +#endif + +#define _ALIGNMENT_REQUIRED 1 + + +/* i386 arch specific defines */ +#elif defined(__i386) || defined(__i386__) + +#if !defined(__i386) +#define __i386 +#endif + +#if !defined(__x86) +#define __x86 +#endif + +#if !defined(_ILP32) +#define _ILP32 +#endif + +#define _ALIGNMENT_REQUIRED 0 + +/* powerpc (ppc64) arch specific defines */ +#elif defined(__powerpc) || defined(__powerpc__) || defined(__powerpc64__) + +#if !defined(__powerpc) +#define __powerpc +#endif + +#if !defined(__powerpc__) +#define __powerpc__ +#endif + +#if defined(__powerpc64__) +#if !defined(_LP64) +#define _LP64 +#endif +#else +#if !defined(_ILP32) +#define _ILP32 +#endif +#endif + +/* + * Illumos doesn't define _ALIGNMENT_REQUIRED for PPC, so default to 1 + * out of paranoia. + */ +#define _ALIGNMENT_REQUIRED 1 + +/* arm arch specific defines */ +#elif defined(__arm) || defined(__arm__) || defined(__aarch64__) + +#if !defined(__arm) +#define __arm +#endif + +#if !defined(__arm__) +#define __arm__ +#endif + +#if defined(__aarch64__) +#if !defined(_LP64) +#define _LP64 +#endif +#else +#if !defined(_ILP32) +#define _ILP32 +#endif +#endif + +#if defined(__ARMEL__) || defined(__AARCH64EL__) +#define _LITTLE_ENDIAN +#else +#define _BIG_ENDIAN +#endif + +/* + * Illumos doesn't define _ALIGNMENT_REQUIRED for ARM, so default to 1 + * out of paranoia. + */ +#define _ALIGNMENT_REQUIRED 1 + +/* sparc arch specific defines */ +#elif defined(__sparc) || defined(__sparc__) + +#if !defined(__sparc) +#define __sparc +#endif + +#if !defined(__sparc__) +#define __sparc__ +#endif + +#if defined(__arch64__) +#if !defined(_LP64) +#define _LP64 +#endif +#else +#if !defined(_ILP32) +#define _ILP32 +#endif +#endif + +#define _BIG_ENDIAN +#define _SUNOS_VTOC_16 +#define _ALIGNMENT_REQUIRED 1 + +/* s390 arch specific defines */ +#elif defined(__s390__) +#if defined(__s390x__) +#if !defined(_LP64) +#define _LP64 +#endif +#else +#if !defined(_ILP32) +#define _ILP32 +#endif +#endif + +#define _BIG_ENDIAN + +/* + * Illumos doesn't define _ALIGNMENT_REQUIRED for s390, so default to 1 + * out of paranoia. + */ +#define _ALIGNMENT_REQUIRED 1 + +/* MIPS arch specific defines */ +#elif defined(__mips__) + +#if defined(__MIPSEB__) +#define _BIG_ENDIAN +#elif defined(__MIPSEL__) +#define _LITTLE_ENDIAN +#else +#error MIPS no endian specified +#endif + +#ifndef _LP64 +#define _ILP32 +#endif + +#define _SUNOS_VTOC_16 + +/* + * Illumos doesn't define _ALIGNMENT_REQUIRED for MIPS, so default to 1 + * out of paranoia. 
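 *
 * (Editorial note, not part of the original header: _ALIGNMENT_REQUIRED
 * signals whether the ISA tolerates unaligned loads and stores; a
 * hypothetical consumer might do, e.g.:
 *
 *	#if _ALIGNMENT_REQUIRED
 *		memcpy(&tmp, unaligned_ptr, sizeof (tmp));
 *	#else
 *		tmp = *(const uint64_t *)unaligned_ptr;
 *	#endif
 *
 * where tmp and unaligned_ptr are illustrative names only.)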
+ */ +#define _ALIGNMENT_REQUIRED 1 + +#else +/* + * Currently supported: + * x86_64, i386, arm, powerpc, s390, sparc, and mips + */ +#error "Unsupported ISA type" +#endif + +#if defined(_ILP32) && defined(_LP64) +#error "Both _ILP32 and _LP64 are defined" +#endif + +#if !defined(_ILP32) && !defined(_LP64) +#error "Neither _ILP32 or _LP64 are defined" +#endif + +#include <sys/byteorder.h> + +#if defined(__LITTLE_ENDIAN) && !defined(_LITTLE_ENDIAN) +#define _LITTLE_ENDIAN __LITTLE_ENDIAN +#endif + +#if defined(__BIG_ENDIAN) && !defined(_BIG_ENDIAN) +#define _BIG_ENDIAN __BIG_ENDIAN +#endif + +#if defined(_LITTLE_ENDIAN) && defined(_BIG_ENDIAN) +#error "Both _LITTLE_ENDIAN and _BIG_ENDIAN are defined" +#endif + +#if !defined(_LITTLE_ENDIAN) && !defined(_BIG_ENDIAN) +#error "Neither _LITTLE_ENDIAN or _BIG_ENDIAN are defined" +#endif + +#endif /* _SPL_ISA_DEFS_H */ diff --git a/include/spl/sys/kmem.h b/include/spl/sys/kmem.h new file mode 100644 index 000000000..d6b428551 --- /dev/null +++ b/include/spl/sys/kmem.h @@ -0,0 +1,185 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf <[email protected]>. + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see <http://zfsonlinux.org/>. + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef _SPL_KMEM_H +#define _SPL_KMEM_H + +#include <sys/debug.h> +#include <linux/slab.h> +#include <linux/sched.h> + +extern int kmem_debugging(void); +extern char *kmem_vasprintf(const char *fmt, va_list ap); +extern char *kmem_asprintf(const char *fmt, ...); +extern char *strdup(const char *str); +extern void strfree(char *str); + +/* + * Memory allocation interfaces + */ +#define KM_SLEEP 0x0000 /* can block for memory; success guaranteed */ +#define KM_NOSLEEP 0x0001 /* cannot block for memory; may fail */ +#define KM_PUSHPAGE 0x0004 /* can block for memory; may use reserve */ +#define KM_ZERO 0x1000 /* zero the allocation */ +#define KM_VMEM 0x2000 /* caller is vmem_* wrapper */ + +#define KM_PUBLIC_MASK (KM_SLEEP | KM_NOSLEEP | KM_PUSHPAGE) + +static int spl_fstrans_check(void); + +/* + * Convert a KM_* flags mask to its Linux GFP_* counterpart. The conversion + * function is context aware which means that KM_SLEEP allocations can be + * safely used in syncing contexts which have set PF_FSTRANS. 
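 *
 * (Editorial illustration, not part of the original header: based on the
 * conversion routine below, the public KM_* flags map roughly as follows;
 * every result also carries __GFP_NOWARN | __GFP_COMP:
 *
 *	KM_SLEEP	GFP_KERNEL, minus __GFP_IO|__GFP_FS when
 *			spl_fstrans_check() indicates a syncing context
 *	KM_NOSLEEP	GFP_ATOMIC | __GFP_NORETRY
 *	KM_PUSHPAGE	additionally sets __GFP_HIGH
 *	KM_ZERO		additionally sets __GFP_ZERO
 * )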
+ */ +static inline gfp_t +kmem_flags_convert(int flags) +{ + gfp_t lflags = __GFP_NOWARN | __GFP_COMP; + + if (flags & KM_NOSLEEP) { + lflags |= GFP_ATOMIC | __GFP_NORETRY; + } else { + lflags |= GFP_KERNEL; + if (spl_fstrans_check()) + lflags &= ~(__GFP_IO|__GFP_FS); + } + + if (flags & KM_PUSHPAGE) + lflags |= __GFP_HIGH; + + if (flags & KM_ZERO) + lflags |= __GFP_ZERO; + + return (lflags); +} + +typedef struct { + struct task_struct *fstrans_thread; + unsigned int saved_flags; +} fstrans_cookie_t; + +/* + * Introduced in Linux 3.9, however this cannot be solely relied on before + * Linux 3.18 as it doesn't turn off __GFP_FS as it should. + */ +#ifdef PF_MEMALLOC_NOIO +#define __SPL_PF_MEMALLOC_NOIO (PF_MEMALLOC_NOIO) +#else +#define __SPL_PF_MEMALLOC_NOIO (0) +#endif + +/* + * PF_FSTRANS is removed from Linux 4.12 + */ +#ifdef PF_FSTRANS +#define __SPL_PF_FSTRANS (PF_FSTRANS) +#else +#define __SPL_PF_FSTRANS (0) +#endif + +#define SPL_FSTRANS (__SPL_PF_FSTRANS|__SPL_PF_MEMALLOC_NOIO) + +static inline fstrans_cookie_t +spl_fstrans_mark(void) +{ + fstrans_cookie_t cookie; + + BUILD_BUG_ON(SPL_FSTRANS == 0); + + cookie.fstrans_thread = current; + cookie.saved_flags = current->flags & SPL_FSTRANS; + current->flags |= SPL_FSTRANS; + + return (cookie); +} + +static inline void +spl_fstrans_unmark(fstrans_cookie_t cookie) +{ + ASSERT3P(cookie.fstrans_thread, ==, current); + ASSERT((current->flags & SPL_FSTRANS) == SPL_FSTRANS); + + current->flags &= ~SPL_FSTRANS; + current->flags |= cookie.saved_flags; +} + +static inline int +spl_fstrans_check(void) +{ + return (current->flags & SPL_FSTRANS); +} + +/* + * specifically used to check PF_FSTRANS flag, cannot be relied on for + * checking spl_fstrans_mark(). + */ +static inline int +__spl_pf_fstrans_check(void) +{ + return (current->flags & __SPL_PF_FSTRANS); +} + +#ifdef HAVE_ATOMIC64_T +#define kmem_alloc_used_add(size) atomic64_add(size, &kmem_alloc_used) +#define kmem_alloc_used_sub(size) atomic64_sub(size, &kmem_alloc_used) +#define kmem_alloc_used_read() atomic64_read(&kmem_alloc_used) +#define kmem_alloc_used_set(size) atomic64_set(&kmem_alloc_used, size) +extern atomic64_t kmem_alloc_used; +extern unsigned long long kmem_alloc_max; +#else /* HAVE_ATOMIC64_T */ +#define kmem_alloc_used_add(size) atomic_add(size, &kmem_alloc_used) +#define kmem_alloc_used_sub(size) atomic_sub(size, &kmem_alloc_used) +#define kmem_alloc_used_read() atomic_read(&kmem_alloc_used) +#define kmem_alloc_used_set(size) atomic_set(&kmem_alloc_used, size) +extern atomic_t kmem_alloc_used; +extern unsigned long long kmem_alloc_max; +#endif /* HAVE_ATOMIC64_T */ + +extern unsigned int spl_kmem_alloc_warn; +extern unsigned int spl_kmem_alloc_max; + +#define kmem_alloc(sz, fl) spl_kmem_alloc((sz), (fl), __func__, __LINE__) +#define kmem_zalloc(sz, fl) spl_kmem_zalloc((sz), (fl), __func__, __LINE__) +#define kmem_free(ptr, sz) spl_kmem_free((ptr), (sz)) + +extern void *spl_kmem_alloc(size_t sz, int fl, const char *func, int line); +extern void *spl_kmem_zalloc(size_t sz, int fl, const char *func, int line); +extern void spl_kmem_free(const void *ptr, size_t sz); + +/* + * The following functions are only available for internal use. 
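 *
 * (Editorial aside: a usage sketch of the public helpers declared above,
 * assuming a caller that must avoid re-entering the filesystem; the size
 * variable is hypothetical:
 *
 *	fstrans_cookie_t cookie = spl_fstrans_mark();
 *	void *buf = kmem_zalloc(size, KM_SLEEP);
 *	...
 *	kmem_free(buf, size);
 *	spl_fstrans_unmark(cookie);
 *
 * With the cookie held, KM_SLEEP allocations drop __GFP_IO|__GFP_FS and so
 * cannot recurse back into the filesystem.)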
+ */
+extern void *spl_kmem_alloc_impl(size_t size, int flags, int node);
+extern void *spl_kmem_alloc_debug(size_t size, int flags, int node);
+extern void *spl_kmem_alloc_track(size_t size, int flags,
+    const char *func, int line, int node);
+extern void spl_kmem_free_impl(const void *buf, size_t size);
+extern void spl_kmem_free_debug(const void *buf, size_t size);
+extern void spl_kmem_free_track(const void *buf, size_t size);
+
+extern int spl_kmem_init(void);
+extern void spl_kmem_fini(void);
+
+#endif /* _SPL_KMEM_H */ diff --git a/include/spl/sys/kmem_cache.h b/include/spl/sys/kmem_cache.h new file mode 100644 index 000000000..8fa14f67e --- /dev/null +++ b/include/spl/sys/kmem_cache.h @@ -0,0 +1,240 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <[email protected]>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _SPL_KMEM_CACHE_H
+#define _SPL_KMEM_CACHE_H
+
+#include <sys/taskq.h>
+
+/*
+ * Slab allocation interfaces. The SPL slab differs from the standard
+ * Linux SLAB or SLUB primarily in that each cache may be backed by slabs
+ * allocated from the physical or virtual memory address space. The virtual
+ * slabs allow for good behavior when allocating large objects of identical
+ * size. This slab implementation also supports both constructors and
+ * destructors which the Linux slab does not.
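 *
 * (Editorial usage sketch based on the interfaces declared below; my_obj_t,
 * my_obj_ctor and my_cache are hypothetical names:
 *
 *	static int
 *	my_obj_ctor(void *buf, void *priv, int flags)
 *	{
 *		memset(buf, 0, sizeof (my_obj_t));
 *		return (0);
 *	}
 *
 *	my_cache = kmem_cache_create("my_obj_cache", sizeof (my_obj_t), 0,
 *	    my_obj_ctor, NULL, NULL, NULL, NULL, 0);
 *	obj = kmem_cache_alloc(my_cache, KM_SLEEP);
 *	...
 *	kmem_cache_free(my_cache, obj);
 *	kmem_cache_destroy(my_cache);
 * )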
+ */ +enum { + KMC_BIT_NOTOUCH = 0, /* Don't update ages */ + KMC_BIT_NODEBUG = 1, /* Default behavior */ + KMC_BIT_NOMAGAZINE = 2, /* XXX: Unsupported */ + KMC_BIT_NOHASH = 3, /* XXX: Unsupported */ + KMC_BIT_QCACHE = 4, /* XXX: Unsupported */ + KMC_BIT_KMEM = 5, /* Use kmem cache */ + KMC_BIT_VMEM = 6, /* Use vmem cache */ + KMC_BIT_SLAB = 7, /* Use Linux slab cache */ + KMC_BIT_OFFSLAB = 8, /* Objects not on slab */ + KMC_BIT_NOEMERGENCY = 9, /* Disable emergency objects */ + KMC_BIT_DEADLOCKED = 14, /* Deadlock detected */ + KMC_BIT_GROWING = 15, /* Growing in progress */ + KMC_BIT_REAPING = 16, /* Reaping in progress */ + KMC_BIT_DESTROY = 17, /* Destroy in progress */ + KMC_BIT_TOTAL = 18, /* Proc handler helper bit */ + KMC_BIT_ALLOC = 19, /* Proc handler helper bit */ + KMC_BIT_MAX = 20, /* Proc handler helper bit */ +}; + +/* kmem move callback return values */ +typedef enum kmem_cbrc { + KMEM_CBRC_YES = 0, /* Object moved */ + KMEM_CBRC_NO = 1, /* Object not moved */ + KMEM_CBRC_LATER = 2, /* Object not moved, try again later */ + KMEM_CBRC_DONT_NEED = 3, /* Neither object is needed */ + KMEM_CBRC_DONT_KNOW = 4, /* Object unknown */ +} kmem_cbrc_t; + +#define KMC_NOTOUCH (1 << KMC_BIT_NOTOUCH) +#define KMC_NODEBUG (1 << KMC_BIT_NODEBUG) +#define KMC_NOMAGAZINE (1 << KMC_BIT_NOMAGAZINE) +#define KMC_NOHASH (1 << KMC_BIT_NOHASH) +#define KMC_QCACHE (1 << KMC_BIT_QCACHE) +#define KMC_KMEM (1 << KMC_BIT_KMEM) +#define KMC_VMEM (1 << KMC_BIT_VMEM) +#define KMC_SLAB (1 << KMC_BIT_SLAB) +#define KMC_OFFSLAB (1 << KMC_BIT_OFFSLAB) +#define KMC_NOEMERGENCY (1 << KMC_BIT_NOEMERGENCY) +#define KMC_DEADLOCKED (1 << KMC_BIT_DEADLOCKED) +#define KMC_GROWING (1 << KMC_BIT_GROWING) +#define KMC_REAPING (1 << KMC_BIT_REAPING) +#define KMC_DESTROY (1 << KMC_BIT_DESTROY) +#define KMC_TOTAL (1 << KMC_BIT_TOTAL) +#define KMC_ALLOC (1 << KMC_BIT_ALLOC) +#define KMC_MAX (1 << KMC_BIT_MAX) + +#define KMC_REAP_CHUNK INT_MAX +#define KMC_DEFAULT_SEEKS 1 + +#define KMC_EXPIRE_AGE 0x1 /* Due to age */ +#define KMC_EXPIRE_MEM 0x2 /* Due to low memory */ + +#define KMC_RECLAIM_ONCE 0x1 /* Force a single shrinker pass */ + +extern unsigned int spl_kmem_cache_expire; +extern struct list_head spl_kmem_cache_list; +extern struct rw_semaphore spl_kmem_cache_sem; + +#define SKM_MAGIC 0x2e2e2e2e +#define SKO_MAGIC 0x20202020 +#define SKS_MAGIC 0x22222222 +#define SKC_MAGIC 0x2c2c2c2c + +#define SPL_KMEM_CACHE_DELAY 15 /* Minimum slab release age */ +#define SPL_KMEM_CACHE_REAP 0 /* Default reap everything */ +#define SPL_KMEM_CACHE_OBJ_PER_SLAB 8 /* Target objects per slab */ +#define SPL_KMEM_CACHE_OBJ_PER_SLAB_MIN 1 /* Minimum objects per slab */ +#define SPL_KMEM_CACHE_ALIGN 8 /* Default object alignment */ +#ifdef _LP64 +#define SPL_KMEM_CACHE_MAX_SIZE 32 /* Max slab size in MB */ +#else +#define SPL_KMEM_CACHE_MAX_SIZE 4 /* Max slab size in MB */ +#endif + +#define SPL_MAX_ORDER (MAX_ORDER - 3) +#define SPL_MAX_ORDER_NR_PAGES (1 << (SPL_MAX_ORDER - 1)) + +#ifdef CONFIG_SLUB +#define SPL_MAX_KMEM_CACHE_ORDER PAGE_ALLOC_COSTLY_ORDER +#define SPL_MAX_KMEM_ORDER_NR_PAGES (1 << (SPL_MAX_KMEM_CACHE_ORDER - 1)) +#else +#define SPL_MAX_KMEM_ORDER_NR_PAGES (KMALLOC_MAX_SIZE >> PAGE_SHIFT) +#endif + +#define POINTER_IS_VALID(p) 0 /* Unimplemented */ +#define POINTER_INVALIDATE(pp) /* Unimplemented */ + +typedef int (*spl_kmem_ctor_t)(void *, void *, int); +typedef void (*spl_kmem_dtor_t)(void *, void *); +typedef void (*spl_kmem_reclaim_t)(void *); + +typedef struct spl_kmem_magazine { + uint32_t skm_magic; /* Sanity 
magic */ + uint32_t skm_avail; /* Available objects */ + uint32_t skm_size; /* Magazine size */ + uint32_t skm_refill; /* Batch refill size */ + struct spl_kmem_cache *skm_cache; /* Owned by cache */ + unsigned long skm_age; /* Last cache access */ + unsigned int skm_cpu; /* Owned by cpu */ + void *skm_objs[0]; /* Object pointers */ +} spl_kmem_magazine_t; + +typedef struct spl_kmem_obj { + uint32_t sko_magic; /* Sanity magic */ + void *sko_addr; /* Buffer address */ + struct spl_kmem_slab *sko_slab; /* Owned by slab */ + struct list_head sko_list; /* Free object list linkage */ +} spl_kmem_obj_t; + +typedef struct spl_kmem_slab { + uint32_t sks_magic; /* Sanity magic */ + uint32_t sks_objs; /* Objects per slab */ + struct spl_kmem_cache *sks_cache; /* Owned by cache */ + struct list_head sks_list; /* Slab list linkage */ + struct list_head sks_free_list; /* Free object list */ + unsigned long sks_age; /* Last modify jiffie */ + uint32_t sks_ref; /* Ref count used objects */ +} spl_kmem_slab_t; + +typedef struct spl_kmem_alloc { + struct spl_kmem_cache *ska_cache; /* Owned by cache */ + int ska_flags; /* Allocation flags */ + taskq_ent_t ska_tqe; /* Task queue entry */ +} spl_kmem_alloc_t; + +typedef struct spl_kmem_emergency { + struct rb_node ske_node; /* Emergency tree linkage */ + unsigned long ske_obj; /* Buffer address */ +} spl_kmem_emergency_t; + +typedef struct spl_kmem_cache { + uint32_t skc_magic; /* Sanity magic */ + uint32_t skc_name_size; /* Name length */ + char *skc_name; /* Name string */ + spl_kmem_magazine_t **skc_mag; /* Per-CPU warm cache */ + uint32_t skc_mag_size; /* Magazine size */ + uint32_t skc_mag_refill; /* Magazine refill count */ + spl_kmem_ctor_t skc_ctor; /* Constructor */ + spl_kmem_dtor_t skc_dtor; /* Destructor */ + spl_kmem_reclaim_t skc_reclaim; /* Reclaimator */ + void *skc_private; /* Private data */ + void *skc_vmp; /* Unused */ + struct kmem_cache *skc_linux_cache; /* Linux slab cache if used */ + unsigned long skc_flags; /* Flags */ + uint32_t skc_obj_size; /* Object size */ + uint32_t skc_obj_align; /* Object alignment */ + uint32_t skc_slab_objs; /* Objects per slab */ + uint32_t skc_slab_size; /* Slab size */ + uint32_t skc_delay; /* Slab reclaim interval */ + uint32_t skc_reap; /* Slab reclaim count */ + atomic_t skc_ref; /* Ref count callers */ + taskqid_t skc_taskqid; /* Slab reclaim task */ + struct list_head skc_list; /* List of caches linkage */ + struct list_head skc_complete_list; /* Completely alloc'ed */ + struct list_head skc_partial_list; /* Partially alloc'ed */ + struct rb_root skc_emergency_tree; /* Min sized objects */ + spinlock_t skc_lock; /* Cache lock */ + spl_wait_queue_head_t skc_waitq; /* Allocation waiters */ + uint64_t skc_slab_fail; /* Slab alloc failures */ + uint64_t skc_slab_create; /* Slab creates */ + uint64_t skc_slab_destroy; /* Slab destroys */ + uint64_t skc_slab_total; /* Slab total current */ + uint64_t skc_slab_alloc; /* Slab alloc current */ + uint64_t skc_slab_max; /* Slab max historic */ + uint64_t skc_obj_total; /* Obj total current */ + uint64_t skc_obj_alloc; /* Obj alloc current */ + uint64_t skc_obj_max; /* Obj max historic */ + uint64_t skc_obj_deadlock; /* Obj emergency deadlocks */ + uint64_t skc_obj_emergency; /* Obj emergency current */ + uint64_t skc_obj_emergency_max; /* Obj emergency max */ +} spl_kmem_cache_t; +#define kmem_cache_t spl_kmem_cache_t + +extern spl_kmem_cache_t *spl_kmem_cache_create(char *name, size_t size, + size_t align, spl_kmem_ctor_t ctor, spl_kmem_dtor_t dtor, + 
spl_kmem_reclaim_t reclaim, void *priv, void *vmp, int flags); +extern void spl_kmem_cache_set_move(spl_kmem_cache_t *, + kmem_cbrc_t (*)(void *, void *, size_t, void *)); +extern void spl_kmem_cache_destroy(spl_kmem_cache_t *skc); +extern void *spl_kmem_cache_alloc(spl_kmem_cache_t *skc, int flags); +extern void spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj); +extern void spl_kmem_cache_set_allocflags(spl_kmem_cache_t *skc, gfp_t flags); +extern void spl_kmem_cache_reap_now(spl_kmem_cache_t *skc, int count); +extern void spl_kmem_reap(void); + +#define kmem_cache_create(name, size, align, ctor, dtor, rclm, priv, vmp, fl) \ + spl_kmem_cache_create(name, size, align, ctor, dtor, rclm, priv, vmp, fl) +#define kmem_cache_set_move(skc, move) spl_kmem_cache_set_move(skc, move) +#define kmem_cache_destroy(skc) spl_kmem_cache_destroy(skc) +#define kmem_cache_alloc(skc, flags) spl_kmem_cache_alloc(skc, flags) +#define kmem_cache_free(skc, obj) spl_kmem_cache_free(skc, obj) +#define kmem_cache_reap_now(skc) \ + spl_kmem_cache_reap_now(skc, skc->skc_reap) +#define kmem_reap() spl_kmem_reap() + +/* + * The following functions are only available for internal use. + */ +extern int spl_kmem_cache_init(void); +extern void spl_kmem_cache_fini(void); + +#endif /* _SPL_KMEM_CACHE_H */ diff --git a/include/spl/sys/kobj.h b/include/spl/sys/kobj.h new file mode 100644 index 000000000..558ec39a8 --- /dev/null +++ b/include/spl/sys/kobj.h @@ -0,0 +1,42 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf <[email protected]>. + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see <http://zfsonlinux.org/>. + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef _SPL_KOBJ_H +#define _SPL_KOBJ_H + +#include <sys/vnode.h> + +typedef struct _buf { + vnode_t *vp; +} _buf_t; + +typedef struct _buf buf_t; + +extern struct _buf *kobj_open_file(const char *name); +extern void kobj_close_file(struct _buf *file); +extern int kobj_read_file(struct _buf *file, char *buf, unsigned size, + unsigned off); +extern int kobj_get_filesize(struct _buf *file, uint64_t *size); + +#endif /* SPL_KOBJ_H */ diff --git a/include/spl/sys/kstat.h b/include/spl/sys/kstat.h new file mode 100644 index 000000000..9170fe24e --- /dev/null +++ b/include/spl/sys/kstat.h @@ -0,0 +1,208 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf <[email protected]>. + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see <http://zfsonlinux.org/>. 
+ * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef _SPL_KSTAT_H +#define _SPL_KSTAT_H + +#include <linux/module.h> +#include <sys/types.h> +#include <sys/time.h> +#include <sys/kmem.h> +#include <sys/mutex.h> +#include <sys/proc.h> + +#define KSTAT_STRLEN 255 +#define KSTAT_RAW_MAX (128*1024) + +/* + * For reference valid classes are: + * disk, tape, net, controller, vm, kvm, hat, streams, kstat, misc + */ + +#define KSTAT_TYPE_RAW 0 /* can be anything; ks_ndata >= 1 */ +#define KSTAT_TYPE_NAMED 1 /* name/value pair; ks_ndata >= 1 */ +#define KSTAT_TYPE_INTR 2 /* interrupt stats; ks_ndata == 1 */ +#define KSTAT_TYPE_IO 3 /* I/O stats; ks_ndata == 1 */ +#define KSTAT_TYPE_TIMER 4 /* event timer; ks_ndata >= 1 */ +#define KSTAT_NUM_TYPES 5 + +#define KSTAT_DATA_CHAR 0 +#define KSTAT_DATA_INT32 1 +#define KSTAT_DATA_UINT32 2 +#define KSTAT_DATA_INT64 3 +#define KSTAT_DATA_UINT64 4 +#define KSTAT_DATA_LONG 5 +#define KSTAT_DATA_ULONG 6 +#define KSTAT_DATA_STRING 7 +#define KSTAT_NUM_DATAS 8 + +#define KSTAT_INTR_HARD 0 +#define KSTAT_INTR_SOFT 1 +#define KSTAT_INTR_WATCHDOG 2 +#define KSTAT_INTR_SPURIOUS 3 +#define KSTAT_INTR_MULTSVC 4 +#define KSTAT_NUM_INTRS 5 + +#define KSTAT_FLAG_VIRTUAL 0x01 +#define KSTAT_FLAG_VAR_SIZE 0x02 +#define KSTAT_FLAG_WRITABLE 0x04 +#define KSTAT_FLAG_PERSISTENT 0x08 +#define KSTAT_FLAG_DORMANT 0x10 +#define KSTAT_FLAG_UNSUPPORTED \ + (KSTAT_FLAG_VAR_SIZE | KSTAT_FLAG_WRITABLE | \ + KSTAT_FLAG_PERSISTENT | KSTAT_FLAG_DORMANT) + + +#define KS_MAGIC 0x9d9d9d9d + +/* Dynamic updates */ +#define KSTAT_READ 0 +#define KSTAT_WRITE 1 + +struct kstat_s; +typedef struct kstat_s kstat_t; + +typedef int kid_t; /* unique kstat id */ +typedef int kstat_update_t(struct kstat_s *, int); /* dynamic update cb */ + +typedef struct kstat_module { + char ksm_name[KSTAT_STRLEN+1]; /* module name */ + struct list_head ksm_module_list; /* module linkage */ + struct list_head ksm_kstat_list; /* list of kstat entries */ + struct proc_dir_entry *ksm_proc; /* proc entry */ +} kstat_module_t; + +typedef struct kstat_raw_ops { + int (*headers)(char *buf, size_t size); + int (*data)(char *buf, size_t size, void *data); + void *(*addr)(kstat_t *ksp, loff_t index); +} kstat_raw_ops_t; + +struct kstat_s { + int ks_magic; /* magic value */ + kid_t ks_kid; /* unique kstat ID */ + hrtime_t ks_crtime; /* creation time */ + hrtime_t ks_snaptime; /* last access time */ + char ks_module[KSTAT_STRLEN+1]; /* provider module name */ + int ks_instance; /* provider module instance */ + char ks_name[KSTAT_STRLEN+1]; /* kstat name */ + char ks_class[KSTAT_STRLEN+1]; /* kstat class */ + uchar_t ks_type; /* kstat data type */ + uchar_t ks_flags; /* kstat flags */ + void *ks_data; /* kstat type-specific data */ + uint_t ks_ndata; /* # of data records */ + size_t ks_data_size; /* size of kstat data section */ + struct proc_dir_entry *ks_proc; /* proc linkage */ + kstat_update_t *ks_update; /* dynamic updates */ + 
void *ks_private; /* private data */ + kmutex_t ks_private_lock; /* kstat private data lock */ + kmutex_t *ks_lock; /* kstat data lock */ + struct list_head ks_list; /* kstat linkage */ + kstat_module_t *ks_owner; /* kstat module linkage */ + kstat_raw_ops_t ks_raw_ops; /* ops table for raw type */ + char *ks_raw_buf; /* buf used for raw ops */ + size_t ks_raw_bufsize; /* size of raw ops buffer */ +}; + +typedef struct kstat_named_s { + char name[KSTAT_STRLEN]; /* name of counter */ + uchar_t data_type; /* data type */ + union { + char c[16]; /* 128-bit int */ + int32_t i32; /* 32-bit signed int */ + uint32_t ui32; /* 32-bit unsigned int */ + int64_t i64; /* 64-bit signed int */ + uint64_t ui64; /* 64-bit unsigned int */ + long l; /* native signed long */ + ulong_t ul; /* native unsigned long */ + struct { + union { + char *ptr; /* NULL-term string */ + char __pad[8]; /* 64-bit padding */ + } addr; + uint32_t len; /* # bytes for strlen + '\0' */ + } string; + } value; +} kstat_named_t; + +#define KSTAT_NAMED_STR_PTR(knptr) ((knptr)->value.string.addr.ptr) +#define KSTAT_NAMED_STR_BUFLEN(knptr) ((knptr)->value.string.len) + +typedef struct kstat_intr { + uint_t intrs[KSTAT_NUM_INTRS]; +} kstat_intr_t; + +typedef struct kstat_io { + u_longlong_t nread; /* number of bytes read */ + u_longlong_t nwritten; /* number of bytes written */ + uint_t reads; /* number of read operations */ + uint_t writes; /* number of write operations */ + hrtime_t wtime; /* cumulative wait (pre-service) time */ + hrtime_t wlentime; /* cumulative wait len*time product */ + hrtime_t wlastupdate; /* last time wait queue changed */ + hrtime_t rtime; /* cumulative run (service) time */ + hrtime_t rlentime; /* cumulative run length*time product */ + hrtime_t rlastupdate; /* last time run queue changed */ + uint_t wcnt; /* count of elements in wait state */ + uint_t rcnt; /* count of elements in run state */ +} kstat_io_t; + +typedef struct kstat_timer { + char name[KSTAT_STRLEN+1]; /* event name */ + u_longlong_t num_events; /* number of events */ + hrtime_t elapsed_time; /* cumulative elapsed time */ + hrtime_t min_time; /* shortest event duration */ + hrtime_t max_time; /* longest event duration */ + hrtime_t start_time; /* previous event start time */ + hrtime_t stop_time; /* previous event stop time */ +} kstat_timer_t; + +int spl_kstat_init(void); +void spl_kstat_fini(void); + +extern void __kstat_set_raw_ops(kstat_t *ksp, + int (*headers)(char *buf, size_t size), + int (*data)(char *buf, size_t size, void *data), + void* (*addr)(kstat_t *ksp, loff_t index)); + +extern kstat_t *__kstat_create(const char *ks_module, int ks_instance, + const char *ks_name, const char *ks_class, uchar_t ks_type, + uint_t ks_ndata, uchar_t ks_flags); + +extern void __kstat_install(kstat_t *ksp); +extern void __kstat_delete(kstat_t *ksp); +extern void kstat_waitq_enter(kstat_io_t *); +extern void kstat_waitq_exit(kstat_io_t *); +extern void kstat_runq_enter(kstat_io_t *); +extern void kstat_runq_exit(kstat_io_t *); + +#define kstat_set_raw_ops(k, h, d, a) \ + __kstat_set_raw_ops(k, h, d, a) +#define kstat_create(m, i, n, c, t, s, f) \ + __kstat_create(m, i, n, c, t, s, f) + +#define kstat_install(k) __kstat_install(k) +#define kstat_delete(k) __kstat_delete(k) + +#endif /* _SPL_KSTAT_H */ diff --git a/include/spl/sys/list.h b/include/spl/sys/list.h new file mode 100644 index 000000000..74b784e93 --- /dev/null +++ b/include/spl/sys/list.h @@ -0,0 +1,208 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. 
+ * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf <[email protected]>. + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see <http://zfsonlinux.org/>. + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef _SPL_LIST_H +#define _SPL_LIST_H + +#include <sys/types.h> +#include <linux/list.h> + +/* + * NOTE: I have implemented the Solaris list API in terms of the native + * linux API. This has certain advantages in terms of leveraging the linux + * list debugging infrastructure, but it also means that the internals of a + * list differ slightly than on Solaris. This is not a problem as long as + * all callers stick to the published API. The two major differences are: + * + * 1) A list_node_t is mapped to a linux list_head struct which changes + * the name of the list_next/list_prev pointers to next/prev respectively. + * + * 2) A list_node_t which is not attached to a list on Solaris is denoted + * by having its list_next/list_prev pointers set to NULL. Under linux + * the next/prev pointers are set to LIST_POISON1 and LIST_POISON2 + * respectively. At this moment this only impacts the implementation + * of the list_link_init() and list_link_active() functions. 
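 *
 * (Editorial usage sketch; my_node_t and mn_link are hypothetical names:
 *
 *	typedef struct my_node {
 *		int		mn_value;
 *		list_node_t	mn_link;
 *	} my_node_t;
 *
 *	list_t l;
 *	my_node_t *n;
 *
 *	list_create(&l, sizeof (my_node_t), offsetof(my_node_t, mn_link));
 *	list_insert_tail(&l, n);
 *	while ((n = list_remove_head(&l)) != NULL)
 *		...;
 *	list_destroy(&l);
 * )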
+ */ + +typedef struct list_head list_node_t; + +typedef struct list { + size_t list_size; + size_t list_offset; + list_node_t list_head; +} list_t; + +#define list_d2l(a, obj) ((list_node_t *)(((char *)obj) + (a)->list_offset)) +#define list_object(a, node) ((void *)(((char *)node) - (a)->list_offset)) + +static inline int +list_is_empty(list_t *list) +{ + return (list_empty(&list->list_head)); +} + +static inline void +list_link_init(list_node_t *node) +{ + node->next = LIST_POISON1; + node->prev = LIST_POISON2; +} + +static inline void +list_create(list_t *list, size_t size, size_t offset) +{ + list->list_size = size; + list->list_offset = offset; + INIT_LIST_HEAD(&list->list_head); +} + +static inline void +list_destroy(list_t *list) +{ + list_del(&list->list_head); +} + +static inline void +list_insert_head(list_t *list, void *object) +{ + list_add(list_d2l(list, object), &list->list_head); +} + +static inline void +list_insert_tail(list_t *list, void *object) +{ + list_add_tail(list_d2l(list, object), &list->list_head); +} + +static inline void +list_insert_after(list_t *list, void *object, void *nobject) +{ + if (object == NULL) + list_insert_head(list, nobject); + else + list_add(list_d2l(list, nobject), list_d2l(list, object)); +} + +static inline void +list_insert_before(list_t *list, void *object, void *nobject) +{ + if (object == NULL) + list_insert_tail(list, nobject); + else + list_add_tail(list_d2l(list, nobject), list_d2l(list, object)); +} + +static inline void +list_remove(list_t *list, void *object) +{ + list_del(list_d2l(list, object)); +} + +static inline void * +list_remove_head(list_t *list) +{ + list_node_t *head = list->list_head.next; + if (head == &list->list_head) + return (NULL); + + list_del(head); + return (list_object(list, head)); +} + +static inline void * +list_remove_tail(list_t *list) +{ + list_node_t *tail = list->list_head.prev; + if (tail == &list->list_head) + return (NULL); + + list_del(tail); + return (list_object(list, tail)); +} + +static inline void * +list_head(list_t *list) +{ + if (list_is_empty(list)) + return (NULL); + + return (list_object(list, list->list_head.next)); +} + +static inline void * +list_tail(list_t *list) +{ + if (list_is_empty(list)) + return (NULL); + + return (list_object(list, list->list_head.prev)); +} + +static inline void * +list_next(list_t *list, void *object) +{ + list_node_t *node = list_d2l(list, object); + + if (node->next != &list->list_head) + return (list_object(list, node->next)); + + return (NULL); +} + +static inline void * +list_prev(list_t *list, void *object) +{ + list_node_t *node = list_d2l(list, object); + + if (node->prev != &list->list_head) + return (list_object(list, node->prev)); + + return (NULL); +} + +static inline int +list_link_active(list_node_t *node) +{ + return (node->next != LIST_POISON1) && (node->prev != LIST_POISON2); +} + +static inline void +spl_list_move_tail(list_t *dst, list_t *src) +{ + list_splice_init(&src->list_head, dst->list_head.prev); +} + +#define list_move_tail(dst, src) spl_list_move_tail(dst, src) + +static inline void +list_link_replace(list_node_t *old_node, list_node_t *new_node) +{ + new_node->next = old_node->next; + new_node->prev = old_node->prev; + old_node->prev->next = new_node; + old_node->next->prev = new_node; + list_link_init(old_node); +} + +#endif /* SPL_LIST_H */ diff --git a/include/spl/sys/mode.h b/include/spl/sys/mode.h new file mode 100644 index 000000000..02802d0d4 --- /dev/null +++ b/include/spl/sys/mode.h @@ -0,0 +1,32 @@ +/* + * Copyright 
(C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf <[email protected]>. + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see <http://zfsonlinux.org/>. + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef _SPL_MODE_H +#define _SPL_MODE_H + +#define IFTOVT(mode) vn_mode_to_vtype(mode) +#define VTTOIF(vtype) vn_vtype_to_mode(vtype) +#define MAKEIMODE(T, M) (VTTOIF(T) | ((M) & ~S_IFMT)) + +#endif /* SPL_MODE_H */ diff --git a/include/spl/sys/mutex.h b/include/spl/sys/mutex.h new file mode 100644 index 000000000..f906d49d4 --- /dev/null +++ b/include/spl/sys/mutex.h @@ -0,0 +1,184 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf <[email protected]>. + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see <http://zfsonlinux.org/>. + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#ifndef _SPL_MUTEX_H +#define _SPL_MUTEX_H + +#include <sys/types.h> +#include <linux/mutex.h> +#include <linux/lockdep.h> + +typedef enum { + MUTEX_DEFAULT = 0, + MUTEX_SPIN = 1, + MUTEX_ADAPTIVE = 2, + MUTEX_NOLOCKDEP = 3 +} kmutex_type_t; + +typedef struct { + struct mutex m_mutex; + spinlock_t m_lock; /* used for serializing mutex_exit */ + kthread_t *m_owner; +#ifdef CONFIG_LOCKDEP + kmutex_type_t m_type; +#endif /* CONFIG_LOCKDEP */ +} kmutex_t; + +#define MUTEX(mp) (&((mp)->m_mutex)) + +static inline void +spl_mutex_set_owner(kmutex_t *mp) +{ + mp->m_owner = current; +} + +static inline void +spl_mutex_clear_owner(kmutex_t *mp) +{ + mp->m_owner = NULL; +} + +#define mutex_owner(mp) (ACCESS_ONCE((mp)->m_owner)) +#define mutex_owned(mp) (mutex_owner(mp) == current) +#define MUTEX_HELD(mp) mutex_owned(mp) +#define MUTEX_NOT_HELD(mp) (!MUTEX_HELD(mp)) + +#ifdef CONFIG_LOCKDEP +static inline void +spl_mutex_set_type(kmutex_t *mp, kmutex_type_t type) +{ + mp->m_type = type; +} +static inline void +spl_mutex_lockdep_off_maybe(kmutex_t *mp) \ +{ \ + if (mp && mp->m_type == MUTEX_NOLOCKDEP) \ + lockdep_off(); \ +} +static inline void +spl_mutex_lockdep_on_maybe(kmutex_t *mp) \ +{ \ + if (mp && mp->m_type == MUTEX_NOLOCKDEP) \ + lockdep_on(); \ +} +#else /* CONFIG_LOCKDEP */ +#define spl_mutex_set_type(mp, type) +#define spl_mutex_lockdep_off_maybe(mp) +#define spl_mutex_lockdep_on_maybe(mp) +#endif /* CONFIG_LOCKDEP */ + +/* + * The following functions must be a #define and not static inline. + * This ensures that the native linux mutex functions (lock/unlock) + * will be correctly located in the users code which is important + * for the built in kernel lock analysis tools + */ +#undef mutex_init +#define mutex_init(mp, name, type, ibc) \ +{ \ + static struct lock_class_key __key; \ + ASSERT(type == MUTEX_DEFAULT || type == MUTEX_NOLOCKDEP); \ + \ + __mutex_init(MUTEX(mp), (name) ? (#name) : (#mp), &__key); \ + spin_lock_init(&(mp)->m_lock); \ + spl_mutex_clear_owner(mp); \ + spl_mutex_set_type(mp, type); \ +} + +#undef mutex_destroy +#define mutex_destroy(mp) \ +{ \ + VERIFY3P(mutex_owner(mp), ==, NULL); \ +} + +/* BEGIN CSTYLED */ +#define mutex_tryenter(mp) \ +({ \ + int _rc_; \ + \ + spl_mutex_lockdep_off_maybe(mp); \ + if ((_rc_ = mutex_trylock(MUTEX(mp))) == 1) \ + spl_mutex_set_owner(mp); \ + spl_mutex_lockdep_on_maybe(mp); \ + \ + _rc_; \ +}) +/* END CSTYLED */ + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +#define mutex_enter_nested(mp, subclass) \ +{ \ + ASSERT3P(mutex_owner(mp), !=, current); \ + spl_mutex_lockdep_off_maybe(mp); \ + mutex_lock_nested(MUTEX(mp), (subclass)); \ + spl_mutex_lockdep_on_maybe(mp); \ + spl_mutex_set_owner(mp); \ +} +#else /* CONFIG_DEBUG_LOCK_ALLOC */ +#define mutex_enter_nested(mp, subclass) \ +{ \ + ASSERT3P(mutex_owner(mp), !=, current); \ + spl_mutex_lockdep_off_maybe(mp); \ + mutex_lock(MUTEX(mp)); \ + spl_mutex_lockdep_on_maybe(mp); \ + spl_mutex_set_owner(mp); \ +} +#endif /* CONFIG_DEBUG_LOCK_ALLOC */ + +#define mutex_enter(mp) mutex_enter_nested((mp), 0) + +/* + * The reason for the spinlock: + * + * The Linux mutex is designed with a fast-path/slow-path design such that it + * does not guarantee serialization upon itself, allowing a race where latter + * acquirers finish mutex_unlock before former ones. 
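 *
 * (Editorial illustration of the interleaving, assuming the mutex is
 * embedded in an object "obj" whose last user frees it:
 *
 *	thread A: mutex_enter(&obj->lock); ...; mutex_exit(&obj->lock)
 *	          and is still inside the Linux slow-path unlock, when
 *	thread B: mutex_enter(&obj->lock); mutex_exit(&obj->lock);
 *	          kmem_free(obj, sizeof (*obj));
 * )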
+ * + * The race renders it unsafe to be used for serializing the freeing of an + * object in which the mutex is embedded, where the latter acquirer could go + * on to free the object while the former one is still doing mutex_unlock and + * causing memory corruption. + * + * However, there are many places in ZFS where the mutex is used for + * serializing object freeing, and the code is shared among other OSes without + * this issue. Thus, we need the spinlock to force the serialization on + * mutex_exit(). + * + * See http://lwn.net/Articles/575477/ for the information about the race. + */ +#define mutex_exit(mp) \ +{ \ + spl_mutex_clear_owner(mp); \ + spin_lock(&(mp)->m_lock); \ + spl_mutex_lockdep_off_maybe(mp); \ + mutex_unlock(MUTEX(mp)); \ + spl_mutex_lockdep_on_maybe(mp); \ + spin_unlock(&(mp)->m_lock); \ + /* NOTE: do not dereference mp after this point */ \ +} + +int spl_mutex_init(void); +void spl_mutex_fini(void); + +#endif /* _SPL_MUTEX_H */ diff --git a/include/spl/sys/param.h b/include/spl/sys/param.h new file mode 100644 index 000000000..4ef929151 --- /dev/null +++ b/include/spl/sys/param.h @@ -0,0 +1,36 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf <[email protected]>. + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see <http://zfsonlinux.org/>. + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef _SPL_PARAM_H +#define _SPL_PARAM_H + +#include <asm/page.h> + +/* Pages to bytes and back */ +#define ptob(pages) ((pages) << PAGE_SHIFT) +#define btop(bytes) ((bytes) >> PAGE_SHIFT) + +#define MAXUID UINT32_MAX + +#endif /* SPL_PARAM_H */ diff --git a/include/spl/sys/proc.h b/include/spl/sys/proc.h new file mode 100644 index 000000000..287683920 --- /dev/null +++ b/include/spl/sys/proc.h @@ -0,0 +1,35 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf <[email protected]>. + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see <http://zfsonlinux.org/>. + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. 
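For reference, a minimal sketch of how the kmutex_t wrappers defined above are normally consumed (the foo_t type and function names are hypothetical; ASSERT comes from the SPL's sys/debug.h):

    /* Hypothetical object protected by an embedded kmutex_t. */
    typedef struct foo {
            kmutex_t        f_lock;
            uint64_t        f_count;
    } foo_t;

    static void
    foo_init(foo_t *fp)
    {
            mutex_init(&fp->f_lock, NULL, MUTEX_DEFAULT, NULL);
            fp->f_count = 0;
    }

    static void
    foo_bump(foo_t *fp)
    {
            mutex_enter(&fp->f_lock);
            ASSERT(MUTEX_HELD(&fp->f_lock));
            fp->f_count++;
            mutex_exit(&fp->f_lock);    /* do not touch the mutex after this */
    }

    static void
    foo_fini(foo_t *fp)
    {
            mutex_destroy(&fp->f_lock); /* verifies no owner remains */
    }

Because mutex_exit() takes the internal m_lock spinlock, a thread that acquires the mutex immediately after foo_bump() drops it may safely go on to free the object; the exiting thread itself must not dereference the mutex again, as noted above.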
+ * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef _SPL_PROC_H +#define _SPL_PROC_H + +#include <linux/proc_fs.h> + +extern struct proc_dir_entry *proc_spl_kstat; + +int spl_proc_init(void); +void spl_proc_fini(void); + +#endif /* SPL_PROC_H */ diff --git a/include/spl/sys/processor.h b/include/spl/sys/processor.h new file mode 100644 index 000000000..a70101fa2 --- /dev/null +++ b/include/spl/sys/processor.h @@ -0,0 +1,32 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf <[email protected]>. + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see <http://zfsonlinux.org/>. + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef _SPL_PROCESSOR_H +#define _SPL_PROCESSOR_H + +#define getcpuid() smp_processor_id() + +typedef int processorid_t; + +#endif /* _SPL_PROCESSOR_H */ diff --git a/include/spl/sys/random.h b/include/spl/sys/random.h new file mode 100644 index 000000000..93e244f56 --- /dev/null +++ b/include/spl/sys/random.h @@ -0,0 +1,40 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf <[email protected]>. + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see <http://zfsonlinux.org/>. + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef _SPL_RANDOM_H +#define _SPL_RANDOM_H + +#include <linux/module.h> +#include <linux/random.h> + +static __inline__ int +random_get_bytes(uint8_t *ptr, size_t len) +{ + get_random_bytes((void *)ptr, (int)len); + return (0); +} + +extern int random_get_pseudo_bytes(uint8_t *ptr, size_t len); + +#endif /* _SPL_RANDOM_H */ diff --git a/include/spl/sys/rwlock.h b/include/spl/sys/rwlock.h new file mode 100644 index 000000000..b44ceab66 --- /dev/null +++ b/include/spl/sys/rwlock.h @@ -0,0 +1,273 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. 
+ * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf <[email protected]>. + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see <http://zfsonlinux.org/>. + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef _SPL_RWLOCK_H +#define _SPL_RWLOCK_H + +#include <sys/types.h> +#include <linux/rwsem.h> + +/* Linux kernel compatibility */ +#if defined(CONFIG_PREEMPT_RT_FULL) +#define SPL_RWSEM_SINGLE_READER_VALUE (1) +#define SPL_RWSEM_SINGLE_WRITER_VALUE (0) +#elif defined(CONFIG_RWSEM_GENERIC_SPINLOCK) +#define SPL_RWSEM_SINGLE_READER_VALUE (1) +#define SPL_RWSEM_SINGLE_WRITER_VALUE (-1) +#else +#define SPL_RWSEM_SINGLE_READER_VALUE (RWSEM_ACTIVE_READ_BIAS) +#define SPL_RWSEM_SINGLE_WRITER_VALUE (RWSEM_ACTIVE_WRITE_BIAS) +#endif + +/* Linux 3.16 changed activity to count for rwsem-spinlock */ +#if defined(CONFIG_PREEMPT_RT_FULL) +#define RWSEM_COUNT(sem) sem->read_depth +#elif defined(HAVE_RWSEM_ACTIVITY) +#define RWSEM_COUNT(sem) sem->activity +/* Linux 4.8 changed count to an atomic_long_t for !rwsem-spinlock */ +#elif defined(HAVE_RWSEM_ATOMIC_LONG_COUNT) +#define RWSEM_COUNT(sem) atomic_long_read(&(sem)->count) +#else +#define RWSEM_COUNT(sem) sem->count +#endif + +#if defined(RWSEM_SPINLOCK_IS_RAW) +#define spl_rwsem_lock_irqsave(lk, fl) raw_spin_lock_irqsave(lk, fl) +#define spl_rwsem_unlock_irqrestore(lk, fl) \ + raw_spin_unlock_irqrestore(lk, fl) +#define spl_rwsem_trylock_irqsave(lk, fl) raw_spin_trylock_irqsave(lk, fl) +#else +#define spl_rwsem_lock_irqsave(lk, fl) spin_lock_irqsave(lk, fl) +#define spl_rwsem_unlock_irqrestore(lk, fl) spin_unlock_irqrestore(lk, fl) +#define spl_rwsem_trylock_irqsave(lk, fl) spin_trylock_irqsave(lk, fl) +#endif /* RWSEM_SPINLOCK_IS_RAW */ + +#define spl_rwsem_is_locked(rwsem) rwsem_is_locked(rwsem) + +typedef enum { + RW_DRIVER = 2, + RW_DEFAULT = 4, + RW_NOLOCKDEP = 5 +} krw_type_t; + +typedef enum { + RW_NONE = 0, + RW_WRITER = 1, + RW_READER = 2 +} krw_t; + +/* + * If CONFIG_RWSEM_SPIN_ON_OWNER is defined, rw_semaphore will have an owner + * field, so we don't need our own. + */ +typedef struct { + struct rw_semaphore rw_rwlock; +#ifndef CONFIG_RWSEM_SPIN_ON_OWNER + kthread_t *rw_owner; +#endif +#ifdef CONFIG_LOCKDEP + krw_type_t rw_type; +#endif /* CONFIG_LOCKDEP */ +} krwlock_t; + +#define SEM(rwp) (&(rwp)->rw_rwlock) + +static inline void +spl_rw_set_owner(krwlock_t *rwp) +{ +/* + * If CONFIG_RWSEM_SPIN_ON_OWNER is defined, down_write, up_write, + * downgrade_write and __init_rwsem will set/clear owner for us. 
+ */ +#ifndef CONFIG_RWSEM_SPIN_ON_OWNER + rwp->rw_owner = current; +#endif +} + +static inline void +spl_rw_clear_owner(krwlock_t *rwp) +{ +#ifndef CONFIG_RWSEM_SPIN_ON_OWNER + rwp->rw_owner = NULL; +#endif +} + +static inline kthread_t * +rw_owner(krwlock_t *rwp) +{ +#ifdef CONFIG_RWSEM_SPIN_ON_OWNER + return (SEM(rwp)->owner); +#else + return (rwp->rw_owner); +#endif +} + +#ifdef CONFIG_LOCKDEP +static inline void +spl_rw_set_type(krwlock_t *rwp, krw_type_t type) +{ + rwp->rw_type = type; +} +static inline void +spl_rw_lockdep_off_maybe(krwlock_t *rwp) \ +{ \ + if (rwp && rwp->rw_type == RW_NOLOCKDEP) \ + lockdep_off(); \ +} +static inline void +spl_rw_lockdep_on_maybe(krwlock_t *rwp) \ +{ \ + if (rwp && rwp->rw_type == RW_NOLOCKDEP) \ + lockdep_on(); \ +} +#else /* CONFIG_LOCKDEP */ +#define spl_rw_set_type(rwp, type) +#define spl_rw_lockdep_off_maybe(rwp) +#define spl_rw_lockdep_on_maybe(rwp) +#endif /* CONFIG_LOCKDEP */ + +static inline int +RW_READ_HELD(krwlock_t *rwp) +{ + /* + * Linux 4.8 will set owner to 1 when read held instead of leave it + * NULL. So we check whether owner <= 1. + */ + return (spl_rwsem_is_locked(SEM(rwp)) && + (unsigned long)rw_owner(rwp) <= 1); +} + +static inline int +RW_WRITE_HELD(krwlock_t *rwp) +{ + return (rw_owner(rwp) == current); +} + +static inline int +RW_LOCK_HELD(krwlock_t *rwp) +{ + return (spl_rwsem_is_locked(SEM(rwp))); +} + +/* + * The following functions must be a #define and not static inline. + * This ensures that the native linux semaphore functions (down/up) + * will be correctly located in the users code which is important + * for the built in kernel lock analysis tools + */ +/* BEGIN CSTYLED */ +#define rw_init(rwp, name, type, arg) \ +({ \ + static struct lock_class_key __key; \ + ASSERT(type == RW_DEFAULT || type == RW_NOLOCKDEP); \ + \ + __init_rwsem(SEM(rwp), #rwp, &__key); \ + spl_rw_clear_owner(rwp); \ + spl_rw_set_type(rwp, type); \ +}) + +#define rw_destroy(rwp) \ +({ \ + VERIFY(!RW_LOCK_HELD(rwp)); \ +}) + +#define rw_tryenter(rwp, rw) \ +({ \ + int _rc_ = 0; \ + \ + spl_rw_lockdep_off_maybe(rwp); \ + switch (rw) { \ + case RW_READER: \ + _rc_ = down_read_trylock(SEM(rwp)); \ + break; \ + case RW_WRITER: \ + if ((_rc_ = down_write_trylock(SEM(rwp)))) \ + spl_rw_set_owner(rwp); \ + break; \ + default: \ + VERIFY(0); \ + } \ + spl_rw_lockdep_on_maybe(rwp); \ + _rc_; \ +}) + +#define rw_enter(rwp, rw) \ +({ \ + spl_rw_lockdep_off_maybe(rwp); \ + switch (rw) { \ + case RW_READER: \ + down_read(SEM(rwp)); \ + break; \ + case RW_WRITER: \ + down_write(SEM(rwp)); \ + spl_rw_set_owner(rwp); \ + break; \ + default: \ + VERIFY(0); \ + } \ + spl_rw_lockdep_on_maybe(rwp); \ +}) + +#define rw_exit(rwp) \ +({ \ + spl_rw_lockdep_off_maybe(rwp); \ + if (RW_WRITE_HELD(rwp)) { \ + spl_rw_clear_owner(rwp); \ + up_write(SEM(rwp)); \ + } else { \ + ASSERT(RW_READ_HELD(rwp)); \ + up_read(SEM(rwp)); \ + } \ + spl_rw_lockdep_on_maybe(rwp); \ +}) + +#define rw_downgrade(rwp) \ +({ \ + spl_rw_lockdep_off_maybe(rwp); \ + spl_rw_clear_owner(rwp); \ + downgrade_write(SEM(rwp)); \ + spl_rw_lockdep_on_maybe(rwp); \ +}) + +#define rw_tryupgrade(rwp) \ +({ \ + int _rc_ = 0; \ + \ + if (RW_WRITE_HELD(rwp)) { \ + _rc_ = 1; \ + } else { \ + spl_rw_lockdep_off_maybe(rwp); \ + if ((_rc_ = rwsem_tryupgrade(SEM(rwp)))) \ + spl_rw_set_owner(rwp); \ + spl_rw_lockdep_on_maybe(rwp); \ + } \ + _rc_; \ +}) +/* END CSTYLED */ + +int spl_rw_init(void); +void spl_rw_fini(void); +int rwsem_tryupgrade(struct rw_semaphore *rwsem); + +#endif /* _SPL_RWLOCK_H */ diff --git 
a/include/spl/sys/shrinker.h b/include/spl/sys/shrinker.h new file mode 100644 index 000000000..28c1fa78c --- /dev/null +++ b/include/spl/sys/shrinker.h @@ -0,0 +1,209 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf <[email protected]>. + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see <http://zfsonlinux.org/>. + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef _SPL_SHRINKER_H +#define _SPL_SHRINKER_H + +#include <linux/mm.h> +#include <linux/fs.h> + +#if !defined(HAVE_SHRINK_CONTROL_STRUCT) +struct shrink_control { + gfp_t gfp_mask; + unsigned long nr_to_scan; +}; +#endif /* HAVE_SHRINK_CONTROL_STRUCT */ + +/* + * Due to frequent changes in the shrinker API the following + * compatibility wrappers should be used. They are as follows: + * + * SPL_SHRINKER_DECLARE is used to declare the shrinker which is + * passed to spl_register_shrinker()/spl_unregister_shrinker(). Use + * shrinker_name to set the shrinker variable name, shrinker_callback + * to set the callback function, and seek_cost to define the cost of + * reclaiming an object. + * + * SPL_SHRINKER_DECLARE(shrinker_name, shrinker_callback, seek_cost); + * + * SPL_SHRINKER_CALLBACK_FWD_DECLARE is used when a forward declaration + * of the shrinker callback function is required. Only the callback + * function needs to be passed. + * + * SPL_SHRINKER_CALLBACK_FWD_DECLARE(shrinker_callback); + * + * SPL_SHRINKER_CALLBACK_WRAPPER is used to declare the callback function + * which is registered with the shrinker. This function will call your + * custom shrinker which must use the following prototype. Notice the + * leading __'s, these must be appended to the callback_function name. + * + * int __shrinker_callback(struct shrinker *, struct shrink_control *) + * SPL_SHRINKER_CALLBACK_WRAPPER(shrinker_callback);a + * + * + * Example: + * + * SPL_SHRINKER_CALLBACK_FWD_DECLARE(my_shrinker_fn); + * SPL_SHRINKER_DECLARE(my_shrinker, my_shrinker_fn, 1); + * + * static int + * __my_shrinker_fn(struct shrinker *shrink, struct shrink_control *sc) + * { + * if (sc->nr_to_scan) { + * ...scan objects in the cache and reclaim them... + * } + * + * ...calculate number of objects in the cache... + * + * return (number of objects in the cache); + * } + * SPL_SHRINKER_CALLBACK_WRAPPER(my_shrinker_fn); + */ + +#define spl_register_shrinker(x) register_shrinker(x) +#define spl_unregister_shrinker(x) unregister_shrinker(x) + +/* + * Linux 2.6.23 - 2.6.34 Shrinker API Compatibility. 
+ */ +#if defined(HAVE_2ARGS_OLD_SHRINKER_CALLBACK) +#define SPL_SHRINKER_DECLARE(s, x, y) \ +static struct shrinker s = { \ + .shrink = x, \ + .seeks = y \ +} + +#define SPL_SHRINKER_CALLBACK_FWD_DECLARE(fn) \ +static int fn(int nr_to_scan, unsigned int gfp_mask) + +#define SPL_SHRINKER_CALLBACK_WRAPPER(fn) \ +static int \ +fn(int nr_to_scan, unsigned int gfp_mask) \ +{ \ + struct shrink_control sc; \ + \ + sc.nr_to_scan = nr_to_scan; \ + sc.gfp_mask = gfp_mask; \ + \ + return (__ ## fn(NULL, &sc)); \ +} + +/* + * Linux 2.6.35 to 2.6.39 Shrinker API Compatibility. + */ +#elif defined(HAVE_3ARGS_SHRINKER_CALLBACK) +#define SPL_SHRINKER_DECLARE(s, x, y) \ +static struct shrinker s = { \ + .shrink = x, \ + .seeks = y \ +} + +#define SPL_SHRINKER_CALLBACK_FWD_DECLARE(fn) \ +static int fn(struct shrinker *, int, unsigned int) + +#define SPL_SHRINKER_CALLBACK_WRAPPER(fn) \ +static int \ +fn(struct shrinker *shrink, int nr_to_scan, unsigned int gfp_mask) \ +{ \ + struct shrink_control sc; \ + \ + sc.nr_to_scan = nr_to_scan; \ + sc.gfp_mask = gfp_mask; \ + \ + return (__ ## fn(shrink, &sc)); \ +} + +/* + * Linux 3.0 to 3.11 Shrinker API Compatibility. + */ +#elif defined(HAVE_2ARGS_NEW_SHRINKER_CALLBACK) +#define SPL_SHRINKER_DECLARE(s, x, y) \ +static struct shrinker s = { \ + .shrink = x, \ + .seeks = y \ +} + +#define SPL_SHRINKER_CALLBACK_FWD_DECLARE(fn) \ +static int fn(struct shrinker *, struct shrink_control *) + +#define SPL_SHRINKER_CALLBACK_WRAPPER(fn) \ +static int \ +fn(struct shrinker *shrink, struct shrink_control *sc) \ +{ \ + return (__ ## fn(shrink, sc)); \ +} + +/* + * Linux 3.12 and later Shrinker API Compatibility. + */ +#elif defined(HAVE_SPLIT_SHRINKER_CALLBACK) +#define SPL_SHRINKER_DECLARE(s, x, y) \ +static struct shrinker s = { \ + .count_objects = x ## _count_objects, \ + .scan_objects = x ## _scan_objects, \ + .seeks = y \ +} + +#define SPL_SHRINKER_CALLBACK_FWD_DECLARE(fn) \ +static unsigned long fn ## _count_objects(struct shrinker *, \ + struct shrink_control *); \ +static unsigned long fn ## _scan_objects(struct shrinker *, \ + struct shrink_control *) + +#define SPL_SHRINKER_CALLBACK_WRAPPER(fn) \ +static unsigned long \ +fn ## _count_objects(struct shrinker *shrink, struct shrink_control *sc)\ +{ \ + int __ret__; \ + \ + sc->nr_to_scan = 0; \ + __ret__ = __ ## fn(NULL, sc); \ + \ + /* Errors may not be returned and must be converted to zeros */ \ + return ((__ret__ < 0) ? 0 : __ret__); \ +} \ + \ +static unsigned long \ +fn ## _scan_objects(struct shrinker *shrink, struct shrink_control *sc) \ +{ \ + int __ret__; \ + \ + __ret__ = __ ## fn(NULL, sc); \ + return ((__ret__ < 0) ? SHRINK_STOP : __ret__); \ +} +#else +/* + * Linux 2.x to 2.6.22, or a newer shrinker API has been introduced. + */ +#error "Unknown shrinker callback" +#endif + +#if defined(HAVE_SPLIT_SHRINKER_CALLBACK) +typedef unsigned long spl_shrinker_t; +#else +typedef int spl_shrinker_t; +#define SHRINK_STOP (-1) +#endif + +#endif /* SPL_SHRINKER_H */ diff --git a/include/spl/sys/sid.h b/include/spl/sys/sid.h new file mode 100644 index 000000000..731b62c47 --- /dev/null +++ b/include/spl/sys/sid.h @@ -0,0 +1,61 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf <[email protected]>. + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. 
+ * For details, see <http://zfsonlinux.org/>. + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef _SPL_SID_H +#define _SPL_SID_H + +typedef struct ksiddomain { + char *kd_name; +} ksiddomain_t; + +typedef enum ksid_index { + KSID_USER, + KSID_GROUP, + KSID_OWNER, + KSID_COUNT +} ksid_index_t; + +typedef int ksid_t; + +static inline ksiddomain_t * +ksid_lookupdomain(const char *dom) +{ + ksiddomain_t *kd; + int len = strlen(dom); + + kd = kmem_zalloc(sizeof (ksiddomain_t), KM_SLEEP); + kd->kd_name = kmem_zalloc(len + 1, KM_SLEEP); + memcpy(kd->kd_name, dom, len); + + return (kd); +} + +static inline void +ksiddomain_rele(ksiddomain_t *ksid) +{ + kmem_free(ksid->kd_name, strlen(ksid->kd_name) + 1); + kmem_free(ksid, sizeof (ksiddomain_t)); +} + +#endif /* _SPL_SID_H */ diff --git a/include/spl/sys/signal.h b/include/spl/sys/signal.h new file mode 100644 index 000000000..36b8b5d98 --- /dev/null +++ b/include/spl/sys/signal.h @@ -0,0 +1,55 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf <[email protected]>. + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see <http://zfsonlinux.org/>. + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef _SPL_SIGNAL_H +#define _SPL_SIGNAL_H + +#include <linux/sched.h> + +#ifdef HAVE_SCHED_SIGNAL_HEADER +#include <linux/sched/signal.h> +#endif + +#define FORREAL 0 /* Usual side-effects */ +#define JUSTLOOKING 1 /* Don't stop the process */ + +/* + * The "why" argument indicates the allowable side-effects of the call: + * + * FORREAL: Extract the next pending signal from p_sig into p_cursig; + * stop the process if a stop has been requested or if a traced signal + * is pending. + * + * JUSTLOOKING: Don't stop the process, just indicate whether or not + * a signal might be pending (FORREAL is needed to tell for sure). 
+ */ +static __inline__ int +issig(int why) +{ + ASSERT(why == FORREAL || why == JUSTLOOKING); + + return (signal_pending(current)); +} + +#endif /* SPL_SIGNAL_H */ diff --git a/include/spl/sys/stat.h b/include/spl/sys/stat.h new file mode 100644 index 000000000..83018e894 --- /dev/null +++ b/include/spl/sys/stat.h @@ -0,0 +1,30 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf <[email protected]>. + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see <http://zfsonlinux.org/>. + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef _SPL_STAT_H +#define _SPL_STAT_H + +#include <linux/stat.h> + +#endif /* SPL_STAT_H */ diff --git a/include/spl/sys/strings.h b/include/spl/sys/strings.h new file mode 100644 index 000000000..4fb803206 --- /dev/null +++ b/include/spl/sys/strings.h @@ -0,0 +1,31 @@ +/* + * Copyright (C) 2018 Lawrence Livermore National Security, LLC. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see <http://zfsonlinux.org/>. + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see <http://www.gnu.org/licenses/>. + */ +#ifndef _SPL_SYS_STRINGS_H +#define _SPL_SYS_STRINGS_H + +#include <linux/string.h> + +#define bzero(ptr, size) memset(ptr, 0, size) +#define bcopy(src, dest, size) memmove(dest, src, size) +#define bcmp(src, dest, size) memcmp((src), (dest), (size_t)(size)) + +#endif /* _SPL_SYS_STRINGS_H */ diff --git a/include/spl/sys/sunddi.h b/include/spl/sys/sunddi.h new file mode 100644 index 000000000..29a6fe00d --- /dev/null +++ b/include/spl/sys/sunddi.h @@ -0,0 +1,58 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf <[email protected]>. + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see <http://zfsonlinux.org/>. 
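As an illustration of the issig() shim above, the canonical Solaris-style check looks roughly like the following sketch; have_work() and do_one_unit() are hypothetical, and under this port both calls simply reduce to signal_pending(current):

    static int
    foo_worker(void)
    {
            while (have_work()) {
                    /* Solaris idiom: peek first, then confirm. */
                    if (issig(JUSTLOOKING) && issig(FORREAL))
                            return (EINTR);
                    do_one_unit();
            }
            return (0);
    }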
+ * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef _SPL_SUNDDI_H +#define _SPL_SUNDDI_H + +#include <sys/cred.h> +#include <sys/uio.h> +#include <sys/mutex.h> +#include <sys/u8_textprep.h> +#include <sys/vnode.h> + +typedef int ddi_devid_t; + +#define DDI_DEV_T_NONE ((dev_t)-1) +#define DDI_DEV_T_ANY ((dev_t)-2) +#define DI_MAJOR_T_UNKNOWN ((major_t)0) + +#define DDI_PROP_DONTPASS 0x0001 +#define DDI_PROP_CANSLEEP 0x0002 + +#define DDI_SUCCESS 0 +#define DDI_FAILURE -1 + +#define ddi_prop_lookup_string(x1, x2, x3, x4, x5) (*x5 = NULL) +#define ddi_prop_free(x) (void)0 +#define ddi_root_node() (void)0 + +extern int ddi_strtoul(const char *, char **, int, unsigned long *); +extern int ddi_strtol(const char *, char **, int, long *); +extern int ddi_strtoull(const char *, char **, int, unsigned long long *); +extern int ddi_strtoll(const char *, char **, int, long long *); + +extern int ddi_copyin(const void *from, void *to, size_t len, int flags); +extern int ddi_copyout(const void *from, void *to, size_t len, int flags); + +#endif /* SPL_SUNDDI_H */ diff --git a/include/spl/sys/sysmacros.h b/include/spl/sys/sysmacros.h new file mode 100644 index 000000000..839e7fd8c --- /dev/null +++ b/include/spl/sys/sysmacros.h @@ -0,0 +1,228 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf <[email protected]>. + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see <http://zfsonlinux.org/>. + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see <http://www.gnu.org/licenses/>. 
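To show how the ddi_strto*() helpers declared above are meant to be called, here is a hedged sketch; it assumes the usual Solaris DDI convention that they return 0 on success and a positive errno (EINVAL, ERANGE) on failure, and parse_threshold() is a hypothetical caller:

    static int
    parse_threshold(const char *str, unsigned long long *valp)
    {
            char *end;
            int error;

            error = ddi_strtoull(str, &end, 10, valp);
            if (error != 0)
                    return (error);         /* EINVAL or ERANGE */
            if (*end != '\0')
                    return (EINVAL);        /* trailing garbage */

            return (0);
    }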
+ */ + +#ifndef _SPL_SYSMACROS_H +#define _SPL_SYSMACROS_H + +#include <linux/module.h> +#include <linux/sched.h> +#include <linux/cpumask.h> +#include <sys/debug.h> +#include <sys/zone.h> +#include <sys/signal.h> +#include <asm/page.h> + +#ifdef HAVE_SCHED_RT_HEADER +#include <linux/sched/rt.h> +#endif + +#ifndef _KERNEL +#define _KERNEL __KERNEL__ +#endif + +#define FALSE 0 +#define TRUE 1 + +#define INT8_MAX (127) +#define INT8_MIN (-128) +#define UINT8_MAX (255) +#define UINT8_MIN (0) + +#define INT16_MAX (32767) +#define INT16_MIN (-32768) +#define UINT16_MAX (65535) +#define UINT16_MIN (0) + +#define INT32_MAX INT_MAX +#define INT32_MIN INT_MIN +#define UINT32_MAX UINT_MAX +#define UINT32_MIN UINT_MIN + +#define INT64_MAX LLONG_MAX +#define INT64_MIN LLONG_MIN +#define UINT64_MAX ULLONG_MAX +#define UINT64_MIN ULLONG_MIN + +#define NBBY 8 + +#define MAXMSGLEN 256 +#define MAXNAMELEN 256 +#define MAXPATHLEN 4096 +#define MAXOFFSET_T LLONG_MAX +#define MAXBSIZE 8192 +#define DEV_BSIZE 512 +#define DEV_BSHIFT 9 /* log2(DEV_BSIZE) */ + +#define proc_pageout NULL +#define curproc current +#define max_ncpus num_possible_cpus() +#define boot_ncpus num_online_cpus() +#define CPU_SEQID smp_processor_id() +#define _NOTE(x) +#define is_system_labeled() 0 + +#ifndef RLIM64_INFINITY +#define RLIM64_INFINITY (~0ULL) +#endif + +/* + * 0..MAX_PRIO-1: Process priority + * 0..MAX_RT_PRIO-1: RT priority tasks + * MAX_RT_PRIO..MAX_PRIO-1: SCHED_NORMAL tasks + * + * Treat shim tasks as SCHED_NORMAL tasks + */ +#define minclsyspri (MAX_PRIO-1) +#define maxclsyspri (MAX_RT_PRIO) +#define defclsyspri (DEFAULT_PRIO) + +#ifndef NICE_TO_PRIO +#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20) +#endif +#ifndef PRIO_TO_NICE +#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) +#endif + +/* + * Missing macros + */ +#ifndef PAGESIZE +#define PAGESIZE PAGE_SIZE +#endif + +#ifndef PAGESHIFT +#define PAGESHIFT PAGE_SHIFT +#endif + +/* Dtrace probes do not exist in the linux kernel */ +#ifdef DTRACE_PROBE +#undef DTRACE_PROBE +#endif /* DTRACE_PROBE */ +#define DTRACE_PROBE(a) ((void)0) + +#ifdef DTRACE_PROBE1 +#undef DTRACE_PROBE1 +#endif /* DTRACE_PROBE1 */ +#define DTRACE_PROBE1(a, b, c) ((void)0) + +#ifdef DTRACE_PROBE2 +#undef DTRACE_PROBE2 +#endif /* DTRACE_PROBE2 */ +#define DTRACE_PROBE2(a, b, c, d, e) ((void)0) + +#ifdef DTRACE_PROBE3 +#undef DTRACE_PROBE3 +#endif /* DTRACE_PROBE3 */ +#define DTRACE_PROBE3(a, b, c, d, e, f, g) ((void)0) + +#ifdef DTRACE_PROBE4 +#undef DTRACE_PROBE4 +#endif /* DTRACE_PROBE4 */ +#define DTRACE_PROBE4(a, b, c, d, e, f, g, h, i) ((void)0) + +/* Missing globals */ +extern char spl_version[32]; +extern unsigned long spl_hostid; + +/* Missing misc functions */ +extern uint32_t zone_get_hostid(void *zone); +extern void spl_setup(void); +extern void spl_cleanup(void); + +#define highbit(x) __fls(x) +#define lowbit(x) __ffs(x) + +#define highbit64(x) fls64(x) +#define makedevice(maj, min) makedev(maj, min) + +/* common macros */ +#ifndef MIN +#define MIN(a, b) ((a) < (b) ? (a) : (b)) +#endif +#ifndef MAX +#define MAX(a, b) ((a) < (b) ? (b) : (a)) +#endif +#ifndef ABS +#define ABS(a) ((a) < 0 ? 
-(a) : (a)) +#endif +#ifndef DIV_ROUND_UP +#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d)) +#endif +#ifndef roundup +#define roundup(x, y) ((((x) + ((y) - 1)) / (y)) * (y)) +#endif +#ifndef howmany +#define howmany(x, y) (((x) + ((y) - 1)) / (y)) +#endif + +/* + * Compatibility macros/typedefs needed for Solaris -> Linux port + */ +#define P2ALIGN(x, align) ((x) & -(align)) +#define P2CROSS(x, y, align) (((x) ^ (y)) > (align) - 1) +#define P2ROUNDUP(x, align) ((((x) - 1) | ((align) - 1)) + 1) +#define P2PHASE(x, align) ((x) & ((align) - 1)) +#define P2NPHASE(x, align) (-(x) & ((align) - 1)) +#define ISP2(x) (((x) & ((x) - 1)) == 0) +#define IS_P2ALIGNED(v, a) ((((uintptr_t)(v)) & ((uintptr_t)(a) - 1)) == 0) +#define P2BOUNDARY(off, len, align) \ + (((off) ^ ((off) + (len) - 1)) > (align) - 1) + +/* + * Typed version of the P2* macros. These macros should be used to ensure + * that the result is correctly calculated based on the data type of (x), + * which is passed in as the last argument, regardless of the data + * type of the alignment. For example, if (x) is of type uint64_t, + * and we want to round it up to a page boundary using "PAGESIZE" as + * the alignment, we can do either + * + * P2ROUNDUP(x, (uint64_t)PAGESIZE) + * or + * P2ROUNDUP_TYPED(x, PAGESIZE, uint64_t) + */ +#define P2ALIGN_TYPED(x, align, type) \ + ((type)(x) & -(type)(align)) +#define P2PHASE_TYPED(x, align, type) \ + ((type)(x) & ((type)(align) - 1)) +#define P2NPHASE_TYPED(x, align, type) \ + (-(type)(x) & ((type)(align) - 1)) +#define P2ROUNDUP_TYPED(x, align, type) \ + ((((type)(x) - 1) | ((type)(align) - 1)) + 1) +#define P2END_TYPED(x, align, type) \ + (-(~(type)(x) & -(type)(align))) +#define P2PHASEUP_TYPED(x, align, phase, type) \ + ((type)(phase) - (((type)(phase) - (type)(x)) & -(type)(align))) +#define P2CROSS_TYPED(x, y, align, type) \ + (((type)(x) ^ (type)(y)) > (type)(align) - 1) +#define P2SAMEHIGHBIT_TYPED(x, y, type) \ + (((type)(x) ^ (type)(y)) < ((type)(x) & (type)(y))) + +#if defined(_KERNEL) && !defined(_KMEMUSER) && !defined(offsetof) + +/* avoid any possibility of clashing with <stddef.h> version */ + +#define offsetof(s, m) ((size_t)(&(((s *)0)->m))) +#endif + +#endif /* _SPL_SYSMACROS_H */ diff --git a/include/spl/sys/systeminfo.h b/include/spl/sys/systeminfo.h new file mode 100644 index 000000000..225569158 --- /dev/null +++ b/include/spl/sys/systeminfo.h @@ -0,0 +1,36 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf <[email protected]>. + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see <http://zfsonlinux.org/>. + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see <http://www.gnu.org/licenses/>. 
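A few worked values may help when reading the power-of-two helpers above (alignments must themselves be powers of two); this is purely an illustrative sketch:

    size_t down  = P2ALIGN(4097, 4096);        /* 4096: round down           */
    size_t up    = P2ROUNDUP(4097, 4096);      /* 8192: round up             */
    size_t phase = P2PHASE(4097, 4096);        /* 1: offset into the block   */
    size_t rest  = P2NPHASE(4097, 4096);       /* 4095: bytes to next block  */
    int    pow2  = ISP2(4096);                 /* nonzero: power of two      */
    int    ok    = IS_P2ALIGNED(8192, 4096);   /* nonzero: already aligned   */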
+ */ + +#ifndef _SPL_SYSTEMINFO_H +#define _SPL_SYSTEMINFO_H + +#define HW_HOSTID_LEN 11 /* minimum buffer size needed */ + /* to hold a decimal or hex */ + /* hostid string */ + +/* Supplemental definitions for Linux. */ +#define HW_HOSTID_PATH "/etc/hostid" /* binary configuration file */ +#define HW_HOSTID_MASK 0xFFFFFFFF /* significant hostid bits */ + +#endif /* SPL_SYSTEMINFO_H */ diff --git a/include/spl/sys/taskq.h b/include/spl/sys/taskq.h new file mode 100644 index 000000000..7353367a2 --- /dev/null +++ b/include/spl/sys/taskq.h @@ -0,0 +1,163 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf <[email protected]>. + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see <http://zfsonlinux.org/>. + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef _SPL_TASKQ_H +#define _SPL_TASKQ_H + +#include <linux/module.h> +#include <linux/gfp.h> +#include <linux/slab.h> +#include <linux/interrupt.h> +#include <linux/kthread.h> +#include <sys/types.h> +#include <sys/thread.h> +#include <sys/rwlock.h> +#include <sys/wait.h> + +#define TASKQ_NAMELEN 31 + +#define TASKQ_PREPOPULATE 0x00000001 +#define TASKQ_CPR_SAFE 0x00000002 +#define TASKQ_DYNAMIC 0x00000004 +#define TASKQ_THREADS_CPU_PCT 0x00000008 +#define TASKQ_DC_BATCH 0x00000010 +#define TASKQ_ACTIVE 0x80000000 + +/* + * Flags for taskq_dispatch. TQ_SLEEP/TQ_NOSLEEP should be same as + * KM_SLEEP/KM_NOSLEEP. TQ_NOQUEUE/TQ_NOALLOC are set particularly + * large so as not to conflict with already used GFP_* defines. + */ +#define TQ_SLEEP 0x00000000 +#define TQ_NOSLEEP 0x00000001 +#define TQ_PUSHPAGE 0x00000002 +#define TQ_NOQUEUE 0x01000000 +#define TQ_NOALLOC 0x02000000 +#define TQ_NEW 0x04000000 +#define TQ_FRONT 0x08000000 + +/* + * Reserved taskqid values. 
+ */ +#define TASKQID_INVALID ((taskqid_t)0) +#define TASKQID_INITIAL ((taskqid_t)1) + +/* + * spin_lock(lock) and spin_lock_nested(lock,0) are equivalent, + * so TQ_LOCK_DYNAMIC must not evaluate to 0 + */ +typedef enum tq_lock_role { + TQ_LOCK_GENERAL = 0, + TQ_LOCK_DYNAMIC = 1, +} tq_lock_role_t; + +typedef unsigned long taskqid_t; +typedef void (task_func_t)(void *); + +typedef struct taskq { + spinlock_t tq_lock; /* protects taskq_t */ + char *tq_name; /* taskq name */ + int tq_instance; /* instance of tq_name */ + struct list_head tq_thread_list; /* list of all threads */ + struct list_head tq_active_list; /* list of active threads */ + int tq_nactive; /* # of active threads */ + int tq_nthreads; /* # of existing threads */ + int tq_nspawn; /* # of threads being spawned */ + int tq_maxthreads; /* # of threads maximum */ + int tq_pri; /* priority */ + int tq_minalloc; /* min taskq_ent_t pool size */ + int tq_maxalloc; /* max taskq_ent_t pool size */ + int tq_nalloc; /* cur taskq_ent_t pool size */ + uint_t tq_flags; /* flags */ + taskqid_t tq_next_id; /* next pend/work id */ + taskqid_t tq_lowest_id; /* lowest pend/work id */ + struct list_head tq_free_list; /* free taskq_ent_t's */ + struct list_head tq_pend_list; /* pending taskq_ent_t's */ + struct list_head tq_prio_list; /* priority taskq_ent_t's */ + struct list_head tq_delay_list; /* delayed taskq_ent_t's */ + struct list_head tq_taskqs; /* all taskq_t's */ + spl_wait_queue_head_t tq_work_waitq; /* new work waitq */ + spl_wait_queue_head_t tq_wait_waitq; /* wait waitq */ + tq_lock_role_t tq_lock_class; /* class when taking tq_lock */ +} taskq_t; + +typedef struct taskq_ent { + spinlock_t tqent_lock; + spl_wait_queue_head_t tqent_waitq; + struct timer_list tqent_timer; + struct list_head tqent_list; + taskqid_t tqent_id; + task_func_t *tqent_func; + void *tqent_arg; + taskq_t *tqent_taskq; + uintptr_t tqent_flags; + unsigned long tqent_birth; +} taskq_ent_t; + +#define TQENT_FLAG_PREALLOC 0x1 +#define TQENT_FLAG_CANCEL 0x2 + +typedef struct taskq_thread { + struct list_head tqt_thread_list; + struct list_head tqt_active_list; + struct task_struct *tqt_thread; + taskq_t *tqt_tq; + taskqid_t tqt_id; + taskq_ent_t *tqt_task; + uintptr_t tqt_flags; +} taskq_thread_t; + +/* Global system-wide dynamic task queue available for all consumers */ +extern taskq_t *system_taskq; +/* Global dynamic task queue for long delay */ +extern taskq_t *system_delay_taskq; + +/* List of all taskqs */ +extern struct list_head tq_list; +extern struct rw_semaphore tq_list_sem; + +extern taskqid_t taskq_dispatch(taskq_t *, task_func_t, void *, uint_t); +extern taskqid_t taskq_dispatch_delay(taskq_t *, task_func_t, void *, + uint_t, clock_t); +extern void taskq_dispatch_ent(taskq_t *, task_func_t, void *, uint_t, + taskq_ent_t *); +extern int taskq_empty_ent(taskq_ent_t *); +extern void taskq_init_ent(taskq_ent_t *); +extern taskq_t *taskq_create(const char *, int, pri_t, int, int, uint_t); +extern void taskq_destroy(taskq_t *); +extern void taskq_wait_id(taskq_t *, taskqid_t); +extern void taskq_wait_outstanding(taskq_t *, taskqid_t); +extern void taskq_wait(taskq_t *); +extern int taskq_cancel_id(taskq_t *, taskqid_t); +extern int taskq_member(taskq_t *, kthread_t *); + +#define taskq_create_proc(name, nthreads, pri, min, max, proc, flags) \ + taskq_create(name, nthreads, pri, min, max, flags) +#define taskq_create_sysdc(name, nthreads, min, max, proc, dc, flags) \ + taskq_create(name, nthreads, maxclsyspri, min, max, flags) + +int spl_taskq_init(void); 
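As a usage sketch for the taskq interface declared above (my_task_func and taskq_example are hypothetical; defclsyspri comes from sys/sysmacros.h):

    static void
    my_task_func(void *arg)
    {
            /* runs in the context of a taskq worker thread */
    }

    static void
    taskq_example(void)
    {
            taskq_t *tq;
            taskqid_t id;

            /* 4 threads, default priority, 4..32 preallocated entries */
            tq = taskq_create("example_tq", 4, defclsyspri, 4, 32,
                TASKQ_PREPOPULATE);

            id = taskq_dispatch(tq, my_task_func, NULL, TQ_SLEEP);
            if (id == TASKQID_INVALID)
                    my_task_func(NULL);     /* fall back to direct execution */

            taskq_wait(tq);                 /* drain everything dispatched */
            taskq_destroy(tq);
    }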
+void spl_taskq_fini(void); + +#endif /* _SPL_TASKQ_H */ diff --git a/include/spl/sys/thread.h b/include/spl/sys/thread.h new file mode 100644 index 000000000..3762717da --- /dev/null +++ b/include/spl/sys/thread.h @@ -0,0 +1,69 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf <[email protected]>. + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see <http://zfsonlinux.org/>. + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef _SPL_THREAD_H +#define _SPL_THREAD_H + +#include <linux/module.h> +#include <linux/mm.h> +#include <linux/spinlock.h> +#include <linux/kthread.h> +#include <sys/types.h> +#include <sys/sysmacros.h> +#include <sys/tsd.h> + +/* + * Thread interfaces + */ +#define TP_MAGIC 0x53535353 + +#define TS_SLEEP TASK_INTERRUPTIBLE +#define TS_RUN TASK_RUNNING +#define TS_ZOMB EXIT_ZOMBIE +#define TS_STOPPED TASK_STOPPED + +typedef void (*thread_func_t)(void *); + +/* BEGIN CSTYLED */ +#define thread_create(stk, stksize, func, arg, len, pp, state, pri) \ + __thread_create(stk, stksize, (thread_func_t)func, \ + #func, arg, len, pp, state, pri) +/* END CSTYLED */ + +#define thread_exit() __thread_exit() +#define thread_join(t) VERIFY(0) +#define curthread current +#define getcomm() current->comm +#define getpid() current->pid + +extern kthread_t *__thread_create(caddr_t stk, size_t stksize, + thread_func_t func, const char *name, void *args, size_t len, proc_t *pp, + int state, pri_t pri); +extern void __thread_exit(void); +extern struct task_struct *spl_kthread_create(int (*func)(void *), + void *data, const char namefmt[], ...); + +extern proc_t p0; + +#endif /* _SPL_THREAD_H */ diff --git a/include/spl/sys/time.h b/include/spl/sys/time.h new file mode 100644 index 000000000..d6aaca913 --- /dev/null +++ b/include/spl/sys/time.h @@ -0,0 +1,82 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf <[email protected]>. + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see <http://zfsonlinux.org/>. + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. 
+ * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef _SPL_TIME_H +#define _SPL_TIME_H + +#include <linux/module.h> +#include <linux/time.h> +#include <sys/types.h> +#include <sys/timer.h> + +#if defined(CONFIG_64BIT) +#define TIME_MAX INT64_MAX +#define TIME_MIN INT64_MIN +#else +#define TIME_MAX INT32_MAX +#define TIME_MIN INT32_MIN +#endif + +#define SEC 1 +#define MILLISEC 1000 +#define MICROSEC 1000000 +#define NANOSEC 1000000000 + +#define MSEC2NSEC(m) ((hrtime_t)(m) * (NANOSEC / MILLISEC)) +#define NSEC2MSEC(n) ((n) / (NANOSEC / MILLISEC)) + +#define USEC2NSEC(m) ((hrtime_t)(m) * (NANOSEC / MICROSEC)) +#define NSEC2USEC(n) ((n) / (NANOSEC / MICROSEC)) + +#define NSEC2SEC(n) ((n) / (NANOSEC / SEC)) +#define SEC2NSEC(m) ((hrtime_t)(m) * (NANOSEC / SEC)) + +static const int hz = HZ; + +#define TIMESPEC_OVERFLOW(ts) \ + ((ts)->tv_sec < TIME_MIN || (ts)->tv_sec > TIME_MAX) + +static inline void +gethrestime(timestruc_t *now) +{ + *now = current_kernel_time(); +} + +static inline time_t +gethrestime_sec(void) +{ + struct timespec ts; + ts = current_kernel_time(); + return (ts.tv_sec); +} + +static inline hrtime_t +gethrtime(void) +{ + struct timespec now; + getrawmonotonic(&now); + return (((hrtime_t)now.tv_sec * NSEC_PER_SEC) + now.tv_nsec); +} + +#endif /* _SPL_TIME_H */ diff --git a/include/spl/sys/timer.h b/include/spl/sys/timer.h new file mode 100644 index 000000000..a6b134570 --- /dev/null +++ b/include/spl/sys/timer.h @@ -0,0 +1,75 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf <[email protected]>. + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see <http://zfsonlinux.org/>. + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#ifndef _SPL_TIMER_H +#define _SPL_TIMER_H + +#include <linux/module.h> +#include <linux/delay.h> +#include <linux/sched.h> +#include <linux/time.h> +#include <linux/timer.h> + +#define lbolt ((clock_t)jiffies) +#define lbolt64 ((int64_t)get_jiffies_64()) + +#define ddi_get_lbolt() ((clock_t)jiffies) +#define ddi_get_lbolt64() ((int64_t)get_jiffies_64()) + +#define ddi_time_before(a, b) (typecheck(clock_t, a) && \ + typecheck(clock_t, b) && \ + ((a) - (b) < 0)) +#define ddi_time_after(a, b) ddi_time_before(b, a) +#define ddi_time_before_eq(a, b) (!ddi_time_after(a, b)) +#define ddi_time_after_eq(a, b) ddi_time_before_eq(b, a) + +#define ddi_time_before64(a, b) (typecheck(int64_t, a) && \ + typecheck(int64_t, b) && \ + ((a) - (b) < 0)) +#define ddi_time_after64(a, b) ddi_time_before64(b, a) +#define ddi_time_before_eq64(a, b) (!ddi_time_after64(a, b)) +#define ddi_time_after_eq64(a, b) ddi_time_before_eq64(b, a) + +#define delay(ticks) schedule_timeout_uninterruptible(ticks) + +/* usleep_range() introduced in 2.6.36 */ +#ifndef HAVE_USLEEP_RANGE +static inline void +usleep_range(unsigned long min, unsigned long max) +{ + unsigned int min_ms = min / USEC_PER_MSEC; + + if (min >= MAX_UDELAY_MS) + msleep(min_ms); + else + udelay(min); +} +#endif /* HAVE_USLEEP_RANGE */ + +#define SEC_TO_TICK(sec) ((sec) * HZ) +#define MSEC_TO_TICK(ms) msecs_to_jiffies(ms) +#define USEC_TO_TICK(us) usecs_to_jiffies(us) +#define NSEC_TO_TICK(ns) usecs_to_jiffies(ns / NSEC_PER_USEC) + +#endif /* _SPL_TIMER_H */ diff --git a/include/spl/sys/tsd.h b/include/spl/sys/tsd.h new file mode 100644 index 000000000..39a291bf3 --- /dev/null +++ b/include/spl/sys/tsd.h @@ -0,0 +1,46 @@ +/* + * Copyright (C) 2010 Lawrence Livermore National Security, LLC. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf <[email protected]>. + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see <http://zfsonlinux.org/>. + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef _SPL_TSD_H +#define _SPL_TSD_H + +#include <sys/types.h> + +#define TSD_HASH_TABLE_BITS_DEFAULT 9 +#define TSD_KEYS_MAX 32768 +#define DTOR_PID (PID_MAX_LIMIT+1) +#define PID_KEY (TSD_KEYS_MAX+1) + +typedef void (*dtor_func_t)(void *); + +extern int tsd_set(uint_t, void *); +extern void *tsd_get(uint_t); +extern void *tsd_get_by_thread(uint_t, kthread_t *); +extern void tsd_create(uint_t *, dtor_func_t); +extern void tsd_destroy(uint_t *); +extern void tsd_exit(void); + +int spl_tsd_init(void); +void spl_tsd_fini(void); + +#endif /* _SPL_TSD_H */ diff --git a/include/spl/sys/types.h b/include/spl/sys/types.h new file mode 100644 index 000000000..a5b478127 --- /dev/null +++ b/include/spl/sys/types.h @@ -0,0 +1,70 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. 
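The lbolt/tick helpers above compose into a bounded wait along these lines; condition_met() is hypothetical and the 500ms/10ms values are arbitrary:

    static int
    wait_for_condition(void)
    {
            clock_t deadline = ddi_get_lbolt() + MSEC_TO_TICK(500);

            while (!condition_met()) {
                    if (ddi_time_after(ddi_get_lbolt(), deadline))
                            return (ETIMEDOUT);
                    delay(MSEC_TO_TICK(10));    /* uninterruptible sleep */
            }

            return (0);
    }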
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf <[email protected]>. + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see <http://zfsonlinux.org/>. + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef _SPL_TYPES_H +#define _SPL_TYPES_H + +#include <linux/types.h> + +#ifndef ULLONG_MAX +#define ULLONG_MAX (~0ULL) +#endif + +#ifndef LLONG_MAX +#define LLONG_MAX ((long long)(~0ULL>>1)) +#endif + +typedef enum { + B_FALSE = 0, + B_TRUE = 1 +} boolean_t; + +typedef unsigned char uchar_t; +typedef unsigned short ushort_t; +typedef unsigned int uint_t; +typedef unsigned long ulong_t; +typedef unsigned long long u_longlong_t; +typedef long long longlong_t; + +typedef unsigned long intptr_t; +typedef unsigned long long rlim64_t; + +typedef struct task_struct kthread_t; +typedef struct task_struct proc_t; + +typedef struct timespec timestruc_t; +typedef struct timespec timespec_t; +typedef longlong_t hrtime_t; + +typedef int id_t; +typedef short pri_t; +typedef short index_t; +typedef longlong_t offset_t; +typedef u_longlong_t u_offset_t; +typedef ulong_t pgcnt_t; + +typedef int major_t; +typedef int minor_t; + +#endif /* _SPL_TYPES_H */ diff --git a/include/spl/sys/types32.h b/include/spl/sys/types32.h new file mode 100644 index 000000000..c60ba8c97 --- /dev/null +++ b/include/spl/sys/types32.h @@ -0,0 +1,35 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf <[email protected]>. + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see <http://zfsonlinux.org/>. + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef _SPL_TYPES32_H +#define _SPL_TYPES32_H + +#include <sys/types.h> + +typedef uint32_t caddr32_t; +typedef int32_t daddr32_t; +typedef int32_t time32_t; +typedef uint32_t size32_t; + +#endif /* _SPL_TYPES32_H */ diff --git a/include/spl/sys/uio.h b/include/spl/sys/uio.h new file mode 100644 index 000000000..64c452b8d --- /dev/null +++ b/include/spl/sys/uio.h @@ -0,0 +1,106 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. 
+ * Copyright (C) 2007 The Regents of the University of California. + * Copyright (c) 2015 by Chunwei Chen. All rights reserved. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf <[email protected]>. + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see <http://zfsonlinux.org/>. + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef _SPL_UIO_H +#define _SPL_UIO_H + +#include <linux/uio.h> +#include <linux/blkdev.h> +#include <asm/uaccess.h> +#include <sys/types.h> + +typedef struct iovec iovec_t; + +typedef enum uio_rw { + UIO_READ = 0, + UIO_WRITE = 1, +} uio_rw_t; + +typedef enum uio_seg { + UIO_USERSPACE = 0, + UIO_SYSSPACE = 1, + UIO_USERISPACE = 2, + UIO_BVEC = 3, +} uio_seg_t; + +typedef struct uio { + union { + const struct iovec *uio_iov; + const struct bio_vec *uio_bvec; + }; + int uio_iovcnt; + offset_t uio_loffset; + uio_seg_t uio_segflg; + uint16_t uio_fmode; + uint16_t uio_extflg; + offset_t uio_limit; + ssize_t uio_resid; + size_t uio_skip; +} uio_t; + +typedef struct aio_req { + uio_t *aio_uio; + void *aio_private; +} aio_req_t; + +typedef enum xuio_type { + UIOTYPE_ASYNCIO, + UIOTYPE_ZEROCOPY, +} xuio_type_t; + + +#define UIOA_IOV_MAX 16 + +typedef struct uioa_page_s { + int uioa_pfncnt; + void **uioa_ppp; + caddr_t uioa_base; + size_t uioa_len; +} uioa_page_t; + +typedef struct xuio { + uio_t xu_uio; + enum xuio_type xu_type; + union { + struct { + uint32_t xu_a_state; + ssize_t xu_a_mbytes; + uioa_page_t *xu_a_lcur; + void **xu_a_lppp; + void *xu_a_hwst[4]; + uioa_page_t xu_a_locked[UIOA_IOV_MAX]; + } xu_aio; + + struct { + int xu_zc_rw; + void *xu_zc_priv; + } xu_zc; + } xu_ext; +} xuio_t; + +#define XUIO_XUZC_PRIV(xuio) xuio->xu_ext.xu_zc.xu_zc_priv +#define XUIO_XUZC_RW(xuio) xuio->xu_ext.xu_zc.xu_zc_rw + +#endif /* SPL_UIO_H */ diff --git a/include/spl/sys/user.h b/include/spl/sys/user.h new file mode 100644 index 000000000..b12cb240e --- /dev/null +++ b/include/spl/sys/user.h @@ -0,0 +1,42 @@ +/* + * Copyright (C) 2015 Cluster Inc. + * Produced at ClusterHQ Inc (cf, DISCLAIMER). + * Written by Richard Yao <[email protected]>. + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see <http://zfsonlinux.org/>. + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see <http://www.gnu.org/licenses/>. 
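For orientation, a kernel-space uio_t describing a single buffer could be filled in as in this sketch (setup_uio() is a hypothetical helper; UIO_SYSSPACE marks the address as a kernel pointer):

    static void
    setup_uio(uio_t *uio, struct iovec *iov, void *buf, size_t len,
        offset_t off)
    {
            iov->iov_base = buf;
            iov->iov_len = len;

            uio->uio_iov = iov;
            uio->uio_iovcnt = 1;
            uio->uio_loffset = off;
            uio->uio_segflg = UIO_SYSSPACE;
            uio->uio_resid = len;
            uio->uio_skip = 0;
    }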
+ */ + +#ifndef _SPL_USER_H +#define _SPL_USER_H + +/* + * We have uf_info_t for areleasef(). We implement areleasef() using a global + * linked list of all open file descriptors with the task structs referenced, + * so accessing the correct descriptor from areleasef() only requires knowing + * about the Linux task_struct. Since this is internal to our compatibility + * layer, we make it an opaque type. + * + * XXX: If the descriptor changes under us and we do not do a getf() between + * the change and using it, we would get an incorrect reference. + */ + +struct uf_info; +typedef struct uf_info uf_info_t; + +#define P_FINFO(x) ((uf_info_t *)x) + +#endif /* SPL_USER_H */ diff --git a/include/spl/sys/vfs.h b/include/spl/sys/vfs.h new file mode 100644 index 000000000..0d5e1d51d --- /dev/null +++ b/include/spl/sys/vfs.h @@ -0,0 +1,51 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf <[email protected]>. + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see <http://zfsonlinux.org/>. + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef _SPL_ZFS_H +#define _SPL_ZFS_H + +#include <linux/mount.h> +#include <linux/fs.h> +#include <linux/dcache.h> +#include <linux/statfs.h> +#include <linux/xattr.h> +#include <linux/security.h> +#include <linux/seq_file.h> + +#define MAXFIDSZ 64 + +typedef struct spl_fid { + union { + long fid_pad; + struct { + ushort_t len; /* length of data in bytes */ + char data[MAXFIDSZ]; /* data (variable len) */ + } _fid; + } un; +} fid_t; + +#define fid_len un._fid.len +#define fid_data un._fid.data + +#endif /* SPL_ZFS_H */ diff --git a/include/spl/sys/vmem.h b/include/spl/sys/vmem.h new file mode 100644 index 000000000..a9b12eeb9 --- /dev/null +++ b/include/spl/sys/vmem.h @@ -0,0 +1,109 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf <[email protected]>. + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see <http://zfsonlinux.org/>. + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. 
+ * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef _SPL_VMEM_H +#define _SPL_VMEM_H + +#include <sys/kmem.h> +#include <linux/sched.h> +#include <linux/vmalloc.h> + +typedef struct vmem { } vmem_t; + +extern vmem_t *heap_arena; +extern vmem_t *zio_alloc_arena; +extern vmem_t *zio_arena; + +extern size_t vmem_size(vmem_t *vmp, int typemask); + +/* + * Memory allocation interfaces + */ +#define VMEM_ALLOC 0x01 +#define VMEM_FREE 0x02 + +#ifndef VMALLOC_TOTAL +#define VMALLOC_TOTAL (VMALLOC_END - VMALLOC_START) +#endif + +/* + * vmem_* is an interface to a low level arena-based memory allocator on + * Illumos that is used to allocate virtual address space. The kmem SLAB + * allocator allocates slabs from it. Then the generic allocation functions + * kmem_{alloc,zalloc,free}() are layered on top of SLAB allocators. + * + * On Linux, the primary means of doing allocations is via kmalloc(), which + * is similarly layered on top of something called the buddy allocator. The + * buddy allocator is not available to kernel modules, it uses physical + * memory addresses rather than virtual memory addresses and is prone to + * fragmentation. + * + * Linux sets aside a relatively small address space for in-kernel virtual + * memory from which allocations can be done using vmalloc(). It might seem + * like a good idea to use vmalloc() to implement something similar to + * Illumos' allocator. However, this has the following problems: + * + * 1. Page directory table allocations are hard coded to use GFP_KERNEL. + * Consequently, any KM_PUSHPAGE or KM_NOSLEEP allocations done using + * vmalloc() will not have proper semantics. + * + * 2. Address space exhaustion is a real issue on 32-bit platforms where + * only a few 100MB are available. The kernel will handle it by spinning + * when it runs out of address space. + * + * 3. All vmalloc() allocations and frees are protected by a single global + * lock which serializes all allocations. + * + * 4. Accessing /proc/meminfo and /proc/vmallocinfo will iterate the entire + * list. The former will sum the allocations while the latter will print + * them to user space in a way that user space can keep the lock held + * indefinitely. When the total number of mapped allocations is large + * (several 100,000) a large amount of time will be spent waiting on locks. + * + * 5. Linux has a wait_on_bit() locking primitive that assumes physical + * memory is used, it simply does not work on virtual memory. Certain + * Linux structures (e.g. the superblock) use them and might be embedded + * into a structure from Illumos. This makes using Linux virtual memory + * unsafe in certain situations. + * + * It follows that we cannot obtain identical semantics to those on Illumos. + * Consequently, we implement the kmem_{alloc,zalloc,free}() functions in + * such a way that they can be used as drop-in replacements for small vmem_* + * allocations (8MB in size or smaller) and map vmem_{alloc,zalloc,free}() + * to them. 
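+ *
+ * As a purely illustrative sketch (a hypothetical caller, not part of this
+ * header): for allocations under that threshold the wrappers below are
+ * intended to be usable exactly like their Illumos counterparts, e.g.
+ *
+ *	void *buf = vmem_zalloc(size, KM_SLEEP);
+ *	...
+ *	vmem_free(buf, size);
+ *
+ * Note that, as on Illumos, the size passed to vmem_free() must match the
+ * size originally passed to vmem_alloc()/vmem_zalloc().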
+ */ + +#define vmem_alloc(sz, fl) spl_vmem_alloc((sz), (fl), __func__, __LINE__) +#define vmem_zalloc(sz, fl) spl_vmem_zalloc((sz), (fl), __func__, __LINE__) +#define vmem_free(ptr, sz) spl_vmem_free((ptr), (sz)) +#define vmem_qcache_reap(ptr) ((void)0) + +extern void *spl_vmem_alloc(size_t sz, int fl, const char *func, int line); +extern void *spl_vmem_zalloc(size_t sz, int fl, const char *func, int line); +extern void spl_vmem_free(const void *ptr, size_t sz); + +int spl_vmem_init(void); +void spl_vmem_fini(void); + +#endif /* _SPL_VMEM_H */ diff --git a/include/spl/sys/vmsystm.h b/include/spl/sys/vmsystm.h new file mode 100644 index 000000000..2b48fe0e3 --- /dev/null +++ b/include/spl/sys/vmsystm.h @@ -0,0 +1,84 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf <[email protected]>. + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see <http://zfsonlinux.org/>. + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef _SPL_VMSYSTM_H +#define _SPL_VMSYSTM_H + +#include <linux/mmzone.h> +#include <linux/mm.h> +#include <linux/swap.h> +#include <linux/highmem.h> +#include <linux/vmalloc.h> +#include <sys/types.h> +#include <asm/uaccess.h> + +#define membar_producer() smp_wmb() +#define physmem totalram_pages +#define freemem (nr_free_pages() + \ + global_page_state(NR_INACTIVE_FILE) + \ + global_page_state(NR_INACTIVE_ANON) + \ + global_page_state(NR_SLAB_RECLAIMABLE)) + +#define xcopyin(from, to, size) copy_from_user(to, from, size) +#define xcopyout(from, to, size) copy_to_user(to, from, size) + +static __inline__ int +copyin(const void *from, void *to, size_t len) +{ + /* On error copyin routine returns -1 */ + if (xcopyin(from, to, len)) + return (-1); + + return (0); +} + +static __inline__ int +copyout(const void *from, void *to, size_t len) +{ + /* On error copyout routine returns -1 */ + if (xcopyout(from, to, len)) + return (-1); + + return (0); +} + +static __inline__ int +copyinstr(const void *from, void *to, size_t len, size_t *done) +{ + size_t rc; + + if (len == 0) + return (-ENAMETOOLONG); + + /* XXX: Should return ENAMETOOLONG if 'strlen(from) > len' */ + + memset(to, 0, len); + rc = copyin(from, to, len - 1); + if (done != NULL) + *done = rc; + + return (0); +} + +#endif /* SPL_VMSYSTM_H */ diff --git a/include/spl/sys/vnode.h b/include/spl/sys/vnode.h new file mode 100644 index 000000000..a3f7828e7 --- /dev/null +++ b/include/spl/sys/vnode.h @@ -0,0 +1,204 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf <[email protected]>. 
+ * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see <http://zfsonlinux.org/>. + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef _SPL_VNODE_H +#define _SPL_VNODE_H + +#include <linux/module.h> +#include <linux/syscalls.h> +#include <linux/fcntl.h> +#include <linux/buffer_head.h> +#include <linux/dcache.h> +#include <linux/namei.h> +#include <linux/file.h> +#include <linux/fs.h> +#include <linux/fs_struct.h> +#include <linux/mount.h> +#include <sys/kmem.h> +#include <sys/mutex.h> +#include <sys/types.h> +#include <sys/time.h> +#include <sys/uio.h> +#include <sys/user.h> + +/* + * Prior to linux-2.6.33 only O_DSYNC semantics were implemented and + * they used the O_SYNC flag. As of linux-2.6.33 the this behavior + * was properly split in to O_SYNC and O_DSYNC respectively. + */ +#ifndef O_DSYNC +#define O_DSYNC O_SYNC +#endif + +#define FREAD 1 +#define FWRITE 2 +#define FCREAT O_CREAT +#define FTRUNC O_TRUNC +#define FOFFMAX O_LARGEFILE +#define FSYNC O_SYNC +#define FDSYNC O_DSYNC +#define FRSYNC O_SYNC +#define FEXCL O_EXCL +#define FDIRECT O_DIRECT +#define FAPPEND O_APPEND + +#define FNODSYNC 0x10000 /* fsync pseudo flag */ +#define FNOFOLLOW 0x20000 /* don't follow symlinks */ + +#define F_FREESP 11 /* Free file space */ + + +/* + * The vnode AT_ flags are mapped to the Linux ATTR_* flags. + * This allows them to be used safely with an iattr structure. + * The AT_XVATTR flag has been added and mapped to the upper + * bit range to avoid conflicting with the standard Linux set. 
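+ *
+ * As an illustrative sketch only (hypothetical caller, not part of this
+ * header): because the values are shared, a vattr_t mask built from the
+ * AT_* flags below can be tested directly against the Linux ATTR_* bits,
+ * e.g.
+ *
+ *	vattr_t va = { 0 };
+ *	va.va_mask = AT_MODE | AT_UID | AT_SIZE;
+ *	if (va.va_mask & ~ATTR_IATTR_MASK)
+ *		handle_extended_attrs(&va);	(hypothetical helper)
+ *
+ * i.e. any bit not covered by ATTR_IATTR_MASK, such as AT_XVATTR, needs
+ * handling beyond a plain Linux iattr.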
+ */ +#undef AT_UID +#undef AT_GID + +#define AT_MODE ATTR_MODE +#define AT_UID ATTR_UID +#define AT_GID ATTR_GID +#define AT_SIZE ATTR_SIZE +#define AT_ATIME ATTR_ATIME +#define AT_MTIME ATTR_MTIME +#define AT_CTIME ATTR_CTIME + +#define ATTR_XVATTR (1 << 31) +#define AT_XVATTR ATTR_XVATTR + +#define ATTR_IATTR_MASK (ATTR_MODE | ATTR_UID | ATTR_GID | ATTR_SIZE | \ + ATTR_ATIME | ATTR_MTIME | ATTR_CTIME | ATTR_FILE) + +#define CRCREAT 0x01 +#define RMFILE 0x02 + +#define B_INVAL 0x01 +#define B_TRUNC 0x02 + +#define LOOKUP_DIR 0x01 +#define LOOKUP_XATTR 0x02 +#define CREATE_XATTR_DIR 0x04 +#define ATTR_NOACLCHECK 0x20 + +typedef enum vtype { + VNON = 0, + VREG = 1, + VDIR = 2, + VBLK = 3, + VCHR = 4, + VLNK = 5, + VFIFO = 6, + VDOOR = 7, + VPROC = 8, + VSOCK = 9, + VPORT = 10, + VBAD = 11 +} vtype_t; + +typedef struct vattr { + enum vtype va_type; /* vnode type */ + uint_t va_mask; /* attribute bit-mask */ + ushort_t va_mode; /* acc mode */ + uid_t va_uid; /* owner uid */ + gid_t va_gid; /* owner gid */ + long va_fsid; /* fs id */ + long va_nodeid; /* node # */ + uint32_t va_nlink; /* # links */ + uint64_t va_size; /* file size */ + struct timespec va_atime; /* last acc */ + struct timespec va_mtime; /* last mod */ + struct timespec va_ctime; /* last chg */ + dev_t va_rdev; /* dev */ + uint64_t va_nblocks; /* space used */ + uint32_t va_blksize; /* block size */ + uint32_t va_seq; /* sequence */ + struct dentry *va_dentry; /* dentry to wire */ +} vattr_t; + +typedef struct vnode { + struct file *v_file; + kmutex_t v_lock; /* protects vnode fields */ + uint_t v_flag; /* vnode flags (see below) */ + uint_t v_count; /* reference count */ + void *v_data; /* private data for fs */ + struct vfs *v_vfsp; /* ptr to containing VFS */ + struct stdata *v_stream; /* associated stream */ + enum vtype v_type; /* vnode type */ + dev_t v_rdev; /* device (VCHR, VBLK) */ + gfp_t v_gfp_mask; /* original mapping gfp mask */ +} vnode_t; + +typedef struct vn_file { + int f_fd; /* linux fd for lookup */ + struct task_struct *f_task; /* linux task this fd belongs to */ + struct file *f_file; /* linux file struct */ + atomic_t f_ref; /* ref count */ + kmutex_t f_lock; /* struct lock */ + loff_t f_offset; /* offset */ + vnode_t *f_vnode; /* vnode */ + struct list_head f_list; /* list referenced file_t's */ +} file_t; + +extern vnode_t *vn_alloc(int flag); +void vn_free(vnode_t *vp); +extern vtype_t vn_mode_to_vtype(mode_t); +extern mode_t vn_vtype_to_mode(vtype_t); +extern int vn_open(const char *path, uio_seg_t seg, int flags, int mode, + vnode_t **vpp, int x1, void *x2); +extern int vn_openat(const char *path, uio_seg_t seg, int flags, int mode, + vnode_t **vpp, int x1, void *x2, vnode_t *vp, int fd); +extern int vn_rdwr(uio_rw_t uio, vnode_t *vp, void *addr, ssize_t len, + offset_t off, uio_seg_t seg, int x1, rlim64_t x2, + void *x3, ssize_t *residp); +extern int vn_close(vnode_t *vp, int flags, int x1, int x2, void *x3, void *x4); +extern int vn_seek(vnode_t *vp, offset_t o, offset_t *op, void *ct); + +extern int vn_getattr(vnode_t *vp, vattr_t *vap, int flags, void *x3, void *x4); +extern int vn_fsync(vnode_t *vp, int flags, void *x3, void *x4); +extern int vn_space(vnode_t *vp, int cmd, struct flock *bfp, int flag, + offset_t offset, void *x6, void *x7); +extern file_t *vn_getf(int fd); +extern void vn_releasef(int fd); +extern void vn_areleasef(int fd, uf_info_t *fip); +extern int vn_set_pwd(const char *filename); + +int spl_vn_init(void); +void spl_vn_fini(void); + +#define VOP_CLOSE vn_close +#define 
VOP_SEEK vn_seek +#define VOP_GETATTR vn_getattr +#define VOP_FSYNC vn_fsync +#define VOP_SPACE vn_space +#define VOP_PUTPAGE(vp, o, s, f, x1, x2) ((void)0) +#define vn_is_readonly(vp) 0 +#define getf vn_getf +#define releasef vn_releasef +#define areleasef vn_areleasef + +extern vnode_t *rootdir; + +#endif /* SPL_VNODE_H */ diff --git a/include/spl/sys/wait.h b/include/spl/sys/wait.h new file mode 100644 index 000000000..5311ff8b9 --- /dev/null +++ b/include/spl/sys/wait.h @@ -0,0 +1,55 @@ +/* + * Copyright (C) 2007-2014 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf <[email protected]>. + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see <http://zfsonlinux.org/>. + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef _SPL_WAIT_H +#define _SPL_WAIT_H + +#include <linux/sched.h> +#include <linux/wait.h> + +#ifndef HAVE_WAIT_ON_BIT_ACTION +#define spl_wait_on_bit(word, bit, mode) wait_on_bit(word, bit, mode) +#else + +static inline int +spl_bit_wait(void *word) +{ + schedule(); + return (0); +} + +#define spl_wait_on_bit(word, bit, mode) \ + wait_on_bit(word, bit, spl_bit_wait, mode) + +#endif /* HAVE_WAIT_ON_BIT_ACTION */ + +#ifdef HAVE_WAIT_QUEUE_ENTRY_T +typedef wait_queue_head_t spl_wait_queue_head_t; +typedef wait_queue_entry_t spl_wait_queue_entry_t; +#else +typedef wait_queue_head_t spl_wait_queue_head_t; +typedef wait_queue_t spl_wait_queue_entry_t; +#endif + +#endif /* SPL_WAIT_H */ diff --git a/include/spl/sys/zmod.h b/include/spl/sys/zmod.h new file mode 100644 index 000000000..95c1a3ed7 --- /dev/null +++ b/include/spl/sys/zmod.h @@ -0,0 +1,78 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf <[email protected]>. + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see <http://zfsonlinux.org/>. + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see <http://www.gnu.org/licenses/>. 
+ * + * + * z_compress_level/z_uncompress are nearly identical copies of the + * compress2/uncompress functions provided by the official zlib package + * available at http://zlib.net/. The only changes made we to slightly + * adapt the functions called to match the linux kernel implementation + * of zlib. The full zlib license follows: + * + * zlib.h -- interface of the 'zlib' general purpose compression library + * version 1.2.5, April 19th, 2010 + * + * Copyright (C) 1995-2010 Jean-loup Gailly and Mark Adler + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + * + * Jean-loup Gailly + * Mark Adler + */ + +#ifndef _SPL_ZMOD_H +#define _SPL_ZMOD_H + +#include <sys/types.h> +#include <linux/zlib.h> + +#ifdef HAVE_2ARGS_ZLIB_DEFLATE_WORKSPACESIZE +#define spl_zlib_deflate_workspacesize(wb, ml) \ + zlib_deflate_workspacesize(wb, ml) +#else +#define spl_zlib_deflate_workspacesize(wb, ml) \ + zlib_deflate_workspacesize() +#endif /* HAVE_2ARGS_ZLIB_DEFLATE_WORKSPACESIZE */ + +extern int z_compress_level(void *dest, size_t *destLen, const void *source, + size_t sourceLen, int level); +extern int z_uncompress(void *dest, size_t *destLen, const void *source, + size_t sourceLen); + +int spl_zlib_init(void); +void spl_zlib_fini(void); + +#endif /* SPL_ZMOD_H */ diff --git a/include/spl/sys/zone.h b/include/spl/sys/zone.h new file mode 100644 index 000000000..b2efd13b8 --- /dev/null +++ b/include/spl/sys/zone.h @@ -0,0 +1,36 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf <[email protected]>. + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see <http://zfsonlinux.org/>. + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#ifndef _SPL_ZONE_H +#define _SPL_ZONE_H + +#include <sys/byteorder.h> + +#define GLOBAL_ZONEID 0 + +#define zone_dataset_visible(x, y) (1) +#define crgetzoneid(x) (GLOBAL_ZONEID) +#define INGLOBALZONE(z) (1) + +#endif /* SPL_ZONE_H */ diff --git a/man/man5/spl-module-parameters.5 b/man/man5/spl-module-parameters.5 new file mode 100644 index 000000000..30d9fc754 --- /dev/null +++ b/man/man5/spl-module-parameters.5 @@ -0,0 +1,357 @@ +'\" te +.\" +.\" Copyright 2013 Turbo Fredriksson <[email protected]>. All rights reserved. +.\" +.TH SPL-MODULE-PARAMETERS 5 "Oct 28, 2017" +.SH NAME +spl\-module\-parameters \- SPL module parameters +.SH DESCRIPTION +.sp +.LP +Description of the different parameters to the SPL module. + +.SS "Module parameters" +.sp +.LP + +.sp +.ne 2 +.na +\fBspl_kmem_cache_expire\fR (uint) +.ad +.RS 12n +Cache expiration is part of default Illumos cache behavior. The idea is +that objects in magazines which have not been recently accessed should be +returned to the slabs periodically. This is known as cache aging and +when enabled objects will be typically returned after 15 seconds. +.sp +On the other hand Linux slabs are designed to never move objects back to +the slabs unless there is memory pressure. This is possible because under +Linux the cache will be notified when memory is low and objects can be +released. +.sp +By default only the Linux method is enabled. It has been shown to improve +responsiveness on low memory systems and not negatively impact the performance +of systems with more memory. This policy may be changed by setting the +\fBspl_kmem_cache_expire\fR bit mask as follows, both policies may be enabled +concurrently. +.sp +0x01 - Aging (Illumos), 0x02 - Low memory (Linux) +.sp +Default value: \fB0x02\fR +.RE + +.sp +.ne 2 +.na +\fBspl_kmem_cache_kmem_threads\fR (uint) +.ad +.RS 12n +The number of threads created for the spl_kmem_cache task queue. This task +queue is responsible for allocating new slabs for use by the kmem caches. +For the majority of systems and workloads only a small number of threads are +required. +.sp +Default value: \fB4\fR +.RE + +.sp +.ne 2 +.na +\fBspl_kmem_cache_reclaim\fR (uint) +.ad +.RS 12n +When this is set it prevents Linux from being able to rapidly reclaim all the +memory held by the kmem caches. This may be useful in circumstances where +it's preferable that Linux reclaim memory from some other subsystem first. +Setting this will increase the likelihood out of memory events on a memory +constrained system. +.sp +Default value: \fB0\fR +.RE + +.sp +.ne 2 +.na +\fBspl_kmem_cache_obj_per_slab\fR (uint) +.ad +.RS 12n +The preferred number of objects per slab in the cache. In general, a larger +value will increase the caches memory footprint while decreasing the time +required to perform an allocation. Conversely, a smaller value will minimize +the footprint and improve cache reclaim time but individual allocations may +take longer. +.sp +Default value: \fB8\fR +.RE + +.sp +.ne 2 +.na +\fBspl_kmem_cache_obj_per_slab_min\fR (uint) +.ad +.RS 12n +The minimum number of objects allowed per slab. Normally slabs will contain +\fBspl_kmem_cache_obj_per_slab\fR objects but for caches that contain very +large objects it's desirable to only have a few, or even just one, object per +slab. +.sp +Default value: \fB1\fR +.RE + +.sp +.ne 2 +.na +\fBspl_kmem_cache_max_size\fR (uint) +.ad +.RS 12n +The maximum size of a kmem cache slab in MiB. 
This effectively limits
+the maximum cache object size to \fBspl_kmem_cache_max_size\fR /
+\fBspl_kmem_cache_obj_per_slab\fR. Caches may not be created with
+objects sized larger than this limit.
+.sp
+Default value: \fB32 (64-bit) or 4 (32-bit)\fR
+.RE
+
+.sp
+.ne 2
+.na
+\fBspl_kmem_cache_slab_limit\fR (uint)
+.ad
+.RS 12n
+For small objects the Linux slab allocator should be used to make the most
+efficient use of the memory. However, large objects are not supported by
+the Linux slab and therefore the SPL implementation is preferred. This
+value is used to determine the cutoff between a small and large object.
+.sp
+Objects of \fBspl_kmem_cache_slab_limit\fR or smaller will be allocated
+using the Linux slab allocator, while larger objects use the SPL allocator. A
+cutoff of 16K was determined to be optimal for architectures using 4K pages.
+.sp
+Default value: \fB16,384\fR
+.RE
+
+.sp
+.ne 2
+.na
+\fBspl_kmem_cache_kmem_limit\fR (uint)
+.ad
+.RS 12n
+Depending on the size of a cache object it may be backed by kmalloc()'d
+or vmalloc()'d memory. This is because the size of the required allocation
+greatly impacts the best way to allocate the memory.
+.sp
+When objects are small and only a small number of memory pages need to be
+allocated, ideally just one, then kmalloc() is very efficient. However,
+when allocating multiple pages with kmalloc() it gets increasingly expensive
+because the pages must be physically contiguous.
+.sp
+For this reason we shift to vmalloc() for slabs of large objects, which
+removes the need for contiguous pages. We cannot use vmalloc() in
+all cases because there is significant locking overhead involved. This
+function takes a single global lock over the entire virtual address range
+which serializes all allocations. Using slightly different allocation
+functions for small and large objects allows us to handle a wide range of
+object sizes.
+.sp
+The \fBspl_kmem_cache_kmem_limit\fR value is used to determine this cutoff
+size. One quarter of PAGE_SIZE is used as the default value because
+\fBspl_kmem_cache_obj_per_slab\fR defaults to 16. This means that at
+most we will need to allocate four contiguous pages.
+.sp
+Default value: \fBPAGE_SIZE/4\fR
+.RE
+
+.sp
+.ne 2
+.na
+\fBspl_kmem_alloc_warn\fR (uint)
+.ad
+.RS 12n
+As a general rule kmem_alloc() allocations should be small, preferably
+just a few pages since they must be physically contiguous. Therefore, a
+rate limited warning will be printed to the console for any kmem_alloc()
+which exceeds a reasonable threshold.
+.sp
+The default warning threshold is set to eight pages but capped at 32K to
+accommodate systems using large pages. This value was selected to be small
+enough to ensure the largest allocations are quickly noticed and fixed,
+but large enough to avoid logging any warnings when an allocation size is
+larger than optimal but not a serious concern. Since this value is tunable,
+developers are encouraged to set it lower when testing so any new largish
+allocations are quickly caught. These warnings may be disabled by setting
+the threshold to zero.
+.sp
+Default value: \fB32,768\fR
+.RE
+
+.sp
+.ne 2
+.na
+\fBspl_kmem_alloc_max\fR (uint)
+.ad
+.RS 12n
+Large kmem_alloc() allocations will fail if they exceed KMALLOC_MAX_SIZE.
+Allocations which are marginally smaller than this limit may succeed but
+should still be avoided due to the expense of locating a contiguous range
+of free pages. Therefore, a maximum kmem size with a reasonable safety
+margin of 4x is set.
Kmem_alloc() allocations larger than this maximum +will quickly fail. Vmem_alloc() allocations less than or equal to this +value will use kmalloc(), but shift to vmalloc() when exceeding this value. +.sp +Default value: \fBKMALLOC_MAX_SIZE/4\fR +.RE + +.sp +.ne 2 +.na +\fBspl_kmem_cache_magazine_size\fR (uint) +.ad +.RS 12n +Cache magazines are an optimization designed to minimize the cost of +allocating memory. They do this by keeping a per-cpu cache of recently +freed objects, which can then be reallocated without taking a lock. This +can improve performance on highly contended caches. However, because +objects in magazines will prevent otherwise empty slabs from being +immediately released this may not be ideal for low memory machines. +.sp +For this reason \fBspl_kmem_cache_magazine_size\fR can be used to set a +maximum magazine size. When this value is set to 0 the magazine size will +be automatically determined based on the object size. Otherwise magazines +will be limited to 2-256 objects per magazine (i.e per cpu). Magazines +may never be entirely disabled in this implementation. +.sp +Default value: \fB0\fR +.RE + +.sp +.ne 2 +.na +\fBspl_hostid\fR (ulong) +.ad +.RS 12n +The system hostid, when set this can be used to uniquely identify a system. +By default this value is set to zero which indicates the hostid is disabled. +It can be explicitly enabled by placing a unique non-zero value in +\fB/etc/hostid/\fR. +.sp +Default value: \fB0\fR +.RE + +.sp +.ne 2 +.na +\fBspl_hostid_path\fR (charp) +.ad +.RS 12n +The expected path to locate the system hostid when specified. This value +may be overridden for non-standard configurations. +.sp +Default value: \fB/etc/hostid\fR +.RE + +.sp +.ne 2 +.na +\fBspl_panic_halt\fR (uint) +.ad +.RS 12n +Cause a kernel panic on assertion failures. When not enabled, the thread is +halted to facilitate further debugging. +.sp +Set to a non-zero value to enable. +.sp +Default value: \fB0\fR +.RE + +.sp +.ne 2 +.na +\fBspl_taskq_kick\fR (uint) +.ad +.RS 12n +Kick stuck taskq to spawn threads. When writing a non-zero value to it, it will +scan all the taskqs. If any of them have a pending task more than 5 seconds old, +it will kick it to spawn more threads. This can be used if you find a rare +deadlock occurs because one or more taskqs didn't spawn a thread when it should. +.sp +Default value: \fB0\fR +.RE + +.sp +.ne 2 +.na +\fBspl_taskq_thread_bind\fR (int) +.ad +.RS 12n +Bind taskq threads to specific CPUs. When enabled all taskq threads will +be distributed evenly over the available CPUs. By default, this behavior +is disabled to allow the Linux scheduler the maximum flexibility to determine +where a thread should run. +.sp +Default value: \fB0\fR +.RE + +.sp +.ne 2 +.na +\fBspl_taskq_thread_dynamic\fR (int) +.ad +.RS 12n +Allow dynamic taskqs. When enabled taskqs which set the TASKQ_DYNAMIC flag +will by default create only a single thread. New threads will be created on +demand up to a maximum allowed number to facilitate the completion of +outstanding tasks. Threads which are no longer needed will be promptly +destroyed. By default this behavior is enabled but it can be disabled to +aid performance analysis or troubleshooting. +.sp +Default value: \fB1\fR +.RE + +.sp +.ne 2 +.na +\fBspl_taskq_thread_priority\fR (int) +.ad +.RS 12n +Allow newly created taskq threads to set a non-default scheduler priority. +When enabled the priority specified when a taskq is created will be applied +to all threads created by that taskq. 
When disabled all threads will use +the default Linux kernel thread priority. By default, this behavior is +enabled. +.sp +Default value: \fB1\fR +.RE + +.sp +.ne 2 +.na +\fBspl_taskq_thread_sequential\fR (int) +.ad +.RS 12n +The number of items a taskq worker thread must handle without interruption +before requesting a new worker thread be spawned. This is used to control +how quickly taskqs ramp up the number of threads processing the queue. +Because Linux thread creation and destruction are relatively inexpensive a +small default value has been selected. This means that normally threads will +be created aggressively which is desirable. Increasing this value will +result in a slower thread creation rate which may be preferable for some +configurations. +.sp +Default value: \fB4\fR +.RE + +.sp +.ne 2 +.na +\fBspl_max_show_tasks\fR (uint) +.ad +.RS 12n +The maximum number of tasks per pending list in each taskq shown in +/proc/spl/{taskq,taskq-all}. Write 0 to turn off the limit. The proc file will +walk the lists with lock held, reading it could cause a lock up if the list +grow too large without limiting the output. "(truncated)" will be shown if the +list is larger than the limit. +.sp +Default value: \fB512\fR +.RE diff --git a/module/spl/THIRDPARTYLICENSE.gplv2 b/module/spl/THIRDPARTYLICENSE.gplv2 new file mode 100644 index 000000000..d159169d1 --- /dev/null +++ b/module/spl/THIRDPARTYLICENSE.gplv2 @@ -0,0 +1,339 @@ + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Lesser General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. 
+ + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. 
+ + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. (This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. 
+ +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. 
If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. 
+ + <one line to give the program's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) year name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. + + <signature of Ty Coon>, 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Lesser General +Public License instead of this License. diff --git a/module/spl/THIRDPARTYLICENSE.gplv2.descrip b/module/spl/THIRDPARTYLICENSE.gplv2.descrip new file mode 100644 index 000000000..78535a8ee --- /dev/null +++ b/module/spl/THIRDPARTYLICENSE.gplv2.descrip @@ -0,0 +1 @@ +COMPATIBILITY LAYER FOR OPENZFS ON LINUX diff --git a/module/spl/spl-atomic.c b/module/spl/spl-atomic.c new file mode 100644 index 000000000..47ed1886e --- /dev/null +++ b/module/spl/spl-atomic.c @@ -0,0 +1,36 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf <[email protected]>. + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see <http://zfsonlinux.org/>. + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. 
+ * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see <http://www.gnu.org/licenses/>. + * + * Solaris Porting Layer (SPL) Atomic Implementation. + */ + +#include <sys/atomic.h> + +#ifdef ATOMIC_SPINLOCK +/* Global atomic lock declarations */ +DEFINE_SPINLOCK(atomic32_lock); +DEFINE_SPINLOCK(atomic64_lock); + +EXPORT_SYMBOL(atomic32_lock); +EXPORT_SYMBOL(atomic64_lock); +#endif /* ATOMIC_SPINLOCK */ diff --git a/module/spl/spl-condvar.c b/module/spl/spl-condvar.c new file mode 100644 index 000000000..f0060bbdc --- /dev/null +++ b/module/spl/spl-condvar.c @@ -0,0 +1,410 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf <[email protected]>. + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see <http://zfsonlinux.org/>. + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see <http://www.gnu.org/licenses/>. + * + * Solaris Porting Layer (SPL) Credential Implementation. + */ + +#include <sys/condvar.h> +#include <sys/time.h> +#include <linux/hrtimer.h> + +void +__cv_init(kcondvar_t *cvp, char *name, kcv_type_t type, void *arg) +{ + ASSERT(cvp); + ASSERT(name == NULL); + ASSERT(type == CV_DEFAULT); + ASSERT(arg == NULL); + + cvp->cv_magic = CV_MAGIC; + init_waitqueue_head(&cvp->cv_event); + init_waitqueue_head(&cvp->cv_destroy); + atomic_set(&cvp->cv_waiters, 0); + atomic_set(&cvp->cv_refs, 1); + cvp->cv_mutex = NULL; +} +EXPORT_SYMBOL(__cv_init); + +static int +cv_destroy_wakeup(kcondvar_t *cvp) +{ + if (!atomic_read(&cvp->cv_waiters) && !atomic_read(&cvp->cv_refs)) { + ASSERT(cvp->cv_mutex == NULL); + ASSERT(!waitqueue_active(&cvp->cv_event)); + return (1); + } + + return (0); +} + +void +__cv_destroy(kcondvar_t *cvp) +{ + ASSERT(cvp); + ASSERT(cvp->cv_magic == CV_MAGIC); + + cvp->cv_magic = CV_DESTROY; + atomic_dec(&cvp->cv_refs); + + /* Block until all waiters are woken and references dropped. 
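+	 * The loop that follows re-checks cv_destroy_wakeup() with a one
+	 * tick timeout, so a wake-up on cv_destroy that races with the
+	 * final decrement of cv_waiters/cv_refs cannot leave this thread
+	 * blocked indefinitely.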
*/ + while (cv_destroy_wakeup(cvp) == 0) + wait_event_timeout(cvp->cv_destroy, cv_destroy_wakeup(cvp), 1); + + ASSERT3P(cvp->cv_mutex, ==, NULL); + ASSERT3S(atomic_read(&cvp->cv_refs), ==, 0); + ASSERT3S(atomic_read(&cvp->cv_waiters), ==, 0); + ASSERT3S(waitqueue_active(&cvp->cv_event), ==, 0); +} +EXPORT_SYMBOL(__cv_destroy); + +static void +cv_wait_common(kcondvar_t *cvp, kmutex_t *mp, int state, int io) +{ + DEFINE_WAIT(wait); + kmutex_t *m; + + ASSERT(cvp); + ASSERT(mp); + ASSERT(cvp->cv_magic == CV_MAGIC); + ASSERT(mutex_owned(mp)); + atomic_inc(&cvp->cv_refs); + + m = ACCESS_ONCE(cvp->cv_mutex); + if (!m) + m = xchg(&cvp->cv_mutex, mp); + /* Ensure the same mutex is used by all callers */ + ASSERT(m == NULL || m == mp); + + prepare_to_wait_exclusive(&cvp->cv_event, &wait, state); + atomic_inc(&cvp->cv_waiters); + + /* + * Mutex should be dropped after prepare_to_wait() this + * ensures we're linked in to the waiters list and avoids the + * race where 'cvp->cv_waiters > 0' but the list is empty. + */ + mutex_exit(mp); + if (io) + io_schedule(); + else + schedule(); + + /* No more waiters a different mutex could be used */ + if (atomic_dec_and_test(&cvp->cv_waiters)) { + /* + * This is set without any lock, so it's racy. But this is + * just for debug anyway, so make it best-effort + */ + cvp->cv_mutex = NULL; + wake_up(&cvp->cv_destroy); + } + + finish_wait(&cvp->cv_event, &wait); + atomic_dec(&cvp->cv_refs); + + /* + * Hold mutex after we release the cvp, otherwise we could dead lock + * with a thread holding the mutex and call cv_destroy. + */ + mutex_enter(mp); +} + +void +__cv_wait(kcondvar_t *cvp, kmutex_t *mp) +{ + cv_wait_common(cvp, mp, TASK_UNINTERRUPTIBLE, 0); +} +EXPORT_SYMBOL(__cv_wait); + +void +__cv_wait_io(kcondvar_t *cvp, kmutex_t *mp) +{ + cv_wait_common(cvp, mp, TASK_UNINTERRUPTIBLE, 1); +} +EXPORT_SYMBOL(__cv_wait_io); + +void +__cv_wait_sig(kcondvar_t *cvp, kmutex_t *mp) +{ + cv_wait_common(cvp, mp, TASK_INTERRUPTIBLE, 0); +} +EXPORT_SYMBOL(__cv_wait_sig); + +#if defined(HAVE_IO_SCHEDULE_TIMEOUT) +#define spl_io_schedule_timeout(t) io_schedule_timeout(t) +#else +static void +__cv_wakeup(unsigned long data) +{ + wake_up_process((struct task_struct *)data); +} + +static long +spl_io_schedule_timeout(long time_left) +{ + long expire_time = jiffies + time_left; + struct timer_list timer; + + init_timer(&timer); + setup_timer(&timer, __cv_wakeup, (unsigned long)current); + timer.expires = expire_time; + add_timer(&timer); + + io_schedule(); + + del_timer_sync(&timer); + time_left = expire_time - jiffies; + + return (time_left < 0 ? 0 : time_left); +} +#endif + +/* + * 'expire_time' argument is an absolute wall clock time in jiffies. + * Return value is time left (expire_time - now) or -1 if timeout occurred. 
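+ *
+ * Illustrative sketch only (hypothetical caller, and assuming a
+ * cv_timedwait() wrapper over this helper): because the deadline is
+ * absolute, it is computed once outside the retry loop rather than being
+ * re-armed after every spurious wake-up:
+ *
+ *	clock_t deadline = jiffies + msecs_to_jiffies(100);
+ *
+ *	mutex_enter(mp);
+ *	while (!condition_protected_by_mp &&
+ *	    cv_timedwait(cvp, mp, deadline) != -1)
+ *		continue;
+ *	mutex_exit(mp);
+ *
+ * where 'condition_protected_by_mp' stands in for whatever predicate the
+ * caller is waiting on.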
+ */ +static clock_t +__cv_timedwait_common(kcondvar_t *cvp, kmutex_t *mp, clock_t expire_time, + int state, int io) +{ + DEFINE_WAIT(wait); + kmutex_t *m; + clock_t time_left; + + ASSERT(cvp); + ASSERT(mp); + ASSERT(cvp->cv_magic == CV_MAGIC); + ASSERT(mutex_owned(mp)); + + /* XXX - Does not handle jiffie wrap properly */ + time_left = expire_time - jiffies; + if (time_left <= 0) + return (-1); + + atomic_inc(&cvp->cv_refs); + m = ACCESS_ONCE(cvp->cv_mutex); + if (!m) + m = xchg(&cvp->cv_mutex, mp); + /* Ensure the same mutex is used by all callers */ + ASSERT(m == NULL || m == mp); + + prepare_to_wait_exclusive(&cvp->cv_event, &wait, state); + atomic_inc(&cvp->cv_waiters); + + /* + * Mutex should be dropped after prepare_to_wait() this + * ensures we're linked in to the waiters list and avoids the + * race where 'cvp->cv_waiters > 0' but the list is empty. + */ + mutex_exit(mp); + if (io) + time_left = spl_io_schedule_timeout(time_left); + else + time_left = schedule_timeout(time_left); + + /* No more waiters a different mutex could be used */ + if (atomic_dec_and_test(&cvp->cv_waiters)) { + /* + * This is set without any lock, so it's racy. But this is + * just for debug anyway, so make it best-effort + */ + cvp->cv_mutex = NULL; + wake_up(&cvp->cv_destroy); + } + + finish_wait(&cvp->cv_event, &wait); + atomic_dec(&cvp->cv_refs); + + /* + * Hold mutex after we release the cvp, otherwise we could dead lock + * with a thread holding the mutex and call cv_destroy. + */ + mutex_enter(mp); + return (time_left > 0 ? time_left : -1); +} + +clock_t +__cv_timedwait(kcondvar_t *cvp, kmutex_t *mp, clock_t exp_time) +{ + return (__cv_timedwait_common(cvp, mp, exp_time, + TASK_UNINTERRUPTIBLE, 0)); +} +EXPORT_SYMBOL(__cv_timedwait); + +clock_t +__cv_timedwait_io(kcondvar_t *cvp, kmutex_t *mp, clock_t exp_time) +{ + return (__cv_timedwait_common(cvp, mp, exp_time, + TASK_UNINTERRUPTIBLE, 1)); +} +EXPORT_SYMBOL(__cv_timedwait_io); + +clock_t +__cv_timedwait_sig(kcondvar_t *cvp, kmutex_t *mp, clock_t exp_time) +{ + return (__cv_timedwait_common(cvp, mp, exp_time, + TASK_INTERRUPTIBLE, 0)); +} +EXPORT_SYMBOL(__cv_timedwait_sig); + +/* + * 'expire_time' argument is an absolute clock time in nanoseconds. + * Return value is time left (expire_time - now) or -1 if timeout occurred. + */ +static clock_t +__cv_timedwait_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t expire_time, + int state) +{ + DEFINE_WAIT(wait); + kmutex_t *m; + hrtime_t time_left; + ktime_t ktime_left; + + ASSERT(cvp); + ASSERT(mp); + ASSERT(cvp->cv_magic == CV_MAGIC); + ASSERT(mutex_owned(mp)); + + time_left = expire_time - gethrtime(); + if (time_left <= 0) + return (-1); + + atomic_inc(&cvp->cv_refs); + m = ACCESS_ONCE(cvp->cv_mutex); + if (!m) + m = xchg(&cvp->cv_mutex, mp); + /* Ensure the same mutex is used by all callers */ + ASSERT(m == NULL || m == mp); + + prepare_to_wait_exclusive(&cvp->cv_event, &wait, state); + atomic_inc(&cvp->cv_waiters); + + /* + * Mutex should be dropped after prepare_to_wait() this + * ensures we're linked in to the waiters list and avoids the + * race where 'cvp->cv_waiters > 0' but the list is empty. + */ + mutex_exit(mp); + /* + * Allow a 100 us range to give kernel an opportunity to coalesce + * interrupts + */ + ktime_left = ktime_set(0, time_left); + schedule_hrtimeout_range(&ktime_left, 100 * NSEC_PER_USEC, + HRTIMER_MODE_REL); + + /* No more waiters a different mutex could be used */ + if (atomic_dec_and_test(&cvp->cv_waiters)) { + /* + * This is set without any lock, so it's racy. 
But this is + * just for debug anyway, so make it best-effort + */ + cvp->cv_mutex = NULL; + wake_up(&cvp->cv_destroy); + } + + finish_wait(&cvp->cv_event, &wait); + atomic_dec(&cvp->cv_refs); + + mutex_enter(mp); + time_left = expire_time - gethrtime(); + return (time_left > 0 ? NSEC_TO_TICK(time_left) : -1); +} + +/* + * Compatibility wrapper for the cv_timedwait_hires() Illumos interface. + */ +static clock_t +cv_timedwait_hires_common(kcondvar_t *cvp, kmutex_t *mp, hrtime_t tim, + hrtime_t res, int flag, int state) +{ + if (res > 1) { + /* + * Align expiration to the specified resolution. + */ + if (flag & CALLOUT_FLAG_ROUNDUP) + tim += res - 1; + tim = (tim / res) * res; + } + + if (!(flag & CALLOUT_FLAG_ABSOLUTE)) + tim += gethrtime(); + + return (__cv_timedwait_hires(cvp, mp, tim, state)); +} + +clock_t +cv_timedwait_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t tim, hrtime_t res, + int flag) +{ + return (cv_timedwait_hires_common(cvp, mp, tim, res, flag, + TASK_UNINTERRUPTIBLE)); +} +EXPORT_SYMBOL(cv_timedwait_hires); + +clock_t +cv_timedwait_sig_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t tim, + hrtime_t res, int flag) +{ + return (cv_timedwait_hires_common(cvp, mp, tim, res, flag, + TASK_INTERRUPTIBLE)); +} +EXPORT_SYMBOL(cv_timedwait_sig_hires); + +void +__cv_signal(kcondvar_t *cvp) +{ + ASSERT(cvp); + ASSERT(cvp->cv_magic == CV_MAGIC); + atomic_inc(&cvp->cv_refs); + + /* + * All waiters are added with WQ_FLAG_EXCLUSIVE so only one + * waiter will be set runable with each call to wake_up(). + * Additionally wake_up() holds a spin_lock assoicated with + * the wait queue to ensure we don't race waking up processes. + */ + if (atomic_read(&cvp->cv_waiters) > 0) + wake_up(&cvp->cv_event); + + atomic_dec(&cvp->cv_refs); +} +EXPORT_SYMBOL(__cv_signal); + +void +__cv_broadcast(kcondvar_t *cvp) +{ + ASSERT(cvp); + ASSERT(cvp->cv_magic == CV_MAGIC); + atomic_inc(&cvp->cv_refs); + + /* + * Wake_up_all() will wake up all waiters even those which + * have the WQ_FLAG_EXCLUSIVE flag set. + */ + if (atomic_read(&cvp->cv_waiters) > 0) + wake_up_all(&cvp->cv_event); + + atomic_dec(&cvp->cv_refs); +} +EXPORT_SYMBOL(__cv_broadcast); diff --git a/module/spl/spl-cred.c b/module/spl/spl-cred.c new file mode 100644 index 000000000..ea3e903f9 --- /dev/null +++ b/module/spl/spl-cred.c @@ -0,0 +1,200 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf <[email protected]>. + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see <http://zfsonlinux.org/>. + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see <http://www.gnu.org/licenses/>. + * + * Solaris Porting Layer (SPL) Credential Implementation. 
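+ *
+ * These routines map the Illumos credential accessors (crgetuid(),
+ * crgetgid(), crgetgroups(), groupmember(), ...) onto the Linux
+ * 'struct cred' and its group_info.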
+ */ + +#include <sys/cred.h> + +static int +#ifdef HAVE_KUIDGID_T +cr_groups_search(const struct group_info *group_info, kgid_t grp) +#else +cr_groups_search(const struct group_info *group_info, gid_t grp) +#endif +{ + unsigned int left, right, mid; + int cmp; + + if (!group_info) + return (0); + + left = 0; + right = group_info->ngroups; + while (left < right) { + mid = (left + right) / 2; + cmp = KGID_TO_SGID(grp) - + KGID_TO_SGID(GROUP_AT(group_info, mid)); + + if (cmp > 0) + left = mid + 1; + else if (cmp < 0) + right = mid; + else + return (1); + } + return (0); +} + +/* Hold a reference on the credential */ +void +crhold(cred_t *cr) +{ + (void) get_cred((const cred_t *)cr); +} + +/* Free a reference on the credential */ +void +crfree(cred_t *cr) +{ + put_cred((const cred_t *)cr); +} + +/* Return the number of supplemental groups */ +int +crgetngroups(const cred_t *cr) +{ + struct group_info *gi; + int rc; + + gi = cr->group_info; + rc = gi->ngroups; +#ifndef HAVE_GROUP_INFO_GID + /* + * For Linux <= 4.8, + * crgetgroups will only returns gi->blocks[0], which contains only + * the first NGROUPS_PER_BLOCK groups. + */ + if (rc > NGROUPS_PER_BLOCK) { + WARN_ON_ONCE(1); + rc = NGROUPS_PER_BLOCK; + } +#endif + return (rc); +} + +/* + * Return an array of supplemental gids. The returned address is safe + * to use as long as the caller has taken a reference with crhold(). + * + * Linux 4.9 API change, group_info changed from 2d array via ->blocks to 1d + * array via ->gid. + */ +gid_t * +crgetgroups(const cred_t *cr) +{ + struct group_info *gi; + gid_t *gids = NULL; + + gi = cr->group_info; +#ifdef HAVE_GROUP_INFO_GID + gids = KGIDP_TO_SGIDP(gi->gid); +#else + if (gi->nblocks > 0) + gids = KGIDP_TO_SGIDP(gi->blocks[0]); +#endif + return (gids); +} + +/* Check if the passed gid is available in supplied credential. */ +int +groupmember(gid_t gid, const cred_t *cr) +{ + struct group_info *gi; + int rc; + + gi = cr->group_info; + rc = cr_groups_search(gi, SGID_TO_KGID(gid)); + + return (rc); +} + +/* Return the effective user id */ +uid_t +crgetuid(const cred_t *cr) +{ + return (KUID_TO_SUID(cr->euid)); +} + +/* Return the real user id */ +uid_t +crgetruid(const cred_t *cr) +{ + return (KUID_TO_SUID(cr->uid)); +} + +/* Return the saved user id */ +uid_t +crgetsuid(const cred_t *cr) +{ + return (KUID_TO_SUID(cr->suid)); +} + +/* Return the filesystem user id */ +uid_t +crgetfsuid(const cred_t *cr) +{ + return (KUID_TO_SUID(cr->fsuid)); +} + +/* Return the effective group id */ +gid_t +crgetgid(const cred_t *cr) +{ + return (KGID_TO_SGID(cr->egid)); +} + +/* Return the real group id */ +gid_t +crgetrgid(const cred_t *cr) +{ + return (KGID_TO_SGID(cr->gid)); +} + +/* Return the saved group id */ +gid_t +crgetsgid(const cred_t *cr) +{ + return (KGID_TO_SGID(cr->sgid)); +} + +/* Return the filesystem group id */ +gid_t +crgetfsgid(const cred_t *cr) +{ + return (KGID_TO_SGID(cr->fsgid)); +} + +EXPORT_SYMBOL(crhold); +EXPORT_SYMBOL(crfree); +EXPORT_SYMBOL(crgetuid); +EXPORT_SYMBOL(crgetruid); +EXPORT_SYMBOL(crgetsuid); +EXPORT_SYMBOL(crgetfsuid); +EXPORT_SYMBOL(crgetgid); +EXPORT_SYMBOL(crgetrgid); +EXPORT_SYMBOL(crgetsgid); +EXPORT_SYMBOL(crgetfsgid); +EXPORT_SYMBOL(crgetngroups); +EXPORT_SYMBOL(crgetgroups); +EXPORT_SYMBOL(groupmember); diff --git a/module/spl/spl-err.c b/module/spl/spl-err.c new file mode 100644 index 000000000..6b71296e8 --- /dev/null +++ b/module/spl/spl-err.c @@ -0,0 +1,133 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. 
+ * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf <[email protected]>. + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see <http://zfsonlinux.org/>. + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see <http://www.gnu.org/licenses/>. + * + * Solaris Porting Layer (SPL) Error Implementation. + */ + +#include <sys/sysmacros.h> +#include <sys/cmn_err.h> +#include <linux/ratelimit.h> + +/* + * It is often useful to actually have the panic crash the node so you + * can then get notified of the event, get the crashdump for later + * analysis and other such goodies. + * But we would still default to the current default of not to do that. + */ +/* BEGIN CSTYLED */ +unsigned int spl_panic_halt; +module_param(spl_panic_halt, uint, 0644); +MODULE_PARM_DESC(spl_panic_halt, "Cause kernel panic on assertion failures"); +/* END CSTYLED */ + +/* + * Limit the number of stack traces dumped to not more than 5 every + * 60 seconds to prevent denial-of-service attacks from debug code. + */ +DEFINE_RATELIMIT_STATE(dumpstack_ratelimit_state, 60 * HZ, 5); + +void +spl_dumpstack(void) +{ + if (__ratelimit(&dumpstack_ratelimit_state)) { + printk("Showing stack for process %d\n", current->pid); + dump_stack(); + } +} +EXPORT_SYMBOL(spl_dumpstack); + +int +spl_panic(const char *file, const char *func, int line, const char *fmt, ...) +{ + const char *newfile; + char msg[MAXMSGLEN]; + va_list ap; + + newfile = strrchr(file, '/'); + if (newfile != NULL) + newfile = newfile + 1; + else + newfile = file; + + va_start(ap, fmt); + (void) vsnprintf(msg, sizeof (msg), fmt, ap); + va_end(ap); + + printk(KERN_EMERG "%s", msg); + printk(KERN_EMERG "PANIC at %s:%d:%s()\n", newfile, line, func); + if (spl_panic_halt) + panic("%s", msg); + + spl_dumpstack(); + + /* Halt the thread to facilitate further debugging */ + set_current_state(TASK_UNINTERRUPTIBLE); + while (1) + schedule(); + + /* Unreachable */ + return (1); +} +EXPORT_SYMBOL(spl_panic); + +void +vcmn_err(int ce, const char *fmt, va_list ap) +{ + char msg[MAXMSGLEN]; + + vsnprintf(msg, MAXMSGLEN - 1, fmt, ap); + + switch (ce) { + case CE_IGNORE: + break; + case CE_CONT: + printk("%s", msg); + break; + case CE_NOTE: + printk(KERN_NOTICE "NOTICE: %s\n", msg); + break; + case CE_WARN: + printk(KERN_WARNING "WARNING: %s\n", msg); + break; + case CE_PANIC: + printk(KERN_EMERG "PANIC: %s\n", msg); + spl_dumpstack(); + + /* Halt the thread to facilitate further debugging */ + set_current_state(TASK_UNINTERRUPTIBLE); + while (1) + schedule(); + } +} /* vcmn_err() */ +EXPORT_SYMBOL(vcmn_err); + +void +cmn_err(int ce, const char *fmt, ...) 
+{ + va_list ap; + + va_start(ap, fmt); + vcmn_err(ce, fmt, ap); + va_end(ap); +} /* cmn_err() */ +EXPORT_SYMBOL(cmn_err); diff --git a/module/spl/spl-generic.c b/module/spl/spl-generic.c new file mode 100644 index 000000000..b38fe254c --- /dev/null +++ b/module/spl/spl-generic.c @@ -0,0 +1,775 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf <[email protected]>. + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see <http://zfsonlinux.org/>. + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see <http://www.gnu.org/licenses/>. + * + * Solaris Porting Layer (SPL) Generic Implementation. + */ + +#include <sys/sysmacros.h> +#include <sys/systeminfo.h> +#include <sys/vmsystm.h> +#include <sys/kobj.h> +#include <sys/kmem.h> +#include <sys/kmem_cache.h> +#include <sys/vmem.h> +#include <sys/mutex.h> +#include <sys/rwlock.h> +#include <sys/taskq.h> +#include <sys/tsd.h> +#include <sys/zmod.h> +#include <sys/debug.h> +#include <sys/proc.h> +#include <sys/kstat.h> +#include <sys/file.h> +#include <linux/ctype.h> +#include <sys/disp.h> +#include <sys/random.h> +#include <sys/strings.h> +#include <linux/kmod.h> + +char spl_version[32] = "SPL v" SPL_META_VERSION "-" SPL_META_RELEASE; +EXPORT_SYMBOL(spl_version); + +/* BEGIN CSTYLED */ +unsigned long spl_hostid = 0; +EXPORT_SYMBOL(spl_hostid); +module_param(spl_hostid, ulong, 0644); +MODULE_PARM_DESC(spl_hostid, "The system hostid."); +/* END CSTYLED */ + +proc_t p0; +EXPORT_SYMBOL(p0); + +/* + * Xorshift Pseudo Random Number Generator based on work by Sebastiano Vigna + * + * "Further scramblings of Marsaglia's xorshift generators" + * http://vigna.di.unimi.it/ftp/papers/xorshiftplus.pdf + * + * random_get_pseudo_bytes() is an API function on Illumos whose sole purpose + * is to provide bytes containing random numbers. It is mapped to /dev/urandom + * on Illumos, which uses a "FIPS 186-2 algorithm". No user of the SPL's + * random_get_pseudo_bytes() needs bytes that are of cryptographic quality, so + * we can implement it using a fast PRNG that we seed using Linux' actual + * equivalent to random_get_pseudo_bytes(). We do this by providing each CPU + * with an independent seed so that all calls to random_get_pseudo_bytes() are + * free of atomic instructions. + * + * A consequence of using a fast PRNG is that using random_get_pseudo_bytes() + * to generate words larger than 128 bits will paradoxically be limited to + * `2^128 - 1` possibilities. This is because we have a sequence of `2^128 - 1` + * 128-bit words and selecting the first will implicitly select the second. If + * a caller finds this behavior undesireable, random_get_bytes() should be used + * instead. 
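+ *
+ * An illustrative (hypothetical) caller simply fills a buffer:
+ *
+ *     uint8_t nonce[16];
+ *     (void) random_get_pseudo_bytes(nonce, sizeof (nonce));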
+ * + * XXX: Linux interrupt handlers that trigger within the critical section + * formed by `s[1] = xp[1];` and `xp[0] = s[0];` and call this function will + * see the same numbers. Nothing in the code currently calls this in an + * interrupt handler, so this is considered to be okay. If that becomes a + * problem, we could create a set of per-cpu variables for interrupt handlers + * and use them when in_interrupt() from linux/preempt_mask.h evaluates to + * true. + */ +static DEFINE_PER_CPU(uint64_t[2], spl_pseudo_entropy); + +/* + * spl_rand_next()/spl_rand_jump() are copied from the following CC-0 licensed + * file: + * + * http://xorshift.di.unimi.it/xorshift128plus.c + */ + +static inline uint64_t +spl_rand_next(uint64_t *s) +{ + uint64_t s1 = s[0]; + const uint64_t s0 = s[1]; + s[0] = s0; + s1 ^= s1 << 23; // a + s[1] = s1 ^ s0 ^ (s1 >> 18) ^ (s0 >> 5); // b, c + return (s[1] + s0); +} + +static inline void +spl_rand_jump(uint64_t *s) +{ + static const uint64_t JUMP[] = + { 0x8a5cd789635d2dff, 0x121fd2155c472f96 }; + + uint64_t s0 = 0; + uint64_t s1 = 0; + int i, b; + for (i = 0; i < sizeof (JUMP) / sizeof (*JUMP); i++) + for (b = 0; b < 64; b++) { + if (JUMP[i] & 1ULL << b) { + s0 ^= s[0]; + s1 ^= s[1]; + } + (void) spl_rand_next(s); + } + + s[0] = s0; + s[1] = s1; +} + +int +random_get_pseudo_bytes(uint8_t *ptr, size_t len) +{ + uint64_t *xp, s[2]; + + ASSERT(ptr); + + xp = get_cpu_var(spl_pseudo_entropy); + + s[0] = xp[0]; + s[1] = xp[1]; + + while (len) { + union { + uint64_t ui64; + uint8_t byte[sizeof (uint64_t)]; + }entropy; + int i = MIN(len, sizeof (uint64_t)); + + len -= i; + entropy.ui64 = spl_rand_next(s); + + while (i--) + *ptr++ = entropy.byte[i]; + } + + xp[0] = s[0]; + xp[1] = s[1]; + + put_cpu_var(spl_pseudo_entropy); + + return (0); +} + + +EXPORT_SYMBOL(random_get_pseudo_bytes); + +#if BITS_PER_LONG == 32 +/* + * Support 64/64 => 64 division on a 32-bit platform. While the kernel + * provides a div64_u64() function for this we do not use it because the + * implementation is flawed. There are cases which return incorrect + * results as late as linux-2.6.35. Until this is fixed upstream the + * spl must provide its own implementation. + * + * This implementation is a slightly modified version of the algorithm + * proposed by the book 'Hacker's Delight'. The original source can be + * found here and is available for use without restriction. + * + * http://www.hackersdelight.org/HDcode/newCode/divDouble.c + */ + +/* + * Calculate number of leading of zeros for a 64-bit value. + */ +static int +nlz64(uint64_t x) +{ + register int n = 0; + + if (x == 0) + return (64); + + if (x <= 0x00000000FFFFFFFFULL) { n = n + 32; x = x << 32; } + if (x <= 0x0000FFFFFFFFFFFFULL) { n = n + 16; x = x << 16; } + if (x <= 0x00FFFFFFFFFFFFFFULL) { n = n + 8; x = x << 8; } + if (x <= 0x0FFFFFFFFFFFFFFFULL) { n = n + 4; x = x << 4; } + if (x <= 0x3FFFFFFFFFFFFFFFULL) { n = n + 2; x = x << 2; } + if (x <= 0x7FFFFFFFFFFFFFFFULL) { n = n + 1; } + + return (n); +} + +/* + * Newer kernels have a div_u64() function but we define our own + * to simplify portibility between kernel versions. + */ +static inline uint64_t +__div_u64(uint64_t u, uint32_t v) +{ + (void) do_div(u, v); + return (u); +} + +/* + * Implementation of 64-bit unsigned division for 32-bit machines. + * + * First the procedure takes care of the case in which the divisor is a + * 32-bit quantity. 
There are two subcases: (1) If the left half of the + * dividend is less than the divisor, one execution of do_div() is all that + * is required (overflow is not possible). (2) Otherwise it does two + * divisions, using the grade school method. + */ +uint64_t +__udivdi3(uint64_t u, uint64_t v) +{ + uint64_t u0, u1, v1, q0, q1, k; + int n; + + if (v >> 32 == 0) { // If v < 2**32: + if (u >> 32 < v) { // If u/v cannot overflow, + return (__div_u64(u, v)); // just do one division. + } else { // If u/v would overflow: + u1 = u >> 32; // Break u into two halves. + u0 = u & 0xFFFFFFFF; + q1 = __div_u64(u1, v); // First quotient digit. + k = u1 - q1 * v; // First remainder, < v. + u0 += (k << 32); + q0 = __div_u64(u0, v); // Seconds quotient digit. + return ((q1 << 32) + q0); + } + } else { // If v >= 2**32: + n = nlz64(v); // 0 <= n <= 31. + v1 = (v << n) >> 32; // Normalize divisor, MSB is 1. + u1 = u >> 1; // To ensure no overflow. + q1 = __div_u64(u1, v1); // Get quotient from + q0 = (q1 << n) >> 31; // Undo normalization and + // division of u by 2. + if (q0 != 0) // Make q0 correct or + q0 = q0 - 1; // too small by 1. + if ((u - q0 * v) >= v) + q0 = q0 + 1; // Now q0 is correct. + + return (q0); + } +} +EXPORT_SYMBOL(__udivdi3); + +/* BEGIN CSTYLED */ +#ifndef abs64 +#define abs64(x) ({ uint64_t t = (x) >> 63; ((x) ^ t) - t; }) +#endif +/* END CSTYLED */ + +/* + * Implementation of 64-bit signed division for 32-bit machines. + */ +int64_t +__divdi3(int64_t u, int64_t v) +{ + int64_t q, t; + q = __udivdi3(abs64(u), abs64(v)); + t = (u ^ v) >> 63; // If u, v have different + return ((q ^ t) - t); // signs, negate q. +} +EXPORT_SYMBOL(__divdi3); + +/* + * Implementation of 64-bit unsigned modulo for 32-bit machines. + */ +uint64_t +__umoddi3(uint64_t dividend, uint64_t divisor) +{ + return (dividend - (divisor * __udivdi3(dividend, divisor))); +} +EXPORT_SYMBOL(__umoddi3); + +/* + * Implementation of 64-bit unsigned division/modulo for 32-bit machines. + */ +uint64_t +__udivmoddi4(uint64_t n, uint64_t d, uint64_t *r) +{ + uint64_t q = __udivdi3(n, d); + if (r) + *r = n - d * q; + return (q); +} +EXPORT_SYMBOL(__udivmoddi4); + +/* + * Implementation of 64-bit signed division/modulo for 32-bit machines. + */ +int64_t +__divmoddi4(int64_t n, int64_t d, int64_t *r) +{ + int64_t q, rr; + boolean_t nn = B_FALSE; + boolean_t nd = B_FALSE; + if (n < 0) { + nn = B_TRUE; + n = -n; + } + if (d < 0) { + nd = B_TRUE; + d = -d; + } + + q = __udivmoddi4(n, d, (uint64_t *)&rr); + + if (nn != nd) + q = -q; + if (nn) + rr = -rr; + if (r) + *r = rr; + return (q); +} +EXPORT_SYMBOL(__divmoddi4); + +#if defined(__arm) || defined(__arm__) +/* + * Implementation of 64-bit (un)signed division for 32-bit arm machines. + * + * Run-time ABI for the ARM Architecture (page 20). A pair of (unsigned) + * long longs is returned in {{r0, r1}, {r2,r3}}, the quotient in {r0, r1}, + * and the remainder in {r2, r3}. The return type is specifically left + * set to 'void' to ensure the compiler does not overwrite these registers + * during the return. 
All results are in registers as per ABI + */ +void +__aeabi_uldivmod(uint64_t u, uint64_t v) +{ + uint64_t res; + uint64_t mod; + + res = __udivdi3(u, v); + mod = __umoddi3(u, v); + { + register uint32_t r0 asm("r0") = (res & 0xFFFFFFFF); + register uint32_t r1 asm("r1") = (res >> 32); + register uint32_t r2 asm("r2") = (mod & 0xFFFFFFFF); + register uint32_t r3 asm("r3") = (mod >> 32); + + /* BEGIN CSTYLED */ + asm volatile("" + : "+r"(r0), "+r"(r1), "+r"(r2),"+r"(r3) /* output */ + : "r"(r0), "r"(r1), "r"(r2), "r"(r3)); /* input */ + /* END CSTYLED */ + + return; /* r0; */ + } +} +EXPORT_SYMBOL(__aeabi_uldivmod); + +void +__aeabi_ldivmod(int64_t u, int64_t v) +{ + int64_t res; + uint64_t mod; + + res = __divdi3(u, v); + mod = __umoddi3(u, v); + { + register uint32_t r0 asm("r0") = (res & 0xFFFFFFFF); + register uint32_t r1 asm("r1") = (res >> 32); + register uint32_t r2 asm("r2") = (mod & 0xFFFFFFFF); + register uint32_t r3 asm("r3") = (mod >> 32); + + /* BEGIN CSTYLED */ + asm volatile("" + : "+r"(r0), "+r"(r1), "+r"(r2),"+r"(r3) /* output */ + : "r"(r0), "r"(r1), "r"(r2), "r"(r3)); /* input */ + /* END CSTYLED */ + + return; /* r0; */ + } +} +EXPORT_SYMBOL(__aeabi_ldivmod); +#endif /* __arm || __arm__ */ +#endif /* BITS_PER_LONG */ + +/* + * NOTE: The strtoxx behavior is solely based on my reading of the Solaris + * ddi_strtol(9F) man page. I have not verified the behavior of these + * functions against their Solaris counterparts. It is possible that I + * may have misinterpreted the man page or the man page is incorrect. + */ +int ddi_strtoul(const char *, char **, int, unsigned long *); +int ddi_strtol(const char *, char **, int, long *); +int ddi_strtoull(const char *, char **, int, unsigned long long *); +int ddi_strtoll(const char *, char **, int, long long *); + +#define define_ddi_strtoux(type, valtype) \ +int ddi_strtou##type(const char *str, char **endptr, \ + int base, valtype *result) \ +{ \ + valtype last_value, value = 0; \ + char *ptr = (char *)str; \ + int flag = 1, digit; \ + \ + if (strlen(ptr) == 0) \ + return (EINVAL); \ + \ + /* Auto-detect base based on prefix */ \ + if (!base) { \ + if (str[0] == '0') { \ + if (tolower(str[1]) == 'x' && isxdigit(str[2])) { \ + base = 16; /* hex */ \ + ptr += 2; \ + } else if (str[1] >= '0' && str[1] < 8) { \ + base = 8; /* octal */ \ + ptr += 1; \ + } else { \ + return (EINVAL); \ + } \ + } else { \ + base = 10; /* decimal */ \ + } \ + } \ + \ + while (1) { \ + if (isdigit(*ptr)) \ + digit = *ptr - '0'; \ + else if (isalpha(*ptr)) \ + digit = tolower(*ptr) - 'a' + 10; \ + else \ + break; \ + \ + if (digit >= base) \ + break; \ + \ + last_value = value; \ + value = value * base + digit; \ + if (last_value > value) /* Overflow */ \ + return (ERANGE); \ + \ + flag = 1; \ + ptr++; \ + } \ + \ + if (flag) \ + *result = value; \ + \ + if (endptr) \ + *endptr = (char *)(flag ? 
ptr : str); \ + \ + return (0); \ +} \ + +#define define_ddi_strtox(type, valtype) \ +int ddi_strto##type(const char *str, char **endptr, \ + int base, valtype *result) \ +{ \ + int rc; \ + \ + if (*str == '-') { \ + rc = ddi_strtou##type(str + 1, endptr, base, result); \ + if (!rc) { \ + if (*endptr == str + 1) \ + *endptr = (char *)str; \ + else \ + *result = -*result; \ + } \ + } else { \ + rc = ddi_strtou##type(str, endptr, base, result); \ + } \ + \ + return (rc); \ +} + +define_ddi_strtoux(l, unsigned long) +define_ddi_strtox(l, long) +define_ddi_strtoux(ll, unsigned long long) +define_ddi_strtox(ll, long long) + +EXPORT_SYMBOL(ddi_strtoul); +EXPORT_SYMBOL(ddi_strtol); +EXPORT_SYMBOL(ddi_strtoll); +EXPORT_SYMBOL(ddi_strtoull); + +int +ddi_copyin(const void *from, void *to, size_t len, int flags) +{ + /* Fake ioctl() issued by kernel, 'from' is a kernel address */ + if (flags & FKIOCTL) { + memcpy(to, from, len); + return (0); + } + + return (copyin(from, to, len)); +} +EXPORT_SYMBOL(ddi_copyin); + +int +ddi_copyout(const void *from, void *to, size_t len, int flags) +{ + /* Fake ioctl() issued by kernel, 'from' is a kernel address */ + if (flags & FKIOCTL) { + memcpy(to, from, len); + return (0); + } + + return (copyout(from, to, len)); +} +EXPORT_SYMBOL(ddi_copyout); + +/* + * Read the unique system identifier from the /etc/hostid file. + * + * The behavior of /usr/bin/hostid on Linux systems with the + * regular eglibc and coreutils is: + * + * 1. Generate the value if the /etc/hostid file does not exist + * or if the /etc/hostid file is less than four bytes in size. + * + * 2. If the /etc/hostid file is at least 4 bytes, then return + * the first four bytes [0..3] in native endian order. + * + * 3. Always ignore bytes [4..] if they exist in the file. + * + * Only the first four bytes are significant, even on systems that + * have a 64-bit word size. + * + * See: + * + * eglibc: sysdeps/unix/sysv/linux/gethostid.c + * coreutils: src/hostid.c + * + * Notes: + * + * The /etc/hostid file on Solaris is a text file that often reads: + * + * # DO NOT EDIT + * "0123456789" + * + * Directly copying this file to Linux results in a constant + * hostid of 4f442023 because the default comment constitutes + * the first four bytes of the file. + * + */ + +char *spl_hostid_path = HW_HOSTID_PATH; +module_param(spl_hostid_path, charp, 0444); +MODULE_PARM_DESC(spl_hostid_path, "The system hostid file (/etc/hostid)"); + +static int +hostid_read(uint32_t *hostid) +{ + uint64_t size; + struct _buf *file; + uint32_t value = 0; + int error; + + file = kobj_open_file(spl_hostid_path); + if (file == (struct _buf *)-1) + return (ENOENT); + + error = kobj_get_filesize(file, &size); + if (error) { + kobj_close_file(file); + return (error); + } + + if (size < sizeof (HW_HOSTID_MASK)) { + kobj_close_file(file); + return (EINVAL); + } + + /* + * Read directly into the variable like eglibc does. + * Short reads are okay; native behavior is preserved. + */ + error = kobj_read_file(file, (char *)&value, sizeof (value), 0); + if (error < 0) { + kobj_close_file(file); + return (EIO); + } + + /* Mask down to 32 bits like coreutils does. */ + *hostid = (value & HW_HOSTID_MASK); + kobj_close_file(file); + + return (0); +} + +/* + * Return the system hostid. Preferentially use the spl_hostid module option + * when set, otherwise use the value in the /etc/hostid file. 
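+ *
+ * For example (illustrative usage), the module option may be given at
+ * load time and takes precedence over any /etc/hostid contents:
+ *
+ *     modprobe spl spl_hostid=0x00bab10c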
+ */ +uint32_t +zone_get_hostid(void *zone) +{ + uint32_t hostid; + + ASSERT3P(zone, ==, NULL); + + if (spl_hostid != 0) + return ((uint32_t)(spl_hostid & HW_HOSTID_MASK)); + + if (hostid_read(&hostid) == 0) + return (hostid); + + return (0); +} +EXPORT_SYMBOL(zone_get_hostid); + +static int +spl_kvmem_init(void) +{ + int rc = 0; + + rc = spl_kmem_init(); + if (rc) + return (rc); + + rc = spl_vmem_init(); + if (rc) { + spl_kmem_fini(); + return (rc); + } + + return (rc); +} + +/* + * We initialize the random number generator with 128 bits of entropy from the + * system random number generator. In the improbable case that we have a zero + * seed, we fallback to the system jiffies, unless it is also zero, in which + * situation we use a preprogrammed seed. We step forward by 2^64 iterations to + * initialize each of the per-cpu seeds so that the sequences generated on each + * CPU are guaranteed to never overlap in practice. + */ +static void __init +spl_random_init(void) +{ + uint64_t s[2]; + int i; + + get_random_bytes(s, sizeof (s)); + + if (s[0] == 0 && s[1] == 0) { + if (jiffies != 0) { + s[0] = jiffies; + s[1] = ~0 - jiffies; + } else { + (void) memcpy(s, "improbable seed", sizeof (s)); + } + printk("SPL: get_random_bytes() returned 0 " + "when generating random seed. Setting initial seed to " + "0x%016llx%016llx.", cpu_to_be64(s[0]), cpu_to_be64(s[1])); + } + + for_each_possible_cpu(i) { + uint64_t *wordp = per_cpu(spl_pseudo_entropy, i); + + spl_rand_jump(s); + + wordp[0] = s[0]; + wordp[1] = s[1]; + } +} + +static void +spl_kvmem_fini(void) +{ + spl_vmem_fini(); + spl_kmem_fini(); +} + +static int __init +spl_init(void) +{ + int rc = 0; + + bzero(&p0, sizeof (proc_t)); + spl_random_init(); + + if ((rc = spl_kvmem_init())) + goto out1; + + if ((rc = spl_mutex_init())) + goto out2; + + if ((rc = spl_rw_init())) + goto out3; + + if ((rc = spl_tsd_init())) + goto out4; + + if ((rc = spl_taskq_init())) + goto out5; + + if ((rc = spl_kmem_cache_init())) + goto out6; + + if ((rc = spl_vn_init())) + goto out7; + + if ((rc = spl_proc_init())) + goto out8; + + if ((rc = spl_kstat_init())) + goto out9; + + if ((rc = spl_zlib_init())) + goto out10; + + printk(KERN_NOTICE "SPL: Loaded module v%s-%s%s\n", SPL_META_VERSION, + SPL_META_RELEASE, SPL_DEBUG_STR); + return (rc); + +out10: + spl_kstat_fini(); +out9: + spl_proc_fini(); +out8: + spl_vn_fini(); +out7: + spl_kmem_cache_fini(); +out6: + spl_taskq_fini(); +out5: + spl_tsd_fini(); +out4: + spl_rw_fini(); +out3: + spl_mutex_fini(); +out2: + spl_kvmem_fini(); +out1: + printk(KERN_NOTICE "SPL: Failed to Load Solaris Porting Layer " + "v%s-%s%s, rc = %d\n", SPL_META_VERSION, SPL_META_RELEASE, + SPL_DEBUG_STR, rc); + + return (rc); +} + +static void __exit +spl_fini(void) +{ + printk(KERN_NOTICE "SPL: Unloaded module v%s-%s%s\n", + SPL_META_VERSION, SPL_META_RELEASE, SPL_DEBUG_STR); + spl_zlib_fini(); + spl_kstat_fini(); + spl_proc_fini(); + spl_vn_fini(); + spl_kmem_cache_fini(); + spl_taskq_fini(); + spl_tsd_fini(); + spl_rw_fini(); + spl_mutex_fini(); + spl_kvmem_fini(); +} + +module_init(spl_init); +module_exit(spl_fini); + +MODULE_DESCRIPTION("Solaris Porting Layer"); +MODULE_AUTHOR(SPL_META_AUTHOR); +MODULE_LICENSE(SPL_META_LICENSE); +MODULE_VERSION(SPL_META_VERSION "-" SPL_META_RELEASE); diff --git a/module/spl/spl-kmem-cache.c b/module/spl/spl-kmem-cache.c new file mode 100644 index 000000000..5492c6a46 --- /dev/null +++ b/module/spl/spl-kmem-cache.c @@ -0,0 +1,1769 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, 
LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf <[email protected]>. + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see <http://zfsonlinux.org/>. + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <sys/kmem.h> +#include <sys/kmem_cache.h> +#include <sys/shrinker.h> +#include <sys/taskq.h> +#include <sys/timer.h> +#include <sys/vmem.h> +#include <sys/wait.h> +#include <linux/slab.h> +#include <linux/swap.h> +#include <linux/prefetch.h> + +/* + * Within the scope of spl-kmem.c file the kmem_cache_* definitions + * are removed to allow access to the real Linux slab allocator. + */ +#undef kmem_cache_destroy +#undef kmem_cache_create +#undef kmem_cache_alloc +#undef kmem_cache_free + + +/* + * Linux 3.16 replaced smp_mb__{before,after}_{atomic,clear}_{dec,inc,bit}() + * with smp_mb__{before,after}_atomic() because they were redundant. This is + * only used inside our SLAB allocator, so we implement an internal wrapper + * here to give us smp_mb__{before,after}_atomic() on older kernels. + */ +#ifndef smp_mb__before_atomic +#define smp_mb__before_atomic(x) smp_mb__before_clear_bit(x) +#endif + +#ifndef smp_mb__after_atomic +#define smp_mb__after_atomic(x) smp_mb__after_clear_bit(x) +#endif + +/* + * Cache expiration was implemented because it was part of the default Solaris + * kmem_cache behavior. The idea is that per-cpu objects which haven't been + * accessed in several seconds should be returned to the cache. On the other + * hand Linux slabs never move objects back to the slabs unless there is + * memory pressure on the system. By default the Linux method is enabled + * because it has been shown to improve responsiveness on low memory systems. + * This policy may be changed by setting KMC_EXPIRE_AGE or KMC_EXPIRE_MEM. + */ +/* BEGIN CSTYLED */ +unsigned int spl_kmem_cache_expire = KMC_EXPIRE_MEM; +EXPORT_SYMBOL(spl_kmem_cache_expire); +module_param(spl_kmem_cache_expire, uint, 0644); +MODULE_PARM_DESC(spl_kmem_cache_expire, "By age (0x1) or low memory (0x2)"); + +/* + * Cache magazines are an optimization designed to minimize the cost of + * allocating memory. They do this by keeping a per-cpu cache of recently + * freed objects, which can then be reallocated without taking a lock. This + * can improve performance on highly contended caches. However, because + * objects in magazines will prevent otherwise empty slabs from being + * immediately released this may not be ideal for low memory machines. + * + * For this reason spl_kmem_cache_magazine_size can be used to set a maximum + * magazine size. When this value is set to 0 the magazine size will be + * automatically determined based on the object size. Otherwise magazines + * will be limited to 2-256 objects per magazine (i.e per cpu). 
Magazines + * may never be entirely disabled in this implementation. + */ +unsigned int spl_kmem_cache_magazine_size = 0; +module_param(spl_kmem_cache_magazine_size, uint, 0444); +MODULE_PARM_DESC(spl_kmem_cache_magazine_size, + "Default magazine size (2-256), set automatically (0)"); + +/* + * The default behavior is to report the number of objects remaining in the + * cache. This allows the Linux VM to repeatedly reclaim objects from the + * cache when memory is low satisfy other memory allocations. Alternately, + * setting this value to KMC_RECLAIM_ONCE limits how aggressively the cache + * is reclaimed. This may increase the likelihood of out of memory events. + */ +unsigned int spl_kmem_cache_reclaim = 0 /* KMC_RECLAIM_ONCE */; +module_param(spl_kmem_cache_reclaim, uint, 0644); +MODULE_PARM_DESC(spl_kmem_cache_reclaim, "Single reclaim pass (0x1)"); + +unsigned int spl_kmem_cache_obj_per_slab = SPL_KMEM_CACHE_OBJ_PER_SLAB; +module_param(spl_kmem_cache_obj_per_slab, uint, 0644); +MODULE_PARM_DESC(spl_kmem_cache_obj_per_slab, "Number of objects per slab"); + +unsigned int spl_kmem_cache_obj_per_slab_min = SPL_KMEM_CACHE_OBJ_PER_SLAB_MIN; +module_param(spl_kmem_cache_obj_per_slab_min, uint, 0644); +MODULE_PARM_DESC(spl_kmem_cache_obj_per_slab_min, + "Minimal number of objects per slab"); + +unsigned int spl_kmem_cache_max_size = SPL_KMEM_CACHE_MAX_SIZE; +module_param(spl_kmem_cache_max_size, uint, 0644); +MODULE_PARM_DESC(spl_kmem_cache_max_size, "Maximum size of slab in MB"); + +/* + * For small objects the Linux slab allocator should be used to make the most + * efficient use of the memory. However, large objects are not supported by + * the Linux slab and therefore the SPL implementation is preferred. A cutoff + * of 16K was determined to be optimal for architectures using 4K pages. + */ +#if PAGE_SIZE == 4096 +unsigned int spl_kmem_cache_slab_limit = 16384; +#else +unsigned int spl_kmem_cache_slab_limit = 0; +#endif +module_param(spl_kmem_cache_slab_limit, uint, 0644); +MODULE_PARM_DESC(spl_kmem_cache_slab_limit, + "Objects less than N bytes use the Linux slab"); + +/* + * This value defaults to a threshold designed to avoid allocations which + * have been deemed costly by the kernel. + */ +unsigned int spl_kmem_cache_kmem_limit = + ((1 << (PAGE_ALLOC_COSTLY_ORDER - 1)) * PAGE_SIZE) / + SPL_KMEM_CACHE_OBJ_PER_SLAB; +module_param(spl_kmem_cache_kmem_limit, uint, 0644); +MODULE_PARM_DESC(spl_kmem_cache_kmem_limit, + "Objects less than N bytes use the kmalloc"); + +/* + * The number of threads available to allocate new slabs for caches. This + * should not need to be tuned but it is available for performance analysis. + */ +unsigned int spl_kmem_cache_kmem_threads = 4; +module_param(spl_kmem_cache_kmem_threads, uint, 0444); +MODULE_PARM_DESC(spl_kmem_cache_kmem_threads, + "Number of spl_kmem_cache threads"); +/* END CSTYLED */ + +/* + * Slab allocation interfaces + * + * While the Linux slab implementation was inspired by the Solaris + * implementation I cannot use it to emulate the Solaris APIs. I + * require two features which are not provided by the Linux slab. + * + * 1) Constructors AND destructors. Recent versions of the Linux + * kernel have removed support for destructors. This is a deal + * breaker for the SPL which contains particularly expensive + * initializers for mutex's, condition variables, etc. We also + * require a minimal level of cleanup for these data types unlike + * many Linux data types which do need to be explicitly destroyed. 
+ * + * 2) Virtual address space backed slab. Callers of the Solaris slab + * expect it to work well for both small are very large allocations. + * Because of memory fragmentation the Linux slab which is backed + * by kmalloc'ed memory performs very badly when confronted with + * large numbers of large allocations. Basing the slab on the + * virtual address space removes the need for contiguous pages + * and greatly improve performance for large allocations. + * + * For these reasons, the SPL has its own slab implementation with + * the needed features. It is not as highly optimized as either the + * Solaris or Linux slabs, but it should get me most of what is + * needed until it can be optimized or obsoleted by another approach. + * + * One serious concern I do have about this method is the relatively + * small virtual address space on 32bit arches. This will seriously + * constrain the size of the slab caches and their performance. + */ + +struct list_head spl_kmem_cache_list; /* List of caches */ +struct rw_semaphore spl_kmem_cache_sem; /* Cache list lock */ +taskq_t *spl_kmem_cache_taskq; /* Task queue for ageing / reclaim */ + +static void spl_cache_shrink(spl_kmem_cache_t *skc, void *obj); + +SPL_SHRINKER_CALLBACK_FWD_DECLARE(spl_kmem_cache_generic_shrinker); +SPL_SHRINKER_DECLARE(spl_kmem_cache_shrinker, + spl_kmem_cache_generic_shrinker, KMC_DEFAULT_SEEKS); + +static void * +kv_alloc(spl_kmem_cache_t *skc, int size, int flags) +{ + gfp_t lflags = kmem_flags_convert(flags); + void *ptr; + + if (skc->skc_flags & KMC_KMEM) { + ASSERT(ISP2(size)); + ptr = (void *)__get_free_pages(lflags, get_order(size)); + } else { + ptr = __vmalloc(size, lflags | __GFP_HIGHMEM, PAGE_KERNEL); + } + + /* Resulting allocated memory will be page aligned */ + ASSERT(IS_P2ALIGNED(ptr, PAGE_SIZE)); + + return (ptr); +} + +static void +kv_free(spl_kmem_cache_t *skc, void *ptr, int size) +{ + ASSERT(IS_P2ALIGNED(ptr, PAGE_SIZE)); + + /* + * The Linux direct reclaim path uses this out of band value to + * determine if forward progress is being made. Normally this is + * incremented by kmem_freepages() which is part of the various + * Linux slab implementations. However, since we are using none + * of that infrastructure we are responsible for incrementing it. + */ + if (current->reclaim_state) + current->reclaim_state->reclaimed_slab += size >> PAGE_SHIFT; + + if (skc->skc_flags & KMC_KMEM) { + ASSERT(ISP2(size)); + free_pages((unsigned long)ptr, get_order(size)); + } else { + vfree(ptr); + } +} + +/* + * Required space for each aligned sks. + */ +static inline uint32_t +spl_sks_size(spl_kmem_cache_t *skc) +{ + return (P2ROUNDUP_TYPED(sizeof (spl_kmem_slab_t), + skc->skc_obj_align, uint32_t)); +} + +/* + * Required space for each aligned object. + */ +static inline uint32_t +spl_obj_size(spl_kmem_cache_t *skc) +{ + uint32_t align = skc->skc_obj_align; + + return (P2ROUNDUP_TYPED(skc->skc_obj_size, align, uint32_t) + + P2ROUNDUP_TYPED(sizeof (spl_kmem_obj_t), align, uint32_t)); +} + +/* + * Lookup the spl_kmem_object_t for an object given that object. + */ +static inline spl_kmem_obj_t * +spl_sko_from_obj(spl_kmem_cache_t *skc, void *obj) +{ + return (obj + P2ROUNDUP_TYPED(skc->skc_obj_size, + skc->skc_obj_align, uint32_t)); +} + +/* + * Required space for each offslab object taking in to account alignment + * restrictions and the power-of-two requirement of kv_alloc(). 
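+ * The result is the power of two at least twice the padded object size,
+ * e.g. a 24 KiB padded object receives a 64 KiB off-slab allocation.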
+ */ +static inline uint32_t +spl_offslab_size(spl_kmem_cache_t *skc) +{ + return (1UL << (fls64(spl_obj_size(skc)) + 1)); +} + +/* + * It's important that we pack the spl_kmem_obj_t structure and the + * actual objects in to one large address space to minimize the number + * of calls to the allocator. It is far better to do a few large + * allocations and then subdivide it ourselves. Now which allocator + * we use requires balancing a few trade offs. + * + * For small objects we use kmem_alloc() because as long as you are + * only requesting a small number of pages (ideally just one) its cheap. + * However, when you start requesting multiple pages with kmem_alloc() + * it gets increasingly expensive since it requires contiguous pages. + * For this reason we shift to vmem_alloc() for slabs of large objects + * which removes the need for contiguous pages. We do not use + * vmem_alloc() in all cases because there is significant locking + * overhead in __get_vm_area_node(). This function takes a single + * global lock when acquiring an available virtual address range which + * serializes all vmem_alloc()'s for all slab caches. Using slightly + * different allocation functions for small and large objects should + * give us the best of both worlds. + * + * KMC_ONSLAB KMC_OFFSLAB + * + * +------------------------+ +-----------------+ + * | spl_kmem_slab_t --+-+ | | spl_kmem_slab_t |---+-+ + * | skc_obj_size <-+ | | +-----------------+ | | + * | spl_kmem_obj_t | | | | + * | skc_obj_size <---+ | +-----------------+ | | + * | spl_kmem_obj_t | | | skc_obj_size | <-+ | + * | ... v | | spl_kmem_obj_t | | + * +------------------------+ +-----------------+ v + */ +static spl_kmem_slab_t * +spl_slab_alloc(spl_kmem_cache_t *skc, int flags) +{ + spl_kmem_slab_t *sks; + spl_kmem_obj_t *sko, *n; + void *base, *obj; + uint32_t obj_size, offslab_size = 0; + int i, rc = 0; + + base = kv_alloc(skc, skc->skc_slab_size, flags); + if (base == NULL) + return (NULL); + + sks = (spl_kmem_slab_t *)base; + sks->sks_magic = SKS_MAGIC; + sks->sks_objs = skc->skc_slab_objs; + sks->sks_age = jiffies; + sks->sks_cache = skc; + INIT_LIST_HEAD(&sks->sks_list); + INIT_LIST_HEAD(&sks->sks_free_list); + sks->sks_ref = 0; + obj_size = spl_obj_size(skc); + + if (skc->skc_flags & KMC_OFFSLAB) + offslab_size = spl_offslab_size(skc); + + for (i = 0; i < sks->sks_objs; i++) { + if (skc->skc_flags & KMC_OFFSLAB) { + obj = kv_alloc(skc, offslab_size, flags); + if (!obj) { + rc = -ENOMEM; + goto out; + } + } else { + obj = base + spl_sks_size(skc) + (i * obj_size); + } + + ASSERT(IS_P2ALIGNED(obj, skc->skc_obj_align)); + sko = spl_sko_from_obj(skc, obj); + sko->sko_addr = obj; + sko->sko_magic = SKO_MAGIC; + sko->sko_slab = sks; + INIT_LIST_HEAD(&sko->sko_list); + list_add_tail(&sko->sko_list, &sks->sks_free_list); + } + +out: + if (rc) { + if (skc->skc_flags & KMC_OFFSLAB) + list_for_each_entry_safe(sko, + n, &sks->sks_free_list, sko_list) { + kv_free(skc, sko->sko_addr, offslab_size); + } + + kv_free(skc, base, skc->skc_slab_size); + sks = NULL; + } + + return (sks); +} + +/* + * Remove a slab from complete or partial list, it must be called with + * the 'skc->skc_lock' held but the actual free must be performed + * outside the lock to prevent deadlocking on vmem addresses. 
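+ * The slab and its objects are only moved to the caller supplied private
+ * 'sks_list' and 'sko_list' lists here; the caller performs the actual
+ * kv_free() once the lock has been dropped.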
+ */ +static void +spl_slab_free(spl_kmem_slab_t *sks, + struct list_head *sks_list, struct list_head *sko_list) +{ + spl_kmem_cache_t *skc; + + ASSERT(sks->sks_magic == SKS_MAGIC); + ASSERT(sks->sks_ref == 0); + + skc = sks->sks_cache; + ASSERT(skc->skc_magic == SKC_MAGIC); + + /* + * Update slab/objects counters in the cache, then remove the + * slab from the skc->skc_partial_list. Finally add the slab + * and all its objects in to the private work lists where the + * destructors will be called and the memory freed to the system. + */ + skc->skc_obj_total -= sks->sks_objs; + skc->skc_slab_total--; + list_del(&sks->sks_list); + list_add(&sks->sks_list, sks_list); + list_splice_init(&sks->sks_free_list, sko_list); +} + +/* + * Reclaim empty slabs at the end of the partial list. + */ +static void +spl_slab_reclaim(spl_kmem_cache_t *skc) +{ + spl_kmem_slab_t *sks, *m; + spl_kmem_obj_t *sko, *n; + LIST_HEAD(sks_list); + LIST_HEAD(sko_list); + uint32_t size = 0; + + /* + * Empty slabs and objects must be moved to a private list so they + * can be safely freed outside the spin lock. All empty slabs are + * at the end of skc->skc_partial_list, therefore once a non-empty + * slab is found we can stop scanning. + */ + spin_lock(&skc->skc_lock); + list_for_each_entry_safe_reverse(sks, m, + &skc->skc_partial_list, sks_list) { + + if (sks->sks_ref > 0) + break; + + spl_slab_free(sks, &sks_list, &sko_list); + } + spin_unlock(&skc->skc_lock); + + /* + * The following two loops ensure all the object destructors are + * run, any offslab objects are freed, and the slabs themselves + * are freed. This is all done outside the skc->skc_lock since + * this allows the destructor to sleep, and allows us to perform + * a conditional reschedule when a freeing a large number of + * objects and slabs back to the system. + */ + if (skc->skc_flags & KMC_OFFSLAB) + size = spl_offslab_size(skc); + + list_for_each_entry_safe(sko, n, &sko_list, sko_list) { + ASSERT(sko->sko_magic == SKO_MAGIC); + + if (skc->skc_flags & KMC_OFFSLAB) + kv_free(skc, sko->sko_addr, size); + } + + list_for_each_entry_safe(sks, m, &sks_list, sks_list) { + ASSERT(sks->sks_magic == SKS_MAGIC); + kv_free(skc, sks, skc->skc_slab_size); + } +} + +static spl_kmem_emergency_t * +spl_emergency_search(struct rb_root *root, void *obj) +{ + struct rb_node *node = root->rb_node; + spl_kmem_emergency_t *ske; + unsigned long address = (unsigned long)obj; + + while (node) { + ske = container_of(node, spl_kmem_emergency_t, ske_node); + + if (address < ske->ske_obj) + node = node->rb_left; + else if (address > ske->ske_obj) + node = node->rb_right; + else + return (ske); + } + + return (NULL); +} + +static int +spl_emergency_insert(struct rb_root *root, spl_kmem_emergency_t *ske) +{ + struct rb_node **new = &(root->rb_node), *parent = NULL; + spl_kmem_emergency_t *ske_tmp; + unsigned long address = ske->ske_obj; + + while (*new) { + ske_tmp = container_of(*new, spl_kmem_emergency_t, ske_node); + + parent = *new; + if (address < ske_tmp->ske_obj) + new = &((*new)->rb_left); + else if (address > ske_tmp->ske_obj) + new = &((*new)->rb_right); + else + return (0); + } + + rb_link_node(&ske->ske_node, parent, new); + rb_insert_color(&ske->ske_node, root); + + return (1); +} + +/* + * Allocate a single emergency object and track it in a red black tree. 
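+ * Emergency objects are obtained directly from __get_free_pages() as a
+ * last resort when an object cannot be satisfied from a slab. Keying
+ * the red black tree by address allows spl_emergency_free() to later
+ * recognize and release them.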
+ */ +static int +spl_emergency_alloc(spl_kmem_cache_t *skc, int flags, void **obj) +{ + gfp_t lflags = kmem_flags_convert(flags); + spl_kmem_emergency_t *ske; + int order = get_order(skc->skc_obj_size); + int empty; + + /* Last chance use a partial slab if one now exists */ + spin_lock(&skc->skc_lock); + empty = list_empty(&skc->skc_partial_list); + spin_unlock(&skc->skc_lock); + if (!empty) + return (-EEXIST); + + ske = kmalloc(sizeof (*ske), lflags); + if (ske == NULL) + return (-ENOMEM); + + ske->ske_obj = __get_free_pages(lflags, order); + if (ske->ske_obj == 0) { + kfree(ske); + return (-ENOMEM); + } + + spin_lock(&skc->skc_lock); + empty = spl_emergency_insert(&skc->skc_emergency_tree, ske); + if (likely(empty)) { + skc->skc_obj_total++; + skc->skc_obj_emergency++; + if (skc->skc_obj_emergency > skc->skc_obj_emergency_max) + skc->skc_obj_emergency_max = skc->skc_obj_emergency; + } + spin_unlock(&skc->skc_lock); + + if (unlikely(!empty)) { + free_pages(ske->ske_obj, order); + kfree(ske); + return (-EINVAL); + } + + *obj = (void *)ske->ske_obj; + + return (0); +} + +/* + * Locate the passed object in the red black tree and free it. + */ +static int +spl_emergency_free(spl_kmem_cache_t *skc, void *obj) +{ + spl_kmem_emergency_t *ske; + int order = get_order(skc->skc_obj_size); + + spin_lock(&skc->skc_lock); + ske = spl_emergency_search(&skc->skc_emergency_tree, obj); + if (ske) { + rb_erase(&ske->ske_node, &skc->skc_emergency_tree); + skc->skc_obj_emergency--; + skc->skc_obj_total--; + } + spin_unlock(&skc->skc_lock); + + if (ske == NULL) + return (-ENOENT); + + free_pages(ske->ske_obj, order); + kfree(ske); + + return (0); +} + +/* + * Release objects from the per-cpu magazine back to their slab. The flush + * argument contains the max number of entries to remove from the magazine. + */ +static void +__spl_cache_flush(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flush) +{ + int i, count = MIN(flush, skm->skm_avail); + + ASSERT(skc->skc_magic == SKC_MAGIC); + ASSERT(skm->skm_magic == SKM_MAGIC); + + for (i = 0; i < count; i++) + spl_cache_shrink(skc, skm->skm_objs[i]); + + skm->skm_avail -= count; + memmove(skm->skm_objs, &(skm->skm_objs[count]), + sizeof (void *) * skm->skm_avail); +} + +static void +spl_cache_flush(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flush) +{ + spin_lock(&skc->skc_lock); + __spl_cache_flush(skc, skm, flush); + spin_unlock(&skc->skc_lock); +} + +static void +spl_magazine_age(void *data) +{ + spl_kmem_cache_t *skc = (spl_kmem_cache_t *)data; + spl_kmem_magazine_t *skm = skc->skc_mag[smp_processor_id()]; + + ASSERT(skm->skm_magic == SKM_MAGIC); + ASSERT(skm->skm_cpu == smp_processor_id()); + ASSERT(irqs_disabled()); + + /* There are no available objects or they are too young to age out */ + if ((skm->skm_avail == 0) || + time_before(jiffies, skm->skm_age + skc->skc_delay * HZ)) + return; + + /* + * Because we're executing in interrupt context we may have + * interrupted the holder of this lock. To avoid a potential + * deadlock return if the lock is contended. + */ + if (!spin_trylock(&skc->skc_lock)) + return; + + __spl_cache_flush(skc, skm, skm->skm_refill); + spin_unlock(&skc->skc_lock); +} + +/* + * Called regularly to keep a downward pressure on the cache. + * + * Objects older than skc->skc_delay seconds in the per-cpu magazines will + * be returned to the caches. This is done to prevent idle magazines from + * holding memory which could be better used elsewhere. The delay is + * present to prevent thrashing the magazine. 
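+ * The work item rearms itself on the cache taskq roughly every
+ * skc_delay / 3 seconds until the cache is destroyed.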
+ * + * The newly released objects may result in empty partial slabs. Those + * slabs should be released to the system. Otherwise moving the objects + * out of the magazines is just wasted work. + */ +static void +spl_cache_age(void *data) +{ + spl_kmem_cache_t *skc = (spl_kmem_cache_t *)data; + taskqid_t id = 0; + + ASSERT(skc->skc_magic == SKC_MAGIC); + + /* Dynamically disabled at run time */ + if (!(spl_kmem_cache_expire & KMC_EXPIRE_AGE)) + return; + + atomic_inc(&skc->skc_ref); + + if (!(skc->skc_flags & KMC_NOMAGAZINE)) + on_each_cpu(spl_magazine_age, skc, 1); + + spl_slab_reclaim(skc); + + while (!test_bit(KMC_BIT_DESTROY, &skc->skc_flags) && !id) { + id = taskq_dispatch_delay( + spl_kmem_cache_taskq, spl_cache_age, skc, TQ_SLEEP, + ddi_get_lbolt() + skc->skc_delay / 3 * HZ); + + /* Destroy issued after dispatch immediately cancel it */ + if (test_bit(KMC_BIT_DESTROY, &skc->skc_flags) && id) + taskq_cancel_id(spl_kmem_cache_taskq, id); + } + + spin_lock(&skc->skc_lock); + skc->skc_taskqid = id; + spin_unlock(&skc->skc_lock); + + atomic_dec(&skc->skc_ref); +} + +/* + * Size a slab based on the size of each aligned object plus spl_kmem_obj_t. + * When on-slab we want to target spl_kmem_cache_obj_per_slab. However, + * for very small objects we may end up with more than this so as not + * to waste space in the minimal allocation of a single page. Also for + * very large objects we may use as few as spl_kmem_cache_obj_per_slab_min, + * lower than this and we will fail. + */ +static int +spl_slab_size(spl_kmem_cache_t *skc, uint32_t *objs, uint32_t *size) +{ + uint32_t sks_size, obj_size, max_size, tgt_size, tgt_objs; + + if (skc->skc_flags & KMC_OFFSLAB) { + tgt_objs = spl_kmem_cache_obj_per_slab; + tgt_size = P2ROUNDUP(sizeof (spl_kmem_slab_t), PAGE_SIZE); + + if ((skc->skc_flags & KMC_KMEM) && + (spl_obj_size(skc) > (SPL_MAX_ORDER_NR_PAGES * PAGE_SIZE))) + return (-ENOSPC); + } else { + sks_size = spl_sks_size(skc); + obj_size = spl_obj_size(skc); + max_size = (spl_kmem_cache_max_size * 1024 * 1024); + tgt_size = (spl_kmem_cache_obj_per_slab * obj_size + sks_size); + + /* + * KMC_KMEM slabs are allocated by __get_free_pages() which + * rounds up to the nearest order. Knowing this the size + * should be rounded up to the next power of two with a hard + * maximum defined by the maximum allowed allocation order. + */ + if (skc->skc_flags & KMC_KMEM) { + max_size = SPL_MAX_ORDER_NR_PAGES * PAGE_SIZE; + tgt_size = MIN(max_size, + PAGE_SIZE * (1 << MAX(get_order(tgt_size) - 1, 1))); + } + + if (tgt_size <= max_size) { + tgt_objs = (tgt_size - sks_size) / obj_size; + } else { + tgt_objs = (max_size - sks_size) / obj_size; + tgt_size = (tgt_objs * obj_size) + sks_size; + } + } + + if (tgt_objs == 0) + return (-ENOSPC); + + *objs = tgt_objs; + *size = tgt_size; + + return (0); +} + +/* + * Make a guess at reasonable per-cpu magazine size based on the size of + * each object and the cost of caching N of them in each magazine. Long + * term this should really adapt based on an observed usage heuristic. 
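+ * With 4 KiB pages this works out to 256 entries for the smallest
+ * objects and only 4 entries for objects larger than 1 MiB.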
+ */
+static int
+spl_magazine_size(spl_kmem_cache_t *skc)
+{
+	uint32_t obj_size = spl_obj_size(skc);
+	int size;
+
+	if (spl_kmem_cache_magazine_size > 0)
+		return (MAX(MIN(spl_kmem_cache_magazine_size, 256), 2));
+
+	/* Per-magazine sizes below assume a 4KiB page size */
+	if (obj_size > (PAGE_SIZE * 256))
+		size = 4;	/* Minimum 4MiB per-magazine */
+	else if (obj_size > (PAGE_SIZE * 32))
+		size = 16;	/* Minimum 2MiB per-magazine */
+	else if (obj_size > (PAGE_SIZE))
+		size = 64;	/* Minimum 256KiB per-magazine */
+	else if (obj_size > (PAGE_SIZE / 4))
+		size = 128;	/* Minimum 128KiB per-magazine */
+	else
+		size = 256;
+
+	return (size);
+}
+
+/*
+ * Allocate a per-cpu magazine to associate with a specific core.
+ */
+static spl_kmem_magazine_t *
+spl_magazine_alloc(spl_kmem_cache_t *skc, int cpu)
+{
+	spl_kmem_magazine_t *skm;
+	int size = sizeof (spl_kmem_magazine_t) +
+	    sizeof (void *) * skc->skc_mag_size;
+
+	skm = kmalloc_node(size, GFP_KERNEL, cpu_to_node(cpu));
+	if (skm) {
+		skm->skm_magic = SKM_MAGIC;
+		skm->skm_avail = 0;
+		skm->skm_size = skc->skc_mag_size;
+		skm->skm_refill = skc->skc_mag_refill;
+		skm->skm_cache = skc;
+		skm->skm_age = jiffies;
+		skm->skm_cpu = cpu;
+	}
+
+	return (skm);
+}
+
+/*
+ * Free a per-cpu magazine associated with a specific core.
+ */
+static void
+spl_magazine_free(spl_kmem_magazine_t *skm)
+{
+	ASSERT(skm->skm_magic == SKM_MAGIC);
+	ASSERT(skm->skm_avail == 0);
+	kfree(skm);
+}
+
+/*
+ * Create all per-cpu magazines of reasonable sizes.
+ */
+static int
+spl_magazine_create(spl_kmem_cache_t *skc)
+{
+	int i;
+
+	if (skc->skc_flags & KMC_NOMAGAZINE)
+		return (0);
+
+	skc->skc_mag = kzalloc(sizeof (spl_kmem_magazine_t *) *
+	    num_possible_cpus(), kmem_flags_convert(KM_SLEEP));
+	skc->skc_mag_size = spl_magazine_size(skc);
+	skc->skc_mag_refill = (skc->skc_mag_size + 1) / 2;
+
+	for_each_possible_cpu(i) {
+		skc->skc_mag[i] = spl_magazine_alloc(skc, i);
+		if (!skc->skc_mag[i]) {
+			for (i--; i >= 0; i--)
+				spl_magazine_free(skc->skc_mag[i]);
+
+			kfree(skc->skc_mag);
+			return (-ENOMEM);
+		}
+	}
+
+	return (0);
+}
+
+/*
+ * Destroy all per-cpu magazines.
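A rough feel for what the magazine sizing above costs per cache, as a user-space sketch assuming 4KiB pages and example values (the real entry count comes from spl_magazine_size() and the CPU count from num_possible_cpus()):

#include <stdio.h>
#include <stddef.h>

/* Mirror of the sizing ladder above, assuming 4KiB pages. */
static int
magazine_entries(size_t obj_size)
{
	const size_t page = 4096;

	if (obj_size > page * 256)
		return (4);
	else if (obj_size > page * 32)
		return (16);
	else if (obj_size > page)
		return (64);
	else if (obj_size > page / 4)
		return (128);
	return (256);
}

int
main(void)
{
	size_t obj_size = 16 * 1024;	/* example: 16KiB objects */
	int entries = magazine_entries(obj_size);
	int ncpus = 8;			/* example CPU count */
	size_t per_mag = sizeof (void *) * entries;	/* plus a small header */

	printf("%d entries/magazine, refill %d, ~%zu bytes of pointer "
	    "storage across %d CPUs\n", entries, (entries + 1) / 2,
	    per_mag * ncpus, ncpus);
	return (0);
}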
+ */ +static void +spl_magazine_destroy(spl_kmem_cache_t *skc) +{ + spl_kmem_magazine_t *skm; + int i; + + if (skc->skc_flags & KMC_NOMAGAZINE) + return; + + for_each_possible_cpu(i) { + skm = skc->skc_mag[i]; + spl_cache_flush(skc, skm, skm->skm_avail); + spl_magazine_free(skm); + } + + kfree(skc->skc_mag); +} + +/* + * Create a object cache based on the following arguments: + * name cache name + * size cache object size + * align cache object alignment + * ctor cache object constructor + * dtor cache object destructor + * reclaim cache object reclaim + * priv cache private data for ctor/dtor/reclaim + * vmp unused must be NULL + * flags + * KMC_NOTOUCH Disable cache object aging (unsupported) + * KMC_NODEBUG Disable debugging (unsupported) + * KMC_NOHASH Disable hashing (unsupported) + * KMC_QCACHE Disable qcache (unsupported) + * KMC_NOMAGAZINE Enabled for kmem/vmem, Disabled for Linux slab + * KMC_KMEM Force kmem backed cache + * KMC_VMEM Force vmem backed cache + * KMC_SLAB Force Linux slab backed cache + * KMC_OFFSLAB Locate objects off the slab + */ +spl_kmem_cache_t * +spl_kmem_cache_create(char *name, size_t size, size_t align, + spl_kmem_ctor_t ctor, spl_kmem_dtor_t dtor, spl_kmem_reclaim_t reclaim, + void *priv, void *vmp, int flags) +{ + gfp_t lflags = kmem_flags_convert(KM_SLEEP); + spl_kmem_cache_t *skc; + int rc; + + /* + * Unsupported flags + */ + ASSERT0(flags & KMC_NOMAGAZINE); + ASSERT0(flags & KMC_NOHASH); + ASSERT0(flags & KMC_QCACHE); + ASSERT(vmp == NULL); + + might_sleep(); + + skc = kzalloc(sizeof (*skc), lflags); + if (skc == NULL) + return (NULL); + + skc->skc_magic = SKC_MAGIC; + skc->skc_name_size = strlen(name) + 1; + skc->skc_name = (char *)kmalloc(skc->skc_name_size, lflags); + if (skc->skc_name == NULL) { + kfree(skc); + return (NULL); + } + strncpy(skc->skc_name, name, skc->skc_name_size); + + skc->skc_ctor = ctor; + skc->skc_dtor = dtor; + skc->skc_reclaim = reclaim; + skc->skc_private = priv; + skc->skc_vmp = vmp; + skc->skc_linux_cache = NULL; + skc->skc_flags = flags; + skc->skc_obj_size = size; + skc->skc_obj_align = SPL_KMEM_CACHE_ALIGN; + skc->skc_delay = SPL_KMEM_CACHE_DELAY; + skc->skc_reap = SPL_KMEM_CACHE_REAP; + atomic_set(&skc->skc_ref, 0); + + INIT_LIST_HEAD(&skc->skc_list); + INIT_LIST_HEAD(&skc->skc_complete_list); + INIT_LIST_HEAD(&skc->skc_partial_list); + skc->skc_emergency_tree = RB_ROOT; + spin_lock_init(&skc->skc_lock); + init_waitqueue_head(&skc->skc_waitq); + skc->skc_slab_fail = 0; + skc->skc_slab_create = 0; + skc->skc_slab_destroy = 0; + skc->skc_slab_total = 0; + skc->skc_slab_alloc = 0; + skc->skc_slab_max = 0; + skc->skc_obj_total = 0; + skc->skc_obj_alloc = 0; + skc->skc_obj_max = 0; + skc->skc_obj_deadlock = 0; + skc->skc_obj_emergency = 0; + skc->skc_obj_emergency_max = 0; + + /* + * Verify the requested alignment restriction is sane. + */ + if (align) { + VERIFY(ISP2(align)); + VERIFY3U(align, >=, SPL_KMEM_CACHE_ALIGN); + VERIFY3U(align, <=, PAGE_SIZE); + skc->skc_obj_align = align; + } + + /* + * When no specific type of slab is requested (kmem, vmem, or + * linuxslab) then select a cache type based on the object size + * and default tunables. + */ + if (!(skc->skc_flags & (KMC_KMEM | KMC_VMEM | KMC_SLAB))) { + + /* + * Objects smaller than spl_kmem_cache_slab_limit can + * use the Linux slab for better space-efficiency. By + * default this functionality is disabled until its + * performance characteristics are fully understood. 
+ */ + if (spl_kmem_cache_slab_limit && + size <= (size_t)spl_kmem_cache_slab_limit) + skc->skc_flags |= KMC_SLAB; + + /* + * Small objects, less than spl_kmem_cache_kmem_limit per + * object should use kmem because their slabs are small. + */ + else if (spl_obj_size(skc) <= spl_kmem_cache_kmem_limit) + skc->skc_flags |= KMC_KMEM; + + /* + * All other objects are considered large and are placed + * on vmem backed slabs. + */ + else + skc->skc_flags |= KMC_VMEM; + } + + /* + * Given the type of slab allocate the required resources. + */ + if (skc->skc_flags & (KMC_KMEM | KMC_VMEM)) { + rc = spl_slab_size(skc, + &skc->skc_slab_objs, &skc->skc_slab_size); + if (rc) + goto out; + + rc = spl_magazine_create(skc); + if (rc) + goto out; + } else { + unsigned long slabflags = 0; + + if (size > (SPL_MAX_KMEM_ORDER_NR_PAGES * PAGE_SIZE)) { + rc = EINVAL; + goto out; + } + +#if defined(SLAB_USERCOPY) + /* + * Required for PAX-enabled kernels if the slab is to be + * used for coping between user and kernel space. + */ + slabflags |= SLAB_USERCOPY; +#endif + +#if defined(HAVE_KMEM_CACHE_CREATE_USERCOPY) + /* + * Newer grsec patchset uses kmem_cache_create_usercopy() + * instead of SLAB_USERCOPY flag + */ + skc->skc_linux_cache = kmem_cache_create_usercopy( + skc->skc_name, size, align, slabflags, 0, size, NULL); +#else + skc->skc_linux_cache = kmem_cache_create( + skc->skc_name, size, align, slabflags, NULL); +#endif + if (skc->skc_linux_cache == NULL) { + rc = ENOMEM; + goto out; + } + +#if defined(HAVE_KMEM_CACHE_ALLOCFLAGS) + skc->skc_linux_cache->allocflags |= __GFP_COMP; +#elif defined(HAVE_KMEM_CACHE_GFPFLAGS) + skc->skc_linux_cache->gfpflags |= __GFP_COMP; +#endif + skc->skc_flags |= KMC_NOMAGAZINE; + } + + if (spl_kmem_cache_expire & KMC_EXPIRE_AGE) + skc->skc_taskqid = taskq_dispatch_delay(spl_kmem_cache_taskq, + spl_cache_age, skc, TQ_SLEEP, + ddi_get_lbolt() + skc->skc_delay / 3 * HZ); + + down_write(&spl_kmem_cache_sem); + list_add_tail(&skc->skc_list, &spl_kmem_cache_list); + up_write(&spl_kmem_cache_sem); + + return (skc); +out: + kfree(skc->skc_name); + kfree(skc); + return (NULL); +} +EXPORT_SYMBOL(spl_kmem_cache_create); + +/* + * Register a move callback for cache defragmentation. + * XXX: Unimplemented but harmless to stub out for now. + */ +void +spl_kmem_cache_set_move(spl_kmem_cache_t *skc, + kmem_cbrc_t (move)(void *, void *, size_t, void *)) +{ + ASSERT(move != NULL); +} +EXPORT_SYMBOL(spl_kmem_cache_set_move); + +/* + * Destroy a cache and all objects associated with the cache. + */ +void +spl_kmem_cache_destroy(spl_kmem_cache_t *skc) +{ + DECLARE_WAIT_QUEUE_HEAD(wq); + taskqid_t id; + + ASSERT(skc->skc_magic == SKC_MAGIC); + ASSERT(skc->skc_flags & (KMC_KMEM | KMC_VMEM | KMC_SLAB)); + + down_write(&spl_kmem_cache_sem); + list_del_init(&skc->skc_list); + up_write(&spl_kmem_cache_sem); + + /* Cancel any and wait for any pending delayed tasks */ + VERIFY(!test_and_set_bit(KMC_BIT_DESTROY, &skc->skc_flags)); + + spin_lock(&skc->skc_lock); + id = skc->skc_taskqid; + spin_unlock(&skc->skc_lock); + + taskq_cancel_id(spl_kmem_cache_taskq, id); + + /* + * Wait until all current callers complete, this is mainly + * to catch the case where a low memory situation triggers a + * cache reaping action which races with this destroy. 
+ */ + wait_event(wq, atomic_read(&skc->skc_ref) == 0); + + if (skc->skc_flags & (KMC_KMEM | KMC_VMEM)) { + spl_magazine_destroy(skc); + spl_slab_reclaim(skc); + } else { + ASSERT(skc->skc_flags & KMC_SLAB); + kmem_cache_destroy(skc->skc_linux_cache); + } + + spin_lock(&skc->skc_lock); + + /* + * Validate there are no objects in use and free all the + * spl_kmem_slab_t, spl_kmem_obj_t, and object buffers. + */ + ASSERT3U(skc->skc_slab_alloc, ==, 0); + ASSERT3U(skc->skc_obj_alloc, ==, 0); + ASSERT3U(skc->skc_slab_total, ==, 0); + ASSERT3U(skc->skc_obj_total, ==, 0); + ASSERT3U(skc->skc_obj_emergency, ==, 0); + ASSERT(list_empty(&skc->skc_complete_list)); + + spin_unlock(&skc->skc_lock); + + kfree(skc->skc_name); + kfree(skc); +} +EXPORT_SYMBOL(spl_kmem_cache_destroy); + +/* + * Allocate an object from a slab attached to the cache. This is used to + * repopulate the per-cpu magazine caches in batches when they run low. + */ +static void * +spl_cache_obj(spl_kmem_cache_t *skc, spl_kmem_slab_t *sks) +{ + spl_kmem_obj_t *sko; + + ASSERT(skc->skc_magic == SKC_MAGIC); + ASSERT(sks->sks_magic == SKS_MAGIC); + + sko = list_entry(sks->sks_free_list.next, spl_kmem_obj_t, sko_list); + ASSERT(sko->sko_magic == SKO_MAGIC); + ASSERT(sko->sko_addr != NULL); + + /* Remove from sks_free_list */ + list_del_init(&sko->sko_list); + + sks->sks_age = jiffies; + sks->sks_ref++; + skc->skc_obj_alloc++; + + /* Track max obj usage statistics */ + if (skc->skc_obj_alloc > skc->skc_obj_max) + skc->skc_obj_max = skc->skc_obj_alloc; + + /* Track max slab usage statistics */ + if (sks->sks_ref == 1) { + skc->skc_slab_alloc++; + + if (skc->skc_slab_alloc > skc->skc_slab_max) + skc->skc_slab_max = skc->skc_slab_alloc; + } + + return (sko->sko_addr); +} + +/* + * Generic slab allocation function to run by the global work queues. + * It is responsible for allocating a new slab, linking it in to the list + * of partial slabs, and then waking any waiters. + */ +static int +__spl_cache_grow(spl_kmem_cache_t *skc, int flags) +{ + spl_kmem_slab_t *sks; + + fstrans_cookie_t cookie = spl_fstrans_mark(); + sks = spl_slab_alloc(skc, flags); + spl_fstrans_unmark(cookie); + + spin_lock(&skc->skc_lock); + if (sks) { + skc->skc_slab_total++; + skc->skc_obj_total += sks->sks_objs; + list_add_tail(&sks->sks_list, &skc->skc_partial_list); + + smp_mb__before_atomic(); + clear_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags); + smp_mb__after_atomic(); + wake_up_all(&skc->skc_waitq); + } + spin_unlock(&skc->skc_lock); + + return (sks == NULL ? -ENOMEM : 0); +} + +static void +spl_cache_grow_work(void *data) +{ + spl_kmem_alloc_t *ska = (spl_kmem_alloc_t *)data; + spl_kmem_cache_t *skc = ska->ska_cache; + + (void) __spl_cache_grow(skc, ska->ska_flags); + + atomic_dec(&skc->skc_ref); + smp_mb__before_atomic(); + clear_bit(KMC_BIT_GROWING, &skc->skc_flags); + smp_mb__after_atomic(); + + kfree(ska); +} + +/* + * Returns non-zero when a new slab should be available. + */ +static int +spl_cache_grow_wait(spl_kmem_cache_t *skc) +{ + return (!test_bit(KMC_BIT_GROWING, &skc->skc_flags)); +} + +/* + * No available objects on any slabs, create a new slab. Note that this + * functionality is disabled for KMC_SLAB caches which are backed by the + * Linux slab. 
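Stepping back, a typical consumer of this cache interface (spl_kmem_cache_create() above together with the allocation entry points later in this file) follows a simple lifecycle. The object type, names, and constructor below are hypothetical, and the spl_kmem_ctor_t/spl_kmem_dtor_t prototypes are assumed from the SPL headers:

typedef struct my_obj {
	int	mo_busy;
	char	mo_data[120];
} my_obj_t;

/* The ctor/dtor prototypes follow the skc_ctor()/skc_dtor() call sites
 * later in this file; spl_kmem_ctor_t is assumed to return int. */
static int
my_obj_ctor(void *buf, void *priv, int flags)
{
	((my_obj_t *)buf)->mo_busy = 0;
	return (0);
}

static void
my_obj_dtor(void *buf, void *priv)
{
}

static void
my_obj_example(void)
{
	spl_kmem_cache_t *cache;
	my_obj_t *obj;

	/* flags == 0 lets the size-based policy above pick KMC_SLAB,
	 * KMC_KMEM, or KMC_VMEM backing automatically. */
	cache = spl_kmem_cache_create("my_obj_cache", sizeof (my_obj_t), 0,
	    my_obj_ctor, my_obj_dtor, NULL, NULL, NULL, 0);

	obj = spl_kmem_cache_alloc(cache, KM_SLEEP);	/* KM_SLEEP never fails */
	/* ... use obj ... */
	spl_kmem_cache_free(cache, obj);

	spl_kmem_cache_destroy(cache);
}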
+ */
+static int
+spl_cache_grow(spl_kmem_cache_t *skc, int flags, void **obj)
+{
+	int remaining, rc = 0;
+
+	ASSERT0(flags & ~KM_PUBLIC_MASK);
+	ASSERT(skc->skc_magic == SKC_MAGIC);
+	ASSERT((skc->skc_flags & KMC_SLAB) == 0);
+	might_sleep();
+	*obj = NULL;
+
+	/*
+	 * Before allocating a new slab wait for any reaping to complete and
+	 * then return so the local magazine can be rechecked for new objects.
+	 */
+	if (test_bit(KMC_BIT_REAPING, &skc->skc_flags)) {
+		rc = spl_wait_on_bit(&skc->skc_flags, KMC_BIT_REAPING,
+		    TASK_UNINTERRUPTIBLE);
+		return (rc ? rc : -EAGAIN);
+	}
+
+	/*
+	 * To reduce the overhead of context switching and improve NUMA
+	 * locality, we first try to allocate a new slab in the current
+	 * process context with the KM_NOSLEEP flag.  If this fails the
+	 * allocation is dispatched to a taskq.
+	 *
+	 * However, this can't be applied to KMC_VMEM caches due to a bug
+	 * where __vmalloc() doesn't honor gfp flags in page table allocation.
+	 */
+	if (!(skc->skc_flags & KMC_VMEM)) {
+		rc = __spl_cache_grow(skc, flags | KM_NOSLEEP);
+		if (rc == 0)
+			return (0);
+	}
+
+	/*
+	 * This is handled by dispatching a work request to the global work
+	 * queue.  This allows us to asynchronously allocate a new slab while
+	 * retaining the ability to safely fall back to smaller synchronous
+	 * allocations to ensure forward progress is always maintained.
+	 */
+	if (test_and_set_bit(KMC_BIT_GROWING, &skc->skc_flags) == 0) {
+		spl_kmem_alloc_t *ska;
+
+		ska = kmalloc(sizeof (*ska), kmem_flags_convert(flags));
+		if (ska == NULL) {
+			clear_bit_unlock(KMC_BIT_GROWING, &skc->skc_flags);
+			smp_mb__after_atomic();
+			wake_up_all(&skc->skc_waitq);
+			return (-ENOMEM);
+		}
+
+		atomic_inc(&skc->skc_ref);
+		ska->ska_cache = skc;
+		ska->ska_flags = flags;
+		taskq_init_ent(&ska->ska_tqe);
+		taskq_dispatch_ent(spl_kmem_cache_taskq,
+		    spl_cache_grow_work, ska, 0, &ska->ska_tqe);
+	}
+
+	/*
+	 * The goal here is to only detect the rare case where a virtual slab
+	 * allocation has deadlocked.  We must be careful to minimize the use
+	 * of emergency objects which are more expensive to track.  Therefore,
+	 * we set a very long timeout for the asynchronous allocation and if
+	 * the timeout is reached the cache is flagged as deadlocked.  From
+	 * this point only new emergency objects will be allocated until the
+	 * asynchronous allocation completes and clears the deadlocked flag.
+	 */
+	if (test_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags)) {
+		rc = spl_emergency_alloc(skc, flags, obj);
+	} else {
+		remaining = wait_event_timeout(skc->skc_waitq,
+		    spl_cache_grow_wait(skc), HZ / 10);
+
+		if (!remaining) {
+			spin_lock(&skc->skc_lock);
+			if (test_bit(KMC_BIT_GROWING, &skc->skc_flags)) {
+				set_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags);
+				skc->skc_obj_deadlock++;
+			}
+			spin_unlock(&skc->skc_lock);
+		}
+
+		rc = -ENOMEM;
+	}
+
+	return (rc);
+}
+
+/*
+ * Refill a per-cpu magazine with objects from the slabs for this cache.
+ * Ideally the magazine can be repopulated using existing objects which have
+ * been released, however if we are unable to locate enough free objects new
+ * slabs of objects will be created.  On success NULL is returned, otherwise
+ * the address of a single emergency object is returned for use by the caller.
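The clamp at the top of spl_cache_refill() below keeps a nearly full magazine from being overfilled; a tiny user-space model with example numbers:

#include <stdio.h>

/* User-space model of the clamp at the top of spl_cache_refill(). */
static int
refill_amount(int mag_size, int mag_avail, int mag_refill)
{
	int space = mag_size - mag_avail;

	return (mag_refill < space ? mag_refill : space);
}

int
main(void)
{
	/* e.g. a 64-entry magazine holding 60 objects, refill batch of 32 */
	printf("refill %d objects\n", refill_amount(64, 60, 32));
	return (0);
}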
+ */ +static void * +spl_cache_refill(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flags) +{ + spl_kmem_slab_t *sks; + int count = 0, rc, refill; + void *obj = NULL; + + ASSERT(skc->skc_magic == SKC_MAGIC); + ASSERT(skm->skm_magic == SKM_MAGIC); + + refill = MIN(skm->skm_refill, skm->skm_size - skm->skm_avail); + spin_lock(&skc->skc_lock); + + while (refill > 0) { + /* No slabs available we may need to grow the cache */ + if (list_empty(&skc->skc_partial_list)) { + spin_unlock(&skc->skc_lock); + + local_irq_enable(); + rc = spl_cache_grow(skc, flags, &obj); + local_irq_disable(); + + /* Emergency object for immediate use by caller */ + if (rc == 0 && obj != NULL) + return (obj); + + if (rc) + goto out; + + /* Rescheduled to different CPU skm is not local */ + if (skm != skc->skc_mag[smp_processor_id()]) + goto out; + + /* + * Potentially rescheduled to the same CPU but + * allocations may have occurred from this CPU while + * we were sleeping so recalculate max refill. + */ + refill = MIN(refill, skm->skm_size - skm->skm_avail); + + spin_lock(&skc->skc_lock); + continue; + } + + /* Grab the next available slab */ + sks = list_entry((&skc->skc_partial_list)->next, + spl_kmem_slab_t, sks_list); + ASSERT(sks->sks_magic == SKS_MAGIC); + ASSERT(sks->sks_ref < sks->sks_objs); + ASSERT(!list_empty(&sks->sks_free_list)); + + /* + * Consume as many objects as needed to refill the requested + * cache. We must also be careful not to overfill it. + */ + while (sks->sks_ref < sks->sks_objs && refill-- > 0 && + ++count) { + ASSERT(skm->skm_avail < skm->skm_size); + ASSERT(count < skm->skm_size); + skm->skm_objs[skm->skm_avail++] = + spl_cache_obj(skc, sks); + } + + /* Move slab to skc_complete_list when full */ + if (sks->sks_ref == sks->sks_objs) { + list_del(&sks->sks_list); + list_add(&sks->sks_list, &skc->skc_complete_list); + } + } + + spin_unlock(&skc->skc_lock); +out: + return (NULL); +} + +/* + * Release an object back to the slab from which it came. + */ +static void +spl_cache_shrink(spl_kmem_cache_t *skc, void *obj) +{ + spl_kmem_slab_t *sks = NULL; + spl_kmem_obj_t *sko = NULL; + + ASSERT(skc->skc_magic == SKC_MAGIC); + + sko = spl_sko_from_obj(skc, obj); + ASSERT(sko->sko_magic == SKO_MAGIC); + sks = sko->sko_slab; + ASSERT(sks->sks_magic == SKS_MAGIC); + ASSERT(sks->sks_cache == skc); + list_add(&sko->sko_list, &sks->sks_free_list); + + sks->sks_age = jiffies; + sks->sks_ref--; + skc->skc_obj_alloc--; + + /* + * Move slab to skc_partial_list when no longer full. Slabs + * are added to the head to keep the partial list is quasi-full + * sorted order. Fuller at the head, emptier at the tail. + */ + if (sks->sks_ref == (sks->sks_objs - 1)) { + list_del(&sks->sks_list); + list_add(&sks->sks_list, &skc->skc_partial_list); + } + + /* + * Move empty slabs to the end of the partial list so + * they can be easily found and freed during reclamation. + */ + if (sks->sks_ref == 0) { + list_del(&sks->sks_list); + list_add_tail(&sks->sks_list, &skc->skc_partial_list); + skc->skc_slab_alloc--; + } +} + +/* + * Allocate an object from the per-cpu magazine, or if the magazine + * is empty directly allocate from a slab and repopulate the magazine. + */ +void * +spl_kmem_cache_alloc(spl_kmem_cache_t *skc, int flags) +{ + spl_kmem_magazine_t *skm; + void *obj = NULL; + + ASSERT0(flags & ~KM_PUBLIC_MASK); + ASSERT(skc->skc_magic == SKC_MAGIC); + ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags)); + + /* + * Allocate directly from a Linux slab. 
All optimizations are left
+	 * to the underlying cache; we only need to guarantee that KM_SLEEP
+	 * callers will never fail.
+	 */
+	if (skc->skc_flags & KMC_SLAB) {
+		struct kmem_cache *slc = skc->skc_linux_cache;
+		do {
+			obj = kmem_cache_alloc(slc, kmem_flags_convert(flags));
+		} while ((obj == NULL) && !(flags & KM_NOSLEEP));
+
+		goto ret;
+	}
+
+	local_irq_disable();
+
+restart:
+	/*
+	 * Safe to update per-cpu structure without lock, but
+	 * in the restart case we must be careful to reacquire
+	 * the local magazine since this may have changed
+	 * when we need to grow the cache.
+	 */
+	skm = skc->skc_mag[smp_processor_id()];
+	ASSERT(skm->skm_magic == SKM_MAGIC);
+
+	if (likely(skm->skm_avail)) {
+		/* Object available in CPU cache, use it */
+		obj = skm->skm_objs[--skm->skm_avail];
+		skm->skm_age = jiffies;
+	} else {
+		obj = spl_cache_refill(skc, skm, flags);
+		if ((obj == NULL) && !(flags & KM_NOSLEEP))
+			goto restart;
+
+		local_irq_enable();
+		goto ret;
+	}
+
+	local_irq_enable();
+	ASSERT(obj);
+	ASSERT(IS_P2ALIGNED(obj, skc->skc_obj_align));
+
+ret:
+	/* Pre-emptively migrate object to CPU L1 cache */
+	if (obj) {
+		if (obj && skc->skc_ctor)
+			skc->skc_ctor(obj, skc->skc_private, flags);
+		else
+			prefetchw(obj);
+	}
+
+	return (obj);
+}
+EXPORT_SYMBOL(spl_kmem_cache_alloc);
+
+/*
+ * Free an object back to the local per-cpu magazine; there is no
+ * guarantee that this is the same magazine the object was originally
+ * allocated from.  We may need to flush entire magazines back to the
+ * slabs to make space.
+ */
+void
+spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj)
+{
+	spl_kmem_magazine_t *skm;
+	unsigned long flags;
+	int do_reclaim = 0;
+	int do_emergency = 0;
+
+	ASSERT(skc->skc_magic == SKC_MAGIC);
+	ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags));
+
+	/*
+	 * Run the destructor
+	 */
+	if (skc->skc_dtor)
+		skc->skc_dtor(obj, skc->skc_private);
+
+	/*
+	 * Free the object back to the underlying Linux slab.
+	 */
+	if (skc->skc_flags & KMC_SLAB) {
+		kmem_cache_free(skc->skc_linux_cache, obj);
+		return;
+	}
+
+	/*
+	 * While a cache has outstanding emergency objects all freed objects
+	 * must be checked.  However, since emergency objects will never use
+	 * a virtual address these objects can be safely excluded as an
+	 * optimization.
+	 */
+	if (!is_vmalloc_addr(obj)) {
+		spin_lock(&skc->skc_lock);
+		do_emergency = (skc->skc_obj_emergency > 0);
+		spin_unlock(&skc->skc_lock);
+
+		if (do_emergency && (spl_emergency_free(skc, obj) == 0))
+			return;
+	}
+
+	local_irq_save(flags);
+
+	/*
+	 * Safe to update the per-cpu structure without a lock, but because
+	 * no remote memory allocation tracking is performed it is entirely
+	 * possible to allocate an object from one CPU cache and return it
+	 * to another.
+	 */
+	skm = skc->skc_mag[smp_processor_id()];
+	ASSERT(skm->skm_magic == SKM_MAGIC);
+
+	/*
+	 * Per-CPU cache full, flush it to make space for this object; this
+	 * may result in an empty slab which can be reclaimed once
+	 * interrupts are re-enabled.
+	 */
+	if (unlikely(skm->skm_avail >= skm->skm_size)) {
+		spl_cache_flush(skc, skm, skm->skm_refill);
+		do_reclaim = 1;
+	}
+
+	/* Available space in cache, use it */
+	skm->skm_objs[skm->skm_avail++] = obj;
+
+	local_irq_restore(flags);
+
+	if (do_reclaim)
+		spl_slab_reclaim(skc);
+}
+EXPORT_SYMBOL(spl_kmem_cache_free);
+
+/*
+ * The generic shrinker function for all caches.  Under Linux a shrinker
+ * may not be tightly coupled with a slab cache.
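A compact model of the two questions a shrinker answers, as described above, folded into one helper the way the pre-3.12 interface expected (illustrative only, not kernel code):

/* A query (nr_to_scan == 0) asks how many objects could be freed; a scan
 * request asks to free up to nr_to_scan objects and report the result. */
static long
shrinker_model(unsigned long nr_to_scan, long freeable,
    long (*reap)(unsigned long))
{
	if (nr_to_scan == 0)
		return (freeable);		/* report what could be freed */

	return (reap(nr_to_scan));		/* free up to nr_to_scan objects */
}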
In fact Linux always + * systematically tries calling all registered shrinker callbacks which + * report that they contain unused objects. Because of this we only + * register one shrinker function in the shim layer for all slab caches. + * We always attempt to shrink all caches when this generic shrinker + * is called. + * + * If sc->nr_to_scan is zero, the caller is requesting a query of the + * number of objects which can potentially be freed. If it is nonzero, + * the request is to free that many objects. + * + * Linux kernels >= 3.12 have the count_objects and scan_objects callbacks + * in struct shrinker and also require the shrinker to return the number + * of objects freed. + * + * Older kernels require the shrinker to return the number of freeable + * objects following the freeing of nr_to_free. + * + * Linux semantics differ from those under Solaris, which are to + * free all available objects which may (and probably will) be more + * objects than the requested nr_to_scan. + */ +static spl_shrinker_t +__spl_kmem_cache_generic_shrinker(struct shrinker *shrink, + struct shrink_control *sc) +{ + spl_kmem_cache_t *skc; + int alloc = 0; + + /* + * No shrinking in a transaction context. Can cause deadlocks. + */ + if (sc->nr_to_scan && spl_fstrans_check()) + return (SHRINK_STOP); + + down_read(&spl_kmem_cache_sem); + list_for_each_entry(skc, &spl_kmem_cache_list, skc_list) { + if (sc->nr_to_scan) { +#ifdef HAVE_SPLIT_SHRINKER_CALLBACK + uint64_t oldalloc = skc->skc_obj_alloc; + spl_kmem_cache_reap_now(skc, + MAX(sc->nr_to_scan>>fls64(skc->skc_slab_objs), 1)); + if (oldalloc > skc->skc_obj_alloc) + alloc += oldalloc - skc->skc_obj_alloc; +#else + spl_kmem_cache_reap_now(skc, + MAX(sc->nr_to_scan>>fls64(skc->skc_slab_objs), 1)); + alloc += skc->skc_obj_alloc; +#endif /* HAVE_SPLIT_SHRINKER_CALLBACK */ + } else { + /* Request to query number of freeable objects */ + alloc += skc->skc_obj_alloc; + } + } + up_read(&spl_kmem_cache_sem); + + /* + * When KMC_RECLAIM_ONCE is set allow only a single reclaim pass. + * This functionality only exists to work around a rare issue where + * shrink_slabs() is repeatedly invoked by many cores causing the + * system to thrash. + */ + if ((spl_kmem_cache_reclaim & KMC_RECLAIM_ONCE) && sc->nr_to_scan) + return (SHRINK_STOP); + + return (MAX(alloc, 0)); +} + +SPL_SHRINKER_CALLBACK_WRAPPER(spl_kmem_cache_generic_shrinker); + +/* + * Call the registered reclaim function for a cache. Depending on how + * many and which objects are released it may simply repopulate the + * local magazine which will then need to age-out. Objects which cannot + * fit in the magazine we will be released back to their slabs which will + * also need to age out before being release. This is all just best + * effort and we do not want to thrash creating and destroying slabs. + */ +void +spl_kmem_cache_reap_now(spl_kmem_cache_t *skc, int count) +{ + ASSERT(skc->skc_magic == SKC_MAGIC); + ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags)); + + atomic_inc(&skc->skc_ref); + + /* + * Execute the registered reclaim callback if it exists. + */ + if (skc->skc_flags & KMC_SLAB) { + if (skc->skc_reclaim) + skc->skc_reclaim(skc->skc_private); + goto out; + } + + /* + * Prevent concurrent cache reaping when contended. + */ + if (test_and_set_bit(KMC_BIT_REAPING, &skc->skc_flags)) + goto out; + + /* + * When a reclaim function is available it may be invoked repeatedly + * until at least a single slab can be freed. This ensures that we + * do free memory back to the system. 
This helps minimize the chance + * of an OOM event when the bulk of memory is used by the slab. + * + * When free slabs are already available the reclaim callback will be + * skipped. Additionally, if no forward progress is detected despite + * a reclaim function the cache will be skipped to avoid deadlock. + * + * Longer term this would be the correct place to add the code which + * repacks the slabs in order minimize fragmentation. + */ + if (skc->skc_reclaim) { + uint64_t objects = UINT64_MAX; + int do_reclaim; + + do { + spin_lock(&skc->skc_lock); + do_reclaim = + (skc->skc_slab_total > 0) && + ((skc->skc_slab_total-skc->skc_slab_alloc) == 0) && + (skc->skc_obj_alloc < objects); + + objects = skc->skc_obj_alloc; + spin_unlock(&skc->skc_lock); + + if (do_reclaim) + skc->skc_reclaim(skc->skc_private); + + } while (do_reclaim); + } + + /* Reclaim from the magazine and free all now empty slabs. */ + if (spl_kmem_cache_expire & KMC_EXPIRE_MEM) { + spl_kmem_magazine_t *skm; + unsigned long irq_flags; + + local_irq_save(irq_flags); + skm = skc->skc_mag[smp_processor_id()]; + spl_cache_flush(skc, skm, skm->skm_avail); + local_irq_restore(irq_flags); + } + + spl_slab_reclaim(skc); + clear_bit_unlock(KMC_BIT_REAPING, &skc->skc_flags); + smp_mb__after_atomic(); + wake_up_bit(&skc->skc_flags, KMC_BIT_REAPING); +out: + atomic_dec(&skc->skc_ref); +} +EXPORT_SYMBOL(spl_kmem_cache_reap_now); + +/* + * Reap all free slabs from all registered caches. + */ +void +spl_kmem_reap(void) +{ + struct shrink_control sc; + + sc.nr_to_scan = KMC_REAP_CHUNK; + sc.gfp_mask = GFP_KERNEL; + + (void) __spl_kmem_cache_generic_shrinker(NULL, &sc); +} +EXPORT_SYMBOL(spl_kmem_reap); + +int +spl_kmem_cache_init(void) +{ + init_rwsem(&spl_kmem_cache_sem); + INIT_LIST_HEAD(&spl_kmem_cache_list); + spl_kmem_cache_taskq = taskq_create("spl_kmem_cache", + spl_kmem_cache_kmem_threads, maxclsyspri, + spl_kmem_cache_kmem_threads * 8, INT_MAX, + TASKQ_PREPOPULATE | TASKQ_DYNAMIC); + spl_register_shrinker(&spl_kmem_cache_shrinker); + + return (0); +} + +void +spl_kmem_cache_fini(void) +{ + spl_unregister_shrinker(&spl_kmem_cache_shrinker); + taskq_destroy(spl_kmem_cache_taskq); +} diff --git a/module/spl/spl-kmem.c b/module/spl/spl-kmem.c new file mode 100644 index 000000000..e0d551041 --- /dev/null +++ b/module/spl/spl-kmem.c @@ -0,0 +1,567 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf <[email protected]>. + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see <http://zfsonlinux.org/>. + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#include <sys/debug.h> +#include <sys/sysmacros.h> +#include <sys/kmem.h> +#include <sys/vmem.h> +#include <linux/mm.h> +#include <linux/ratelimit.h> + +/* + * As a general rule kmem_alloc() allocations should be small, preferably + * just a few pages since they must by physically contiguous. Therefore, a + * rate limited warning will be printed to the console for any kmem_alloc() + * which exceeds a reasonable threshold. + * + * The default warning threshold is set to sixteen pages but capped at 64K to + * accommodate systems using large pages. This value was selected to be small + * enough to ensure the largest allocations are quickly noticed and fixed. + * But large enough to avoid logging any warnings when a allocation size is + * larger than optimal but not a serious concern. Since this value is tunable, + * developers are encouraged to set it lower when testing so any new largish + * allocations are quickly caught. These warnings may be disabled by setting + * the threshold to zero. + */ +/* BEGIN CSTYLED */ +unsigned int spl_kmem_alloc_warn = MIN(16 * PAGE_SIZE, 64 * 1024); +module_param(spl_kmem_alloc_warn, uint, 0644); +MODULE_PARM_DESC(spl_kmem_alloc_warn, + "Warning threshold in bytes for a kmem_alloc()"); +EXPORT_SYMBOL(spl_kmem_alloc_warn); + +/* + * Large kmem_alloc() allocations will fail if they exceed KMALLOC_MAX_SIZE. + * Allocations which are marginally smaller than this limit may succeed but + * should still be avoided due to the expense of locating a contiguous range + * of free pages. Therefore, a maximum kmem size with reasonable safely + * margin of 4x is set. Kmem_alloc() allocations larger than this maximum + * will quickly fail. Vmem_alloc() allocations less than or equal to this + * value will use kmalloc(), but shift to vmalloc() when exceeding this value. + */ +unsigned int spl_kmem_alloc_max = (KMALLOC_MAX_SIZE >> 2); +module_param(spl_kmem_alloc_max, uint, 0644); +MODULE_PARM_DESC(spl_kmem_alloc_max, + "Maximum size in bytes for a kmem_alloc()"); +EXPORT_SYMBOL(spl_kmem_alloc_max); +/* END CSTYLED */ + +int +kmem_debugging(void) +{ + return (0); +} +EXPORT_SYMBOL(kmem_debugging); + +char * +kmem_vasprintf(const char *fmt, va_list ap) +{ + va_list aq; + char *ptr; + + do { + va_copy(aq, ap); + ptr = kvasprintf(kmem_flags_convert(KM_SLEEP), fmt, aq); + va_end(aq); + } while (ptr == NULL); + + return (ptr); +} +EXPORT_SYMBOL(kmem_vasprintf); + +char * +kmem_asprintf(const char *fmt, ...) +{ + va_list ap; + char *ptr; + + do { + va_start(ap, fmt); + ptr = kvasprintf(kmem_flags_convert(KM_SLEEP), fmt, ap); + va_end(ap); + } while (ptr == NULL); + + return (ptr); +} +EXPORT_SYMBOL(kmem_asprintf); + +static char * +__strdup(const char *str, int flags) +{ + char *ptr; + int n; + + n = strlen(str); + ptr = kmalloc(n + 1, kmem_flags_convert(flags)); + if (ptr) + memcpy(ptr, str, n + 1); + + return (ptr); +} + +char * +strdup(const char *str) +{ + return (__strdup(str, KM_SLEEP)); +} +EXPORT_SYMBOL(strdup); + +void +strfree(char *str) +{ + kfree(str); +} +EXPORT_SYMBOL(strfree); + +/* + * Limit the number of large allocation stack traces dumped to not more than + * 5 every 60 seconds to prevent denial-of-service attacks from debug code. + */ +DEFINE_RATELIMIT_STATE(kmem_alloc_ratelimit_state, 60 * HZ, 5); + +/* + * General purpose unified implementation of kmem_alloc(). It is an + * amalgamation of Linux and Illumos allocator design. It should never be + * exported to ensure that code using kmem_alloc()/kmem_zalloc() remains + * relatively portable. 
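In practice the split described above means small, physically contiguous buffers go through kmem_alloc()/kmem_zalloc() while anything large enough to trip the warning threshold goes through vmem_alloc(); a sketch assuming the public wrappers declared in the SPL headers (not part of this file):

static void
alloc_example(void)
{
	void *small, *large;

	small = kmem_zalloc(4096, KM_SLEEP);		/* a page or two: fine */
	large = vmem_alloc(4 * 1024 * 1024, KM_SLEEP);	/* 4MiB: use vmem */

	/* ... */

	vmem_free(large, 4 * 1024 * 1024);
	kmem_free(small, 4096);
}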
Consumers may only access this function through + * wrappers that enforce the common flags to ensure portability. + */ +inline void * +spl_kmem_alloc_impl(size_t size, int flags, int node) +{ + gfp_t lflags = kmem_flags_convert(flags); + int use_vmem = 0; + void *ptr; + + /* + * Log abnormally large allocations and rate limit the console output. + * Allocations larger than spl_kmem_alloc_warn should be performed + * through the vmem_alloc()/vmem_zalloc() interfaces. + */ + if ((spl_kmem_alloc_warn > 0) && (size > spl_kmem_alloc_warn) && + !(flags & KM_VMEM) && __ratelimit(&kmem_alloc_ratelimit_state)) { + printk(KERN_WARNING + "Large kmem_alloc(%lu, 0x%x), please file an issue at:\n" + "https://github.com/zfsonlinux/zfs/issues/new\n", + (unsigned long)size, flags); + dump_stack(); + } + + /* + * Use a loop because kmalloc_node() can fail when GFP_KERNEL is used + * unlike kmem_alloc() with KM_SLEEP on Illumos. + */ + do { + /* + * Calling kmalloc_node() when the size >= spl_kmem_alloc_max + * is unsafe. This must fail for all for kmem_alloc() and + * kmem_zalloc() callers. + * + * For vmem_alloc() and vmem_zalloc() callers it is permissible + * to use __vmalloc(). However, in general use of __vmalloc() + * is strongly discouraged because a global lock must be + * acquired. Contention on this lock can significantly + * impact performance so frequently manipulating the virtual + * address space is strongly discouraged. + */ + if ((size > spl_kmem_alloc_max) || use_vmem) { + if (flags & KM_VMEM) { + ptr = __vmalloc(size, lflags, PAGE_KERNEL); + } else { + return (NULL); + } + } else { + ptr = kmalloc_node(size, lflags, node); + } + + if (likely(ptr) || (flags & KM_NOSLEEP)) + return (ptr); + + /* + * For vmem_alloc() and vmem_zalloc() callers retry immediately + * using __vmalloc() which is unlikely to fail. + */ + if ((flags & KM_VMEM) && (use_vmem == 0)) { + use_vmem = 1; + continue; + } + + if (unlikely(__ratelimit(&kmem_alloc_ratelimit_state))) { + printk(KERN_WARNING + "Possible memory allocation deadlock: " + "size=%lu lflags=0x%x", + (unsigned long)size, lflags); + dump_stack(); + } + + /* + * Use cond_resched() instead of congestion_wait() to avoid + * deadlocking systems where there are no block devices. + */ + cond_resched(); + } while (1); + + return (NULL); +} + +inline void +spl_kmem_free_impl(const void *buf, size_t size) +{ + if (is_vmalloc_addr(buf)) + vfree(buf); + else + kfree(buf); +} + +/* + * Memory allocation and accounting for kmem_* * style allocations. When + * DEBUG_KMEM is enabled the total memory allocated will be tracked and + * any memory leaked will be reported during module unload. 
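A minimal user-space model of this accounting, mirroring the kmem_alloc_used/kmem_alloc_max counters defined just below (the kernel version uses atomic counters):

#include <stdio.h>

/* Running total of outstanding bytes plus a high-water mark. */
static long long used, used_max;

static void
account_alloc(long long size)
{
	used += size;
	if (used > used_max)
		used_max = used;
}

static void
account_free(long long size)
{
	used -= size;
}

int
main(void)
{
	account_alloc(4096);
	account_alloc(8192);
	account_free(4096);

	/* At "module unload" anything still counted in 'used' was leaked. */
	printf("leaked %lld/%lld bytes\n", used, used_max);
	return (0);
}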
+ * + * ./configure --enable-debug-kmem + */ +#ifdef DEBUG_KMEM + +/* Shim layer memory accounting */ +#ifdef HAVE_ATOMIC64_T +atomic64_t kmem_alloc_used = ATOMIC64_INIT(0); +unsigned long long kmem_alloc_max = 0; +#else /* HAVE_ATOMIC64_T */ +atomic_t kmem_alloc_used = ATOMIC_INIT(0); +unsigned long long kmem_alloc_max = 0; +#endif /* HAVE_ATOMIC64_T */ + +EXPORT_SYMBOL(kmem_alloc_used); +EXPORT_SYMBOL(kmem_alloc_max); + +inline void * +spl_kmem_alloc_debug(size_t size, int flags, int node) +{ + void *ptr; + + ptr = spl_kmem_alloc_impl(size, flags, node); + if (ptr) { + kmem_alloc_used_add(size); + if (unlikely(kmem_alloc_used_read() > kmem_alloc_max)) + kmem_alloc_max = kmem_alloc_used_read(); + } + + return (ptr); +} + +inline void +spl_kmem_free_debug(const void *ptr, size_t size) +{ + kmem_alloc_used_sub(size); + spl_kmem_free_impl(ptr, size); +} + +/* + * When DEBUG_KMEM_TRACKING is enabled not only will total bytes be tracked + * but also the location of every alloc and free. When the SPL module is + * unloaded a list of all leaked addresses and where they were allocated + * will be dumped to the console. Enabling this feature has a significant + * impact on performance but it makes finding memory leaks straight forward. + * + * Not surprisingly with debugging enabled the xmem_locks are very highly + * contended particularly on xfree(). If we want to run with this detailed + * debugging enabled for anything other than debugging we need to minimize + * the contention by moving to a lock per xmem_table entry model. + * + * ./configure --enable-debug-kmem-tracking + */ +#ifdef DEBUG_KMEM_TRACKING + +#include <linux/hash.h> +#include <linux/ctype.h> + +#define KMEM_HASH_BITS 10 +#define KMEM_TABLE_SIZE (1 << KMEM_HASH_BITS) + +typedef struct kmem_debug { + struct hlist_node kd_hlist; /* Hash node linkage */ + struct list_head kd_list; /* List of all allocations */ + void *kd_addr; /* Allocation pointer */ + size_t kd_size; /* Allocation size */ + const char *kd_func; /* Allocation function */ + int kd_line; /* Allocation line */ +} kmem_debug_t; + +static spinlock_t kmem_lock; +static struct hlist_head kmem_table[KMEM_TABLE_SIZE]; +static struct list_head kmem_list; + +static kmem_debug_t * +kmem_del_init(spinlock_t *lock, struct hlist_head *table, + int bits, const void *addr) +{ + struct hlist_head *head; + struct hlist_node *node; + struct kmem_debug *p; + unsigned long flags; + + spin_lock_irqsave(lock, flags); + + head = &table[hash_ptr((void *)addr, bits)]; + hlist_for_each(node, head) { + p = list_entry(node, struct kmem_debug, kd_hlist); + if (p->kd_addr == addr) { + hlist_del_init(&p->kd_hlist); + list_del_init(&p->kd_list); + spin_unlock_irqrestore(lock, flags); + return (p); + } + } + + spin_unlock_irqrestore(lock, flags); + + return (NULL); +} + +inline void * +spl_kmem_alloc_track(size_t size, int flags, + const char *func, int line, int node) +{ + void *ptr = NULL; + kmem_debug_t *dptr; + unsigned long irq_flags; + + dptr = kmalloc(sizeof (kmem_debug_t), kmem_flags_convert(flags)); + if (dptr == NULL) + return (NULL); + + dptr->kd_func = __strdup(func, flags); + if (dptr->kd_func == NULL) { + kfree(dptr); + return (NULL); + } + + ptr = spl_kmem_alloc_debug(size, flags, node); + if (ptr == NULL) { + kfree(dptr->kd_func); + kfree(dptr); + return (NULL); + } + + INIT_HLIST_NODE(&dptr->kd_hlist); + INIT_LIST_HEAD(&dptr->kd_list); + + dptr->kd_addr = ptr; + dptr->kd_size = size; + dptr->kd_line = line; + + spin_lock_irqsave(&kmem_lock, irq_flags); + 
hlist_add_head(&dptr->kd_hlist, + &kmem_table[hash_ptr(ptr, KMEM_HASH_BITS)]); + list_add_tail(&dptr->kd_list, &kmem_list); + spin_unlock_irqrestore(&kmem_lock, irq_flags); + + return (ptr); +} + +inline void +spl_kmem_free_track(const void *ptr, size_t size) +{ + kmem_debug_t *dptr; + + /* Ignore NULL pointer since we haven't tracked it at all */ + if (ptr == NULL) + return; + + /* Must exist in hash due to kmem_alloc() */ + dptr = kmem_del_init(&kmem_lock, kmem_table, KMEM_HASH_BITS, ptr); + ASSERT3P(dptr, !=, NULL); + ASSERT3S(dptr->kd_size, ==, size); + + kfree(dptr->kd_func); + kfree(dptr); + + spl_kmem_free_debug(ptr, size); +} +#endif /* DEBUG_KMEM_TRACKING */ +#endif /* DEBUG_KMEM */ + +/* + * Public kmem_alloc(), kmem_zalloc() and kmem_free() interfaces. + */ +void * +spl_kmem_alloc(size_t size, int flags, const char *func, int line) +{ + ASSERT0(flags & ~KM_PUBLIC_MASK); + +#if !defined(DEBUG_KMEM) + return (spl_kmem_alloc_impl(size, flags, NUMA_NO_NODE)); +#elif !defined(DEBUG_KMEM_TRACKING) + return (spl_kmem_alloc_debug(size, flags, NUMA_NO_NODE)); +#else + return (spl_kmem_alloc_track(size, flags, func, line, NUMA_NO_NODE)); +#endif +} +EXPORT_SYMBOL(spl_kmem_alloc); + +void * +spl_kmem_zalloc(size_t size, int flags, const char *func, int line) +{ + ASSERT0(flags & ~KM_PUBLIC_MASK); + + flags |= KM_ZERO; + +#if !defined(DEBUG_KMEM) + return (spl_kmem_alloc_impl(size, flags, NUMA_NO_NODE)); +#elif !defined(DEBUG_KMEM_TRACKING) + return (spl_kmem_alloc_debug(size, flags, NUMA_NO_NODE)); +#else + return (spl_kmem_alloc_track(size, flags, func, line, NUMA_NO_NODE)); +#endif +} +EXPORT_SYMBOL(spl_kmem_zalloc); + +void +spl_kmem_free(const void *buf, size_t size) +{ +#if !defined(DEBUG_KMEM) + return (spl_kmem_free_impl(buf, size)); +#elif !defined(DEBUG_KMEM_TRACKING) + return (spl_kmem_free_debug(buf, size)); +#else + return (spl_kmem_free_track(buf, size)); +#endif +} +EXPORT_SYMBOL(spl_kmem_free); + +#if defined(DEBUG_KMEM) && defined(DEBUG_KMEM_TRACKING) +static char * +spl_sprintf_addr(kmem_debug_t *kd, char *str, int len, int min) +{ + int size = ((len - 1) < kd->kd_size) ? (len - 1) : kd->kd_size; + int i, flag = 1; + + ASSERT(str != NULL && len >= 17); + memset(str, 0, len); + + /* + * Check for a fully printable string, and while we are at + * it place the printable characters in the passed buffer. + */ + for (i = 0; i < size; i++) { + str[i] = ((char *)(kd->kd_addr))[i]; + if (isprint(str[i])) { + continue; + } else { + /* + * Minimum number of printable characters found + * to make it worthwhile to print this as ascii. 
+ */ + if (i > min) + break; + + flag = 0; + break; + } + } + + if (!flag) { + sprintf(str, "%02x%02x%02x%02x%02x%02x%02x%02x", + *((uint8_t *)kd->kd_addr), + *((uint8_t *)kd->kd_addr + 2), + *((uint8_t *)kd->kd_addr + 4), + *((uint8_t *)kd->kd_addr + 6), + *((uint8_t *)kd->kd_addr + 8), + *((uint8_t *)kd->kd_addr + 10), + *((uint8_t *)kd->kd_addr + 12), + *((uint8_t *)kd->kd_addr + 14)); + } + + return (str); +} + +static int +spl_kmem_init_tracking(struct list_head *list, spinlock_t *lock, int size) +{ + int i; + + spin_lock_init(lock); + INIT_LIST_HEAD(list); + + for (i = 0; i < size; i++) + INIT_HLIST_HEAD(&kmem_table[i]); + + return (0); +} + +static void +spl_kmem_fini_tracking(struct list_head *list, spinlock_t *lock) +{ + unsigned long flags; + kmem_debug_t *kd; + char str[17]; + + spin_lock_irqsave(lock, flags); + if (!list_empty(list)) + printk(KERN_WARNING "%-16s %-5s %-16s %s:%s\n", "address", + "size", "data", "func", "line"); + + list_for_each_entry(kd, list, kd_list) { + printk(KERN_WARNING "%p %-5d %-16s %s:%d\n", kd->kd_addr, + (int)kd->kd_size, spl_sprintf_addr(kd, str, 17, 8), + kd->kd_func, kd->kd_line); + } + + spin_unlock_irqrestore(lock, flags); +} +#endif /* DEBUG_KMEM && DEBUG_KMEM_TRACKING */ + +int +spl_kmem_init(void) +{ +#ifdef DEBUG_KMEM + kmem_alloc_used_set(0); + +#ifdef DEBUG_KMEM_TRACKING + spl_kmem_init_tracking(&kmem_list, &kmem_lock, KMEM_TABLE_SIZE); +#endif /* DEBUG_KMEM_TRACKING */ +#endif /* DEBUG_KMEM */ + + return (0); +} + +void +spl_kmem_fini(void) +{ +#ifdef DEBUG_KMEM + /* + * Display all unreclaimed memory addresses, including the + * allocation size and the first few bytes of what's located + * at that address to aid in debugging. Performance is not + * a serious concern here since it is module unload time. + */ + if (kmem_alloc_used_read() != 0) + printk(KERN_WARNING "kmem leaked %ld/%llu bytes\n", + (unsigned long)kmem_alloc_used_read(), kmem_alloc_max); + +#ifdef DEBUG_KMEM_TRACKING + spl_kmem_fini_tracking(&kmem_list, &kmem_lock); +#endif /* DEBUG_KMEM_TRACKING */ +#endif /* DEBUG_KMEM */ +} diff --git a/module/spl/spl-kobj.c b/module/spl/spl-kobj.c new file mode 100644 index 000000000..7019369bd --- /dev/null +++ b/module/spl/spl-kobj.c @@ -0,0 +1,86 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf <[email protected]>. + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see <http://zfsonlinux.org/>. + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see <http://www.gnu.org/licenses/>. + * + * Solaris Porting Layer (SPL) Kobj Implementation. 
+ */ + +#include <sys/kobj.h> + +struct _buf * +kobj_open_file(const char *name) +{ + struct _buf *file; + vnode_t *vp; + int rc; + + file = kmalloc(sizeof (_buf_t), kmem_flags_convert(KM_SLEEP)); + if (file == NULL) + return ((_buf_t *)-1UL); + + if ((rc = vn_open(name, UIO_SYSSPACE, FREAD, 0644, &vp, 0, 0))) { + kfree(file); + return ((_buf_t *)-1UL); + } + + file->vp = vp; + + return (file); +} /* kobj_open_file() */ +EXPORT_SYMBOL(kobj_open_file); + +void +kobj_close_file(struct _buf *file) +{ + VOP_CLOSE(file->vp, 0, 0, 0, 0, 0); + kfree(file); +} /* kobj_close_file() */ +EXPORT_SYMBOL(kobj_close_file); + +int +kobj_read_file(struct _buf *file, char *buf, unsigned size, unsigned off) +{ + ssize_t resid; + + if (vn_rdwr(UIO_READ, file->vp, buf, size, (offset_t)off, + UIO_SYSSPACE, 0, 0, 0, &resid) != 0) + return (-1); + + return (size - resid); +} /* kobj_read_file() */ +EXPORT_SYMBOL(kobj_read_file); + +int +kobj_get_filesize(struct _buf *file, uint64_t *size) +{ + vattr_t vap; + int rc; + + rc = VOP_GETATTR(file->vp, &vap, 0, 0, NULL); + if (rc) + return (rc); + + *size = vap.va_size; + + return (rc); +} /* kobj_get_filesize() */ +EXPORT_SYMBOL(kobj_get_filesize); diff --git a/module/spl/spl-kstat.c b/module/spl/spl-kstat.c new file mode 100644 index 000000000..bcbff94a6 --- /dev/null +++ b/module/spl/spl-kstat.c @@ -0,0 +1,733 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf <[email protected]>. + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see <http://zfsonlinux.org/>. + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see <http://www.gnu.org/licenses/>. + * + * Solaris Porting Layer (SPL) Kstat Implementation. 
+ */ + +#include <linux/seq_file.h> +#include <sys/kstat.h> +#include <sys/vmem.h> +#include <sys/cmn_err.h> +#include <sys/sysmacros.h> + +#ifndef HAVE_PDE_DATA +#define PDE_DATA(x) (PDE(x)->data) +#endif + +static kmutex_t kstat_module_lock; +static struct list_head kstat_module_list; +static kid_t kstat_id; + +static int +kstat_resize_raw(kstat_t *ksp) +{ + if (ksp->ks_raw_bufsize == KSTAT_RAW_MAX) + return (ENOMEM); + + vmem_free(ksp->ks_raw_buf, ksp->ks_raw_bufsize); + ksp->ks_raw_bufsize = MIN(ksp->ks_raw_bufsize * 2, KSTAT_RAW_MAX); + ksp->ks_raw_buf = vmem_alloc(ksp->ks_raw_bufsize, KM_SLEEP); + + return (0); +} + +void +kstat_waitq_enter(kstat_io_t *kiop) +{ + hrtime_t new, delta; + ulong_t wcnt; + + new = gethrtime(); + delta = new - kiop->wlastupdate; + kiop->wlastupdate = new; + wcnt = kiop->wcnt++; + if (wcnt != 0) { + kiop->wlentime += delta * wcnt; + kiop->wtime += delta; + } +} +EXPORT_SYMBOL(kstat_waitq_enter); + +void +kstat_waitq_exit(kstat_io_t *kiop) +{ + hrtime_t new, delta; + ulong_t wcnt; + + new = gethrtime(); + delta = new - kiop->wlastupdate; + kiop->wlastupdate = new; + wcnt = kiop->wcnt--; + ASSERT((int)wcnt > 0); + kiop->wlentime += delta * wcnt; + kiop->wtime += delta; +} +EXPORT_SYMBOL(kstat_waitq_exit); + +void +kstat_runq_enter(kstat_io_t *kiop) +{ + hrtime_t new, delta; + ulong_t rcnt; + + new = gethrtime(); + delta = new - kiop->rlastupdate; + kiop->rlastupdate = new; + rcnt = kiop->rcnt++; + if (rcnt != 0) { + kiop->rlentime += delta * rcnt; + kiop->rtime += delta; + } +} +EXPORT_SYMBOL(kstat_runq_enter); + +void +kstat_runq_exit(kstat_io_t *kiop) +{ + hrtime_t new, delta; + ulong_t rcnt; + + new = gethrtime(); + delta = new - kiop->rlastupdate; + kiop->rlastupdate = new; + rcnt = kiop->rcnt--; + ASSERT((int)rcnt > 0); + kiop->rlentime += delta * rcnt; + kiop->rtime += delta; +} +EXPORT_SYMBOL(kstat_runq_exit); + +static int +kstat_seq_show_headers(struct seq_file *f) +{ + kstat_t *ksp = (kstat_t *)f->private; + int rc = 0; + + ASSERT(ksp->ks_magic == KS_MAGIC); + + seq_printf(f, "%d %d 0x%02x %d %d %lld %lld\n", + ksp->ks_kid, ksp->ks_type, ksp->ks_flags, + ksp->ks_ndata, (int)ksp->ks_data_size, + ksp->ks_crtime, ksp->ks_snaptime); + + switch (ksp->ks_type) { + case KSTAT_TYPE_RAW: +restart: + if (ksp->ks_raw_ops.headers) { + rc = ksp->ks_raw_ops.headers( + ksp->ks_raw_buf, ksp->ks_raw_bufsize); + if (rc == ENOMEM && !kstat_resize_raw(ksp)) + goto restart; + if (!rc) + seq_puts(f, ksp->ks_raw_buf); + } else { + seq_printf(f, "raw data\n"); + } + break; + case KSTAT_TYPE_NAMED: + seq_printf(f, "%-31s %-4s %s\n", + "name", "type", "data"); + break; + case KSTAT_TYPE_INTR: + seq_printf(f, "%-8s %-8s %-8s %-8s %-8s\n", + "hard", "soft", "watchdog", + "spurious", "multsvc"); + break; + case KSTAT_TYPE_IO: + seq_printf(f, + "%-8s %-8s %-8s %-8s %-8s %-8s " + "%-8s %-8s %-8s %-8s %-8s %-8s\n", + "nread", "nwritten", "reads", "writes", + "wtime", "wlentime", "wupdate", + "rtime", "rlentime", "rupdate", + "wcnt", "rcnt"); + break; + case KSTAT_TYPE_TIMER: + seq_printf(f, + "%-31s %-8s " + "%-8s %-8s %-8s %-8s %-8s\n", + "name", "events", "elapsed", + "min", "max", "start", "stop"); + break; + default: + PANIC("Undefined kstat type %d\n", ksp->ks_type); + } + + return (-rc); +} + +static int +kstat_seq_show_raw(struct seq_file *f, unsigned char *p, int l) +{ + int i, j; + + for (i = 0; ; i++) { + seq_printf(f, "%03x:", i); + + for (j = 0; j < 16; j++) { + if (i * 16 + j >= l) { + seq_printf(f, "\n"); + goto out; + } + + seq_printf(f, " %02x", (unsigned char)p[i 
* 16 + j]); + } + seq_printf(f, "\n"); + } +out: + return (0); +} + +static int +kstat_seq_show_named(struct seq_file *f, kstat_named_t *knp) +{ + seq_printf(f, "%-31s %-4d ", knp->name, knp->data_type); + + switch (knp->data_type) { + case KSTAT_DATA_CHAR: + knp->value.c[15] = '\0'; /* NULL terminate */ + seq_printf(f, "%-16s", knp->value.c); + break; + /* + * NOTE - We need to be more careful able what tokens are + * used for each arch, for now this is correct for x86_64. + */ + case KSTAT_DATA_INT32: + seq_printf(f, "%d", knp->value.i32); + break; + case KSTAT_DATA_UINT32: + seq_printf(f, "%u", knp->value.ui32); + break; + case KSTAT_DATA_INT64: + seq_printf(f, "%lld", (signed long long)knp->value.i64); + break; + case KSTAT_DATA_UINT64: + seq_printf(f, "%llu", + (unsigned long long)knp->value.ui64); + break; + case KSTAT_DATA_LONG: + seq_printf(f, "%ld", knp->value.l); + break; + case KSTAT_DATA_ULONG: + seq_printf(f, "%lu", knp->value.ul); + break; + case KSTAT_DATA_STRING: + KSTAT_NAMED_STR_PTR(knp) + [KSTAT_NAMED_STR_BUFLEN(knp)-1] = '\0'; + seq_printf(f, "%s", KSTAT_NAMED_STR_PTR(knp)); + break; + default: + PANIC("Undefined kstat data type %d\n", knp->data_type); + } + + seq_printf(f, "\n"); + + return (0); +} + +static int +kstat_seq_show_intr(struct seq_file *f, kstat_intr_t *kip) +{ + seq_printf(f, "%-8u %-8u %-8u %-8u %-8u\n", + kip->intrs[KSTAT_INTR_HARD], + kip->intrs[KSTAT_INTR_SOFT], + kip->intrs[KSTAT_INTR_WATCHDOG], + kip->intrs[KSTAT_INTR_SPURIOUS], + kip->intrs[KSTAT_INTR_MULTSVC]); + + return (0); +} + +static int +kstat_seq_show_io(struct seq_file *f, kstat_io_t *kip) +{ + seq_printf(f, + "%-8llu %-8llu %-8u %-8u %-8lld %-8lld " + "%-8lld %-8lld %-8lld %-8lld %-8u %-8u\n", + kip->nread, kip->nwritten, + kip->reads, kip->writes, + kip->wtime, kip->wlentime, kip->wlastupdate, + kip->rtime, kip->rlentime, kip->rlastupdate, + kip->wcnt, kip->rcnt); + + return (0); +} + +static int +kstat_seq_show_timer(struct seq_file *f, kstat_timer_t *ktp) +{ + seq_printf(f, + "%-31s %-8llu %-8lld %-8lld %-8lld %-8lld %-8lld\n", + ktp->name, ktp->num_events, ktp->elapsed_time, + ktp->min_time, ktp->max_time, + ktp->start_time, ktp->stop_time); + + return (0); +} + +static int +kstat_seq_show(struct seq_file *f, void *p) +{ + kstat_t *ksp = (kstat_t *)f->private; + int rc = 0; + + ASSERT(ksp->ks_magic == KS_MAGIC); + + switch (ksp->ks_type) { + case KSTAT_TYPE_RAW: +restart: + if (ksp->ks_raw_ops.data) { + rc = ksp->ks_raw_ops.data( + ksp->ks_raw_buf, ksp->ks_raw_bufsize, p); + if (rc == ENOMEM && !kstat_resize_raw(ksp)) + goto restart; + if (!rc) + seq_puts(f, ksp->ks_raw_buf); + } else { + ASSERT(ksp->ks_ndata == 1); + rc = kstat_seq_show_raw(f, ksp->ks_data, + ksp->ks_data_size); + } + break; + case KSTAT_TYPE_NAMED: + rc = kstat_seq_show_named(f, (kstat_named_t *)p); + break; + case KSTAT_TYPE_INTR: + rc = kstat_seq_show_intr(f, (kstat_intr_t *)p); + break; + case KSTAT_TYPE_IO: + rc = kstat_seq_show_io(f, (kstat_io_t *)p); + break; + case KSTAT_TYPE_TIMER: + rc = kstat_seq_show_timer(f, (kstat_timer_t *)p); + break; + default: + PANIC("Undefined kstat type %d\n", ksp->ks_type); + } + + return (-rc); +} + +static int +kstat_default_update(kstat_t *ksp, int rw) +{ + ASSERT(ksp != NULL); + + if (rw == KSTAT_WRITE) + return (EACCES); + + return (0); +} + +static void * +kstat_seq_data_addr(kstat_t *ksp, loff_t n) +{ + void *rc = NULL; + + switch (ksp->ks_type) { + case KSTAT_TYPE_RAW: + if (ksp->ks_raw_ops.addr) + rc = ksp->ks_raw_ops.addr(ksp, n); + else + rc = ksp->ks_data; + break; 
+ case KSTAT_TYPE_NAMED: + rc = ksp->ks_data + n * sizeof (kstat_named_t); + break; + case KSTAT_TYPE_INTR: + rc = ksp->ks_data + n * sizeof (kstat_intr_t); + break; + case KSTAT_TYPE_IO: + rc = ksp->ks_data + n * sizeof (kstat_io_t); + break; + case KSTAT_TYPE_TIMER: + rc = ksp->ks_data + n * sizeof (kstat_timer_t); + break; + default: + PANIC("Undefined kstat type %d\n", ksp->ks_type); + } + + return (rc); +} + +static void * +kstat_seq_start(struct seq_file *f, loff_t *pos) +{ + loff_t n = *pos; + kstat_t *ksp = (kstat_t *)f->private; + ASSERT(ksp->ks_magic == KS_MAGIC); + + mutex_enter(ksp->ks_lock); + + if (ksp->ks_type == KSTAT_TYPE_RAW) { + ksp->ks_raw_bufsize = PAGE_SIZE; + ksp->ks_raw_buf = vmem_alloc(ksp->ks_raw_bufsize, KM_SLEEP); + } + + /* Dynamically update kstat, on error existing kstats are used */ + (void) ksp->ks_update(ksp, KSTAT_READ); + + ksp->ks_snaptime = gethrtime(); + + if (!n && kstat_seq_show_headers(f)) + return (NULL); + + if (n >= ksp->ks_ndata) + return (NULL); + + return (kstat_seq_data_addr(ksp, n)); +} + +static void * +kstat_seq_next(struct seq_file *f, void *p, loff_t *pos) +{ + kstat_t *ksp = (kstat_t *)f->private; + ASSERT(ksp->ks_magic == KS_MAGIC); + + ++*pos; + if (*pos >= ksp->ks_ndata) + return (NULL); + + return (kstat_seq_data_addr(ksp, *pos)); +} + +static void +kstat_seq_stop(struct seq_file *f, void *v) +{ + kstat_t *ksp = (kstat_t *)f->private; + ASSERT(ksp->ks_magic == KS_MAGIC); + + if (ksp->ks_type == KSTAT_TYPE_RAW) + vmem_free(ksp->ks_raw_buf, ksp->ks_raw_bufsize); + + mutex_exit(ksp->ks_lock); +} + +static struct seq_operations kstat_seq_ops = { + .show = kstat_seq_show, + .start = kstat_seq_start, + .next = kstat_seq_next, + .stop = kstat_seq_stop, +}; + +static kstat_module_t * +kstat_find_module(char *name) +{ + kstat_module_t *module; + + list_for_each_entry(module, &kstat_module_list, ksm_module_list) { + if (strncmp(name, module->ksm_name, KSTAT_STRLEN) == 0) + return (module); + } + + return (NULL); +} + +static kstat_module_t * +kstat_create_module(char *name) +{ + kstat_module_t *module; + struct proc_dir_entry *pde; + + pde = proc_mkdir(name, proc_spl_kstat); + if (pde == NULL) + return (NULL); + + module = kmem_alloc(sizeof (kstat_module_t), KM_SLEEP); + module->ksm_proc = pde; + strlcpy(module->ksm_name, name, KSTAT_STRLEN+1); + INIT_LIST_HEAD(&module->ksm_kstat_list); + list_add_tail(&module->ksm_module_list, &kstat_module_list); + + return (module); + +} + +static void +kstat_delete_module(kstat_module_t *module) +{ + ASSERT(list_empty(&module->ksm_kstat_list)); + remove_proc_entry(module->ksm_name, proc_spl_kstat); + list_del(&module->ksm_module_list); + kmem_free(module, sizeof (kstat_module_t)); +} + +static int +proc_kstat_open(struct inode *inode, struct file *filp) +{ + struct seq_file *f; + int rc; + + rc = seq_open(filp, &kstat_seq_ops); + if (rc) + return (rc); + + f = filp->private_data; + f->private = PDE_DATA(inode); + + return (rc); +} + +static ssize_t +proc_kstat_write(struct file *filp, const char __user *buf, size_t len, + loff_t *ppos) +{ + struct seq_file *f = filp->private_data; + kstat_t *ksp = f->private; + int rc; + + ASSERT(ksp->ks_magic == KS_MAGIC); + + mutex_enter(ksp->ks_lock); + rc = ksp->ks_update(ksp, KSTAT_WRITE); + mutex_exit(ksp->ks_lock); + + if (rc) + return (-rc); + + *ppos += len; + return (len); +} + +static struct file_operations proc_kstat_operations = { + .open = proc_kstat_open, + .write = proc_kstat_write, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; 
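For completeness, a sketch of how a named kstat built on the functions in this file is created and installed so that it shows up under /proc/spl/kstat/<module>/<name>. Consumers normally reach these through the kstat_create()/kstat_install() wrappers in the SPL headers (assumed), and the module and statistic names below are hypothetical:

static kstat_t *example_ksp;

static void
example_kstat_init(void)
{
	kstat_named_t *kn;

	example_ksp = __kstat_create("example", 0, "stats", "misc",
	    KSTAT_TYPE_NAMED, 1, 0);
	if (example_ksp == NULL)
		return;

	/* ks_data was allocated by __kstat_create() for one named entry. */
	kn = (kstat_named_t *)example_ksp->ks_data;
	strncpy(kn->name, "hits", KSTAT_STRLEN);
	kn->data_type = KSTAT_DATA_UINT64;
	kn->value.ui64 = 0;

	__kstat_install(example_ksp);
}

static void
example_kstat_fini(void)
{
	if (example_ksp != NULL)
		__kstat_delete(example_ksp);
}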
+ +void +__kstat_set_raw_ops(kstat_t *ksp, + int (*headers)(char *buf, size_t size), + int (*data)(char *buf, size_t size, void *data), + void *(*addr)(kstat_t *ksp, loff_t index)) +{ + ksp->ks_raw_ops.headers = headers; + ksp->ks_raw_ops.data = data; + ksp->ks_raw_ops.addr = addr; +} +EXPORT_SYMBOL(__kstat_set_raw_ops); + +kstat_t * +__kstat_create(const char *ks_module, int ks_instance, const char *ks_name, + const char *ks_class, uchar_t ks_type, uint_t ks_ndata, + uchar_t ks_flags) +{ + kstat_t *ksp; + + ASSERT(ks_module); + ASSERT(ks_instance == 0); + ASSERT(ks_name); + ASSERT(!(ks_flags & KSTAT_FLAG_UNSUPPORTED)); + + if ((ks_type == KSTAT_TYPE_INTR) || (ks_type == KSTAT_TYPE_IO)) + ASSERT(ks_ndata == 1); + + ksp = kmem_zalloc(sizeof (*ksp), KM_SLEEP); + if (ksp == NULL) + return (ksp); + + mutex_enter(&kstat_module_lock); + ksp->ks_kid = kstat_id; + kstat_id++; + mutex_exit(&kstat_module_lock); + + ksp->ks_magic = KS_MAGIC; + mutex_init(&ksp->ks_private_lock, NULL, MUTEX_DEFAULT, NULL); + ksp->ks_lock = &ksp->ks_private_lock; + INIT_LIST_HEAD(&ksp->ks_list); + + ksp->ks_crtime = gethrtime(); + ksp->ks_snaptime = ksp->ks_crtime; + strncpy(ksp->ks_module, ks_module, KSTAT_STRLEN); + ksp->ks_instance = ks_instance; + strncpy(ksp->ks_name, ks_name, KSTAT_STRLEN); + strncpy(ksp->ks_class, ks_class, KSTAT_STRLEN); + ksp->ks_type = ks_type; + ksp->ks_flags = ks_flags; + ksp->ks_update = kstat_default_update; + ksp->ks_private = NULL; + ksp->ks_raw_ops.headers = NULL; + ksp->ks_raw_ops.data = NULL; + ksp->ks_raw_ops.addr = NULL; + ksp->ks_raw_buf = NULL; + ksp->ks_raw_bufsize = 0; + + switch (ksp->ks_type) { + case KSTAT_TYPE_RAW: + ksp->ks_ndata = 1; + ksp->ks_data_size = ks_ndata; + break; + case KSTAT_TYPE_NAMED: + ksp->ks_ndata = ks_ndata; + ksp->ks_data_size = ks_ndata * sizeof (kstat_named_t); + break; + case KSTAT_TYPE_INTR: + ksp->ks_ndata = ks_ndata; + ksp->ks_data_size = ks_ndata * sizeof (kstat_intr_t); + break; + case KSTAT_TYPE_IO: + ksp->ks_ndata = ks_ndata; + ksp->ks_data_size = ks_ndata * sizeof (kstat_io_t); + break; + case KSTAT_TYPE_TIMER: + ksp->ks_ndata = ks_ndata; + ksp->ks_data_size = ks_ndata * sizeof (kstat_timer_t); + break; + default: + PANIC("Undefined kstat type %d\n", ksp->ks_type); + } + + if (ksp->ks_flags & KSTAT_FLAG_VIRTUAL) { + ksp->ks_data = NULL; + } else { + ksp->ks_data = kmem_zalloc(ksp->ks_data_size, KM_SLEEP); + if (ksp->ks_data == NULL) { + kmem_free(ksp, sizeof (*ksp)); + ksp = NULL; + } + } + + return (ksp); +} +EXPORT_SYMBOL(__kstat_create); + +static int +kstat_detect_collision(kstat_t *ksp) +{ + kstat_module_t *module; + kstat_t *tmp; + char *parent; + char *cp; + + parent = kmem_asprintf("%s", ksp->ks_module); + + if ((cp = strrchr(parent, '/')) == NULL) { + strfree(parent); + return (0); + } + + cp[0] = '\0'; + if ((module = kstat_find_module(parent)) != NULL) { + list_for_each_entry(tmp, &module->ksm_kstat_list, ks_list) { + if (strncmp(tmp->ks_name, cp+1, KSTAT_STRLEN) == 0) { + strfree(parent); + return (EEXIST); + } + } + } + + strfree(parent); + return (0); +} + +void +__kstat_install(kstat_t *ksp) +{ + kstat_module_t *module; + kstat_t *tmp; + + ASSERT(ksp); + + mutex_enter(&kstat_module_lock); + + module = kstat_find_module(ksp->ks_module); + if (module == NULL) { + if (kstat_detect_collision(ksp) != 0) { + cmn_err(CE_WARN, "kstat_create('%s', '%s'): namespace" \ + " collision", ksp->ks_module, ksp->ks_name); + goto out; + } + module = kstat_create_module(ksp->ks_module); + if (module == NULL) + goto out; + } + + /* + * Only one 
entry by this name per-module, on failure the module + * shouldn't be deleted because we know it has at least one entry. + */ + list_for_each_entry(tmp, &module->ksm_kstat_list, ks_list) { + if (strncmp(tmp->ks_name, ksp->ks_name, KSTAT_STRLEN) == 0) + goto out; + } + + list_add_tail(&ksp->ks_list, &module->ksm_kstat_list); + + mutex_enter(ksp->ks_lock); + ksp->ks_owner = module; + ksp->ks_proc = proc_create_data(ksp->ks_name, 0644, + module->ksm_proc, &proc_kstat_operations, (void *)ksp); + if (ksp->ks_proc == NULL) { + list_del_init(&ksp->ks_list); + if (list_empty(&module->ksm_kstat_list)) + kstat_delete_module(module); + } + mutex_exit(ksp->ks_lock); +out: + mutex_exit(&kstat_module_lock); +} +EXPORT_SYMBOL(__kstat_install); + +void +__kstat_delete(kstat_t *ksp) +{ + kstat_module_t *module = ksp->ks_owner; + + mutex_enter(&kstat_module_lock); + list_del_init(&ksp->ks_list); + mutex_exit(&kstat_module_lock); + + if (ksp->ks_proc) { + remove_proc_entry(ksp->ks_name, module->ksm_proc); + + /* Remove top level module directory if it's empty */ + if (list_empty(&module->ksm_kstat_list)) + kstat_delete_module(module); + } + + if (!(ksp->ks_flags & KSTAT_FLAG_VIRTUAL)) + kmem_free(ksp->ks_data, ksp->ks_data_size); + + ksp->ks_lock = NULL; + mutex_destroy(&ksp->ks_private_lock); + kmem_free(ksp, sizeof (*ksp)); +} +EXPORT_SYMBOL(__kstat_delete); + +int +spl_kstat_init(void) +{ + mutex_init(&kstat_module_lock, NULL, MUTEX_DEFAULT, NULL); + INIT_LIST_HEAD(&kstat_module_list); + kstat_id = 0; + return (0); +} + +void +spl_kstat_fini(void) +{ + ASSERT(list_empty(&kstat_module_list)); + mutex_destroy(&kstat_module_lock); +} diff --git a/module/spl/spl-mutex.c b/module/spl/spl-mutex.c new file mode 100644 index 000000000..ba818862b --- /dev/null +++ b/module/spl/spl-mutex.c @@ -0,0 +1,30 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf <[email protected]>. + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see <http://zfsonlinux.org/>. + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see <http://www.gnu.org/licenses/>. + * + * Solaris Porting Layer (SPL) Mutex Implementation. + */ + +#include <sys/mutex.h> + +int spl_mutex_init(void) { return 0; } +void spl_mutex_fini(void) { } diff --git a/module/spl/spl-proc.c b/module/spl/spl-proc.c new file mode 100644 index 000000000..9c52924a4 --- /dev/null +++ b/module/spl/spl-proc.c @@ -0,0 +1,782 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf <[email protected]>. + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. 
+ * For details, see <http://zfsonlinux.org/>. + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see <http://www.gnu.org/licenses/>. + * + * Solaris Porting Layer (SPL) Proc Implementation. + */ + +#include <sys/systeminfo.h> +#include <sys/kstat.h> +#include <sys/kmem.h> +#include <sys/kmem_cache.h> +#include <sys/vmem.h> +#include <sys/taskq.h> +#include <sys/proc.h> +#include <linux/ctype.h> +#include <linux/kmod.h> +#include <linux/seq_file.h> +#include <linux/uaccess.h> +#include <linux/version.h> + +#if defined(CONSTIFY_PLUGIN) && LINUX_VERSION_CODE >= KERNEL_VERSION(3, 8, 0) +typedef struct ctl_table __no_const spl_ctl_table; +#else +typedef struct ctl_table spl_ctl_table; +#endif + +static unsigned long table_min = 0; +static unsigned long table_max = ~0; + +static struct ctl_table_header *spl_header = NULL; +static struct proc_dir_entry *proc_spl = NULL; +static struct proc_dir_entry *proc_spl_kmem = NULL; +static struct proc_dir_entry *proc_spl_kmem_slab = NULL; +static struct proc_dir_entry *proc_spl_taskq_all = NULL; +static struct proc_dir_entry *proc_spl_taskq = NULL; +struct proc_dir_entry *proc_spl_kstat = NULL; + +static int +proc_copyin_string(char *kbuffer, int kbuffer_size, const char *ubuffer, + int ubuffer_size) +{ + int size; + + if (ubuffer_size > kbuffer_size) + return (-EOVERFLOW); + + if (copy_from_user((void *)kbuffer, (void *)ubuffer, ubuffer_size)) + return (-EFAULT); + + /* strip trailing whitespace */ + size = strnlen(kbuffer, ubuffer_size); + while (size-- >= 0) + if (!isspace(kbuffer[size])) + break; + + /* empty string */ + if (size < 0) + return (-EINVAL); + + /* no space to terminate */ + if (size == kbuffer_size) + return (-EOVERFLOW); + + kbuffer[size + 1] = 0; + return (0); +} + +static int +proc_copyout_string(char *ubuffer, int ubuffer_size, const char *kbuffer, + char *append) +{ + /* + * NB if 'append' != NULL, it's a single character to append to the + * copied out string - usually "\n", for /proc entries and + * (i.e. 
a terminating zero byte) for sysctl entries + */ + int size = MIN(strlen(kbuffer), ubuffer_size); + + if (copy_to_user(ubuffer, kbuffer, size)) + return (-EFAULT); + + if (append != NULL && size < ubuffer_size) { + if (copy_to_user(ubuffer + size, append, 1)) + return (-EFAULT); + + size++; + } + + return (size); +} + +#ifdef DEBUG_KMEM +static int +proc_domemused(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int rc = 0; + unsigned long min = 0, max = ~0, val; + spl_ctl_table dummy = *table; + + dummy.data = &val; + dummy.proc_handler = &proc_dointvec; + dummy.extra1 = &min; + dummy.extra2 = &max; + + if (write) { + *ppos += *lenp; + } else { +#ifdef HAVE_ATOMIC64_T + val = atomic64_read((atomic64_t *)table->data); +#else + val = atomic_read((atomic_t *)table->data); +#endif /* HAVE_ATOMIC64_T */ + rc = proc_doulongvec_minmax(&dummy, write, buffer, lenp, ppos); + } + + return (rc); +} +#endif /* DEBUG_KMEM */ + +static int +proc_doslab(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int rc = 0; + unsigned long min = 0, max = ~0, val = 0, mask; + spl_ctl_table dummy = *table; + spl_kmem_cache_t *skc; + + dummy.data = &val; + dummy.proc_handler = &proc_dointvec; + dummy.extra1 = &min; + dummy.extra2 = &max; + + if (write) { + *ppos += *lenp; + } else { + down_read(&spl_kmem_cache_sem); + mask = (unsigned long)table->data; + + list_for_each_entry(skc, &spl_kmem_cache_list, skc_list) { + + /* Only use slabs of the correct kmem/vmem type */ + if (!(skc->skc_flags & mask)) + continue; + + /* Sum the specified field for selected slabs */ + switch (mask & (KMC_TOTAL | KMC_ALLOC | KMC_MAX)) { + case KMC_TOTAL: + val += skc->skc_slab_size * skc->skc_slab_total; + break; + case KMC_ALLOC: + val += skc->skc_obj_size * skc->skc_obj_alloc; + break; + case KMC_MAX: + val += skc->skc_obj_size * skc->skc_obj_max; + break; + } + } + + up_read(&spl_kmem_cache_sem); + rc = proc_doulongvec_minmax(&dummy, write, buffer, lenp, ppos); + } + + return (rc); +} + +static int +proc_dohostid(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int len, rc = 0; + char *end, str[32]; + + if (write) { + /* + * We can't use proc_doulongvec_minmax() in the write + * case here because hostid while a hex value has no + * leading 0x which confuses the helper function. 
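+		 *
+		 * Usage sketch, illustrative only (path assumed from the
+		 * sysctl table registered below):
+		 *
+		 *	echo 1a2b3c4d > /proc/sys/kernel/spl/hostid
+		 *
+		 * is copied in via proc_copyin_string() and parsed with
+		 * simple_strtoul(str, &end, 16), leaving 0x1a2b3c4d in
+		 * spl_hostid.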
+ */ + rc = proc_copyin_string(str, sizeof (str), buffer, *lenp); + if (rc < 0) + return (rc); + + spl_hostid = simple_strtoul(str, &end, 16); + if (str == end) + return (-EINVAL); + + } else { + len = snprintf(str, sizeof (str), "%lx", + (unsigned long) zone_get_hostid(NULL)); + if (*ppos >= len) + rc = 0; + else + rc = proc_copyout_string(buffer, + *lenp, str + *ppos, "\n"); + + if (rc >= 0) { + *lenp = rc; + *ppos += rc; + } + } + + return (rc); +} + +static void +taskq_seq_show_headers(struct seq_file *f) +{ + seq_printf(f, "%-25s %5s %5s %5s %5s %5s %5s %12s %5s %10s\n", + "taskq", "act", "nthr", "spwn", "maxt", "pri", + "mina", "maxa", "cura", "flags"); +} + +/* indices into the lheads array below */ +#define LHEAD_PEND 0 +#define LHEAD_PRIO 1 +#define LHEAD_DELAY 2 +#define LHEAD_WAIT 3 +#define LHEAD_ACTIVE 4 +#define LHEAD_SIZE 5 + +/* BEGIN CSTYLED */ +static unsigned int spl_max_show_tasks = 512; +module_param(spl_max_show_tasks, uint, 0644); +MODULE_PARM_DESC(spl_max_show_tasks, "Max number of tasks shown in taskq proc"); +/* END CSTYLED */ + +static int +taskq_seq_show_impl(struct seq_file *f, void *p, boolean_t allflag) +{ + taskq_t *tq = p; + taskq_thread_t *tqt; + spl_wait_queue_entry_t *wq; + struct task_struct *tsk; + taskq_ent_t *tqe; + char name[100]; + struct list_head *lheads[LHEAD_SIZE], *lh; + static char *list_names[LHEAD_SIZE] = + {"pend", "prio", "delay", "wait", "active" }; + int i, j, have_lheads = 0; + unsigned long wflags, flags; + + spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class); + spin_lock_irqsave(&tq->tq_wait_waitq.lock, wflags); + + /* get the various lists and check whether they're empty */ + lheads[LHEAD_PEND] = &tq->tq_pend_list; + lheads[LHEAD_PRIO] = &tq->tq_prio_list; + lheads[LHEAD_DELAY] = &tq->tq_delay_list; +#ifdef HAVE_WAIT_QUEUE_HEAD_ENTRY + lheads[LHEAD_WAIT] = &tq->tq_wait_waitq.head; +#else + lheads[LHEAD_WAIT] = &tq->tq_wait_waitq.task_list; +#endif + lheads[LHEAD_ACTIVE] = &tq->tq_active_list; + + for (i = 0; i < LHEAD_SIZE; ++i) { + if (list_empty(lheads[i])) + lheads[i] = NULL; + else + ++have_lheads; + } + + /* early return in non-"all" mode if lists are all empty */ + if (!allflag && !have_lheads) { + spin_unlock_irqrestore(&tq->tq_wait_waitq.lock, wflags); + spin_unlock_irqrestore(&tq->tq_lock, flags); + return (0); + } + + /* unlock the waitq quickly */ + if (!lheads[LHEAD_WAIT]) + spin_unlock_irqrestore(&tq->tq_wait_waitq.lock, wflags); + + /* show the base taskq contents */ + snprintf(name, sizeof (name), "%s/%d", tq->tq_name, tq->tq_instance); + seq_printf(f, "%-25s ", name); + seq_printf(f, "%5d %5d %5d %5d %5d %5d %12d %5d %10x\n", + tq->tq_nactive, tq->tq_nthreads, tq->tq_nspawn, + tq->tq_maxthreads, tq->tq_pri, tq->tq_minalloc, tq->tq_maxalloc, + tq->tq_nalloc, tq->tq_flags); + + /* show the active list */ + if (lheads[LHEAD_ACTIVE]) { + j = 0; + list_for_each_entry(tqt, &tq->tq_active_list, tqt_active_list) { + if (j == 0) + seq_printf(f, "\t%s:", + list_names[LHEAD_ACTIVE]); + else if (j == 2) { + seq_printf(f, "\n\t "); + j = 0; + } + seq_printf(f, " [%d]%pf(%ps)", + tqt->tqt_thread->pid, + tqt->tqt_task->tqent_func, + tqt->tqt_task->tqent_arg); + ++j; + } + seq_printf(f, "\n"); + } + + for (i = LHEAD_PEND; i <= LHEAD_WAIT; ++i) + if (lheads[i]) { + j = 0; + list_for_each(lh, lheads[i]) { + if (spl_max_show_tasks != 0 && + j >= spl_max_show_tasks) { + seq_printf(f, "\n\t(truncated)"); + break; + } + /* show the wait waitq list */ + if (i == LHEAD_WAIT) { +#ifdef HAVE_WAIT_QUEUE_HEAD_ENTRY + wq = 
list_entry(lh, + spl_wait_queue_entry_t, entry); +#else + wq = list_entry(lh, + spl_wait_queue_entry_t, task_list); +#endif + if (j == 0) + seq_printf(f, "\t%s:", + list_names[i]); + else if (j % 8 == 0) + seq_printf(f, "\n\t "); + + tsk = wq->private; + seq_printf(f, " %d", tsk->pid); + /* pend, prio and delay lists */ + } else { + tqe = list_entry(lh, taskq_ent_t, + tqent_list); + if (j == 0) + seq_printf(f, "\t%s:", + list_names[i]); + else if (j % 2 == 0) + seq_printf(f, "\n\t "); + + seq_printf(f, " %pf(%ps)", + tqe->tqent_func, + tqe->tqent_arg); + } + ++j; + } + seq_printf(f, "\n"); + } + if (lheads[LHEAD_WAIT]) + spin_unlock_irqrestore(&tq->tq_wait_waitq.lock, wflags); + spin_unlock_irqrestore(&tq->tq_lock, flags); + + return (0); +} + +static int +taskq_all_seq_show(struct seq_file *f, void *p) +{ + return (taskq_seq_show_impl(f, p, B_TRUE)); +} + +static int +taskq_seq_show(struct seq_file *f, void *p) +{ + return (taskq_seq_show_impl(f, p, B_FALSE)); +} + +static void * +taskq_seq_start(struct seq_file *f, loff_t *pos) +{ + struct list_head *p; + loff_t n = *pos; + + down_read(&tq_list_sem); + if (!n) + taskq_seq_show_headers(f); + + p = tq_list.next; + while (n--) { + p = p->next; + if (p == &tq_list) + return (NULL); + } + + return (list_entry(p, taskq_t, tq_taskqs)); +} + +static void * +taskq_seq_next(struct seq_file *f, void *p, loff_t *pos) +{ + taskq_t *tq = p; + + ++*pos; + return ((tq->tq_taskqs.next == &tq_list) ? + NULL : list_entry(tq->tq_taskqs.next, taskq_t, tq_taskqs)); +} + +static void +slab_seq_show_headers(struct seq_file *f) +{ + seq_printf(f, + "--------------------- cache ----------" + "--------------------------------------------- " + "----- slab ------ " + "---- object ----- " + "--- emergency ---\n"); + seq_printf(f, + "name " + " flags size alloc slabsize objsize " + "total alloc max " + "total alloc max " + "dlock alloc max\n"); +} + +static int +slab_seq_show(struct seq_file *f, void *p) +{ + spl_kmem_cache_t *skc = p; + + ASSERT(skc->skc_magic == SKC_MAGIC); + + /* + * Backed by Linux slab see /proc/slabinfo. + */ + if (skc->skc_flags & KMC_SLAB) + return (0); + + spin_lock(&skc->skc_lock); + seq_printf(f, "%-36s ", skc->skc_name); + seq_printf(f, "0x%05lx %9lu %9lu %8u %8u " + "%5lu %5lu %5lu %5lu %5lu %5lu %5lu %5lu %5lu\n", + (long unsigned)skc->skc_flags, + (long unsigned)(skc->skc_slab_size * skc->skc_slab_total), + (long unsigned)(skc->skc_obj_size * skc->skc_obj_alloc), + (unsigned)skc->skc_slab_size, + (unsigned)skc->skc_obj_size, + (long unsigned)skc->skc_slab_total, + (long unsigned)skc->skc_slab_alloc, + (long unsigned)skc->skc_slab_max, + (long unsigned)skc->skc_obj_total, + (long unsigned)skc->skc_obj_alloc, + (long unsigned)skc->skc_obj_max, + (long unsigned)skc->skc_obj_deadlock, + (long unsigned)skc->skc_obj_emergency, + (long unsigned)skc->skc_obj_emergency_max); + + spin_unlock(&skc->skc_lock); + + return (0); +} + +static void * +slab_seq_start(struct seq_file *f, loff_t *pos) +{ + struct list_head *p; + loff_t n = *pos; + + down_read(&spl_kmem_cache_sem); + if (!n) + slab_seq_show_headers(f); + + p = spl_kmem_cache_list.next; + while (n--) { + p = p->next; + if (p == &spl_kmem_cache_list) + return (NULL); + } + + return (list_entry(p, spl_kmem_cache_t, skc_list)); +} + +static void * +slab_seq_next(struct seq_file *f, void *p, loff_t *pos) +{ + spl_kmem_cache_t *skc = p; + + ++*pos; + return ((skc->skc_list.next == &spl_kmem_cache_list) ? 
+ NULL : list_entry(skc->skc_list.next, spl_kmem_cache_t, skc_list)); +} + +static void +slab_seq_stop(struct seq_file *f, void *v) +{ + up_read(&spl_kmem_cache_sem); +} + +static struct seq_operations slab_seq_ops = { + .show = slab_seq_show, + .start = slab_seq_start, + .next = slab_seq_next, + .stop = slab_seq_stop, +}; + +static int +proc_slab_open(struct inode *inode, struct file *filp) +{ + return (seq_open(filp, &slab_seq_ops)); +} + +static struct file_operations proc_slab_operations = { + .open = proc_slab_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static void +taskq_seq_stop(struct seq_file *f, void *v) +{ + up_read(&tq_list_sem); +} + +static struct seq_operations taskq_all_seq_ops = { + .show = taskq_all_seq_show, + .start = taskq_seq_start, + .next = taskq_seq_next, + .stop = taskq_seq_stop, +}; + +static struct seq_operations taskq_seq_ops = { + .show = taskq_seq_show, + .start = taskq_seq_start, + .next = taskq_seq_next, + .stop = taskq_seq_stop, +}; + +static int +proc_taskq_all_open(struct inode *inode, struct file *filp) +{ + return (seq_open(filp, &taskq_all_seq_ops)); +} + +static int +proc_taskq_open(struct inode *inode, struct file *filp) +{ + return (seq_open(filp, &taskq_seq_ops)); +} + +static struct file_operations proc_taskq_all_operations = { + .open = proc_taskq_all_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static struct file_operations proc_taskq_operations = { + .open = proc_taskq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static struct ctl_table spl_kmem_table[] = { +#ifdef DEBUG_KMEM + { + .procname = "kmem_used", + .data = &kmem_alloc_used, +#ifdef HAVE_ATOMIC64_T + .maxlen = sizeof (atomic64_t), +#else + .maxlen = sizeof (atomic_t), +#endif /* HAVE_ATOMIC64_T */ + .mode = 0444, + .proc_handler = &proc_domemused, + }, + { + .procname = "kmem_max", + .data = &kmem_alloc_max, + .maxlen = sizeof (unsigned long), + .extra1 = &table_min, + .extra2 = &table_max, + .mode = 0444, + .proc_handler = &proc_doulongvec_minmax, + }, +#endif /* DEBUG_KMEM */ + { + .procname = "slab_kmem_total", + .data = (void *)(KMC_KMEM | KMC_TOTAL), + .maxlen = sizeof (unsigned long), + .extra1 = &table_min, + .extra2 = &table_max, + .mode = 0444, + .proc_handler = &proc_doslab, + }, + { + .procname = "slab_kmem_alloc", + .data = (void *)(KMC_KMEM | KMC_ALLOC), + .maxlen = sizeof (unsigned long), + .extra1 = &table_min, + .extra2 = &table_max, + .mode = 0444, + .proc_handler = &proc_doslab, + }, + { + .procname = "slab_kmem_max", + .data = (void *)(KMC_KMEM | KMC_MAX), + .maxlen = sizeof (unsigned long), + .extra1 = &table_min, + .extra2 = &table_max, + .mode = 0444, + .proc_handler = &proc_doslab, + }, + { + .procname = "slab_vmem_total", + .data = (void *)(KMC_VMEM | KMC_TOTAL), + .maxlen = sizeof (unsigned long), + .extra1 = &table_min, + .extra2 = &table_max, + .mode = 0444, + .proc_handler = &proc_doslab, + }, + { + .procname = "slab_vmem_alloc", + .data = (void *)(KMC_VMEM | KMC_ALLOC), + .maxlen = sizeof (unsigned long), + .extra1 = &table_min, + .extra2 = &table_max, + .mode = 0444, + .proc_handler = &proc_doslab, + }, + { + .procname = "slab_vmem_max", + .data = (void *)(KMC_VMEM | KMC_MAX), + .maxlen = sizeof (unsigned long), + .extra1 = &table_min, + .extra2 = &table_max, + .mode = 0444, + .proc_handler = &proc_doslab, + }, + {}, +}; + +static struct ctl_table spl_kstat_table[] = { + {}, +}; + +static struct ctl_table spl_table[] = { + /* + * NB No .strategy 
entries have been provided since + * sysctl(8) prefers to go via /proc for portability. + */ + { + .procname = "version", + .data = spl_version, + .maxlen = sizeof (spl_version), + .mode = 0444, + .proc_handler = &proc_dostring, + }, + { + .procname = "hostid", + .data = &spl_hostid, + .maxlen = sizeof (unsigned long), + .mode = 0644, + .proc_handler = &proc_dohostid, + }, + { + .procname = "kmem", + .mode = 0555, + .child = spl_kmem_table, + }, + { + .procname = "kstat", + .mode = 0555, + .child = spl_kstat_table, + }, + {}, +}; + +static struct ctl_table spl_dir[] = { + { + .procname = "spl", + .mode = 0555, + .child = spl_table, + }, + {} +}; + +static struct ctl_table spl_root[] = { + { +#ifdef HAVE_CTL_NAME + .ctl_name = CTL_KERN, +#endif + .procname = "kernel", + .mode = 0555, + .child = spl_dir, + }, + {} +}; + +int +spl_proc_init(void) +{ + int rc = 0; + + spl_header = register_sysctl_table(spl_root); + if (spl_header == NULL) + return (-EUNATCH); + + proc_spl = proc_mkdir("spl", NULL); + if (proc_spl == NULL) { + rc = -EUNATCH; + goto out; + } + + proc_spl_taskq_all = proc_create_data("taskq-all", 0444, proc_spl, + &proc_taskq_all_operations, NULL); + if (proc_spl_taskq_all == NULL) { + rc = -EUNATCH; + goto out; + } + + proc_spl_taskq = proc_create_data("taskq", 0444, proc_spl, + &proc_taskq_operations, NULL); + if (proc_spl_taskq == NULL) { + rc = -EUNATCH; + goto out; + } + + proc_spl_kmem = proc_mkdir("kmem", proc_spl); + if (proc_spl_kmem == NULL) { + rc = -EUNATCH; + goto out; + } + + proc_spl_kmem_slab = proc_create_data("slab", 0444, proc_spl_kmem, + &proc_slab_operations, NULL); + if (proc_spl_kmem_slab == NULL) { + rc = -EUNATCH; + goto out; + } + + proc_spl_kstat = proc_mkdir("kstat", proc_spl); + if (proc_spl_kstat == NULL) { + rc = -EUNATCH; + goto out; + } +out: + if (rc) { + remove_proc_entry("kstat", proc_spl); + remove_proc_entry("slab", proc_spl_kmem); + remove_proc_entry("kmem", proc_spl); + remove_proc_entry("taskq-all", proc_spl); + remove_proc_entry("taskq", proc_spl); + remove_proc_entry("spl", NULL); + unregister_sysctl_table(spl_header); + } + + return (rc); +} + +void +spl_proc_fini(void) +{ + remove_proc_entry("kstat", proc_spl); + remove_proc_entry("slab", proc_spl_kmem); + remove_proc_entry("kmem", proc_spl); + remove_proc_entry("taskq-all", proc_spl); + remove_proc_entry("taskq", proc_spl); + remove_proc_entry("spl", NULL); + + ASSERT(spl_header != NULL); + unregister_sysctl_table(spl_header); +} diff --git a/module/spl/spl-rwlock.c b/module/spl/spl-rwlock.c new file mode 100644 index 000000000..9a992cc3a --- /dev/null +++ b/module/spl/spl-rwlock.c @@ -0,0 +1,114 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf <[email protected]>. + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see <http://zfsonlinux.org/>. + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. 
+ * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see <http://www.gnu.org/licenses/>. + * + * Solaris Porting Layer (SPL) Reader/Writer Lock Implementation. + */ + +#include <sys/rwlock.h> + +#if defined(CONFIG_PREEMPT_RT_FULL) + +#include <linux/rtmutex.h> +#define RT_MUTEX_OWNER_MASKALL 1UL + +static int +__rwsem_tryupgrade(struct rw_semaphore *rwsem) +{ + + ASSERT((struct task_struct *) + ((unsigned long)rwsem->lock.owner & ~RT_MUTEX_OWNER_MASKALL) == + current); + + /* + * Under the realtime patch series, rwsem is implemented as a + * single mutex held by readers and writers alike. However, + * this implementation would prevent a thread from taking a + * read lock twice, as the mutex would already be locked on + * the second attempt. Therefore the implementation allows a + * single thread to take a rwsem as read lock multiple times + * tracking that nesting as read_depth counter. + */ + if (rwsem->read_depth <= 1) { + /* + * In case, the current thread has not taken the lock + * more than once as read lock, we can allow an + * upgrade to a write lock. rwsem_rt.h implements + * write locks as read_depth == 0. + */ + rwsem->read_depth = 0; + return (1); + } + return (0); +} +#elif defined(CONFIG_RWSEM_GENERIC_SPINLOCK) +static int +__rwsem_tryupgrade(struct rw_semaphore *rwsem) +{ + int ret = 0; + unsigned long flags; + spl_rwsem_lock_irqsave(&rwsem->wait_lock, flags); + if (RWSEM_COUNT(rwsem) == SPL_RWSEM_SINGLE_READER_VALUE && + list_empty(&rwsem->wait_list)) { + ret = 1; + RWSEM_COUNT(rwsem) = SPL_RWSEM_SINGLE_WRITER_VALUE; + } + spl_rwsem_unlock_irqrestore(&rwsem->wait_lock, flags); + return (ret); +} +#elif defined(HAVE_RWSEM_ATOMIC_LONG_COUNT) +static int +__rwsem_tryupgrade(struct rw_semaphore *rwsem) +{ + long val; + val = atomic_long_cmpxchg(&rwsem->count, SPL_RWSEM_SINGLE_READER_VALUE, + SPL_RWSEM_SINGLE_WRITER_VALUE); + return (val == SPL_RWSEM_SINGLE_READER_VALUE); +} +#else +static int +__rwsem_tryupgrade(struct rw_semaphore *rwsem) +{ + typeof(rwsem->count) val; + val = cmpxchg(&rwsem->count, SPL_RWSEM_SINGLE_READER_VALUE, + SPL_RWSEM_SINGLE_WRITER_VALUE); + return (val == SPL_RWSEM_SINGLE_READER_VALUE); +} +#endif + +int +rwsem_tryupgrade(struct rw_semaphore *rwsem) +{ + if (__rwsem_tryupgrade(rwsem)) { + rwsem_release(&rwsem->dep_map, 1, _RET_IP_); + rwsem_acquire(&rwsem->dep_map, 0, 1, _RET_IP_); +#ifdef CONFIG_RWSEM_SPIN_ON_OWNER + rwsem->owner = current; +#endif + return (1); + } + return (0); +} +EXPORT_SYMBOL(rwsem_tryupgrade); + +int spl_rw_init(void) { return 0; } +void spl_rw_fini(void) { } diff --git a/module/spl/spl-taskq.c b/module/spl/spl-taskq.c new file mode 100644 index 000000000..2919a942a --- /dev/null +++ b/module/spl/spl-taskq.c @@ -0,0 +1,1305 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf <[email protected]>. + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see <http://zfsonlinux.org/>. + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. 
+ * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see <http://www.gnu.org/licenses/>. + * + * Solaris Porting Layer (SPL) Task Queue Implementation. + */ + +#include <sys/taskq.h> +#include <sys/kmem.h> +#include <sys/tsd.h> + +int spl_taskq_thread_bind = 0; +module_param(spl_taskq_thread_bind, int, 0644); +MODULE_PARM_DESC(spl_taskq_thread_bind, "Bind taskq thread to CPU by default"); + + +int spl_taskq_thread_dynamic = 1; +module_param(spl_taskq_thread_dynamic, int, 0644); +MODULE_PARM_DESC(spl_taskq_thread_dynamic, "Allow dynamic taskq threads"); + +int spl_taskq_thread_priority = 1; +module_param(spl_taskq_thread_priority, int, 0644); +MODULE_PARM_DESC(spl_taskq_thread_priority, + "Allow non-default priority for taskq threads"); + +int spl_taskq_thread_sequential = 4; +module_param(spl_taskq_thread_sequential, int, 0644); +MODULE_PARM_DESC(spl_taskq_thread_sequential, + "Create new taskq threads after N sequential tasks"); + +/* Global system-wide dynamic task queue available for all consumers */ +taskq_t *system_taskq; +EXPORT_SYMBOL(system_taskq); +/* Global dynamic task queue for long delay */ +taskq_t *system_delay_taskq; +EXPORT_SYMBOL(system_delay_taskq); + +/* Private dedicated taskq for creating new taskq threads on demand. */ +static taskq_t *dynamic_taskq; +static taskq_thread_t *taskq_thread_create(taskq_t *); + +/* List of all taskqs */ +LIST_HEAD(tq_list); +DECLARE_RWSEM(tq_list_sem); +static uint_t taskq_tsd; + +static int +task_km_flags(uint_t flags) +{ + if (flags & TQ_NOSLEEP) + return (KM_NOSLEEP); + + if (flags & TQ_PUSHPAGE) + return (KM_PUSHPAGE); + + return (KM_SLEEP); +} + +/* + * taskq_find_by_name - Find the largest instance number of a named taskq. + */ +static int +taskq_find_by_name(const char *name) +{ + struct list_head *tql; + taskq_t *tq; + + list_for_each_prev(tql, &tq_list) { + tq = list_entry(tql, taskq_t, tq_taskqs); + if (strcmp(name, tq->tq_name) == 0) + return (tq->tq_instance); + } + return (-1); +} + +/* + * NOTE: Must be called with tq->tq_lock held, returns a list_t which + * is not attached to the free, work, or pending taskq lists. + */ +static taskq_ent_t * +task_alloc(taskq_t *tq, uint_t flags, unsigned long *irqflags) +{ + taskq_ent_t *t; + int count = 0; + + ASSERT(tq); +retry: + /* Acquire taskq_ent_t's from free list if available */ + if (!list_empty(&tq->tq_free_list) && !(flags & TQ_NEW)) { + t = list_entry(tq->tq_free_list.next, taskq_ent_t, tqent_list); + + ASSERT(!(t->tqent_flags & TQENT_FLAG_PREALLOC)); + ASSERT(!(t->tqent_flags & TQENT_FLAG_CANCEL)); + ASSERT(!timer_pending(&t->tqent_timer)); + + list_del_init(&t->tqent_list); + return (t); + } + + /* Free list is empty and memory allocations are prohibited */ + if (flags & TQ_NOALLOC) + return (NULL); + + /* Hit maximum taskq_ent_t pool size */ + if (tq->tq_nalloc >= tq->tq_maxalloc) { + if (flags & TQ_NOSLEEP) + return (NULL); + + /* + * Sleep periodically polling the free list for an available + * taskq_ent_t. Dispatching with TQ_SLEEP should always succeed + * but we cannot block forever waiting for an taskq_ent_t to + * show up in the free list, otherwise a deadlock can happen. 
+ * + * Therefore, we need to allocate a new task even if the number + * of allocated tasks is above tq->tq_maxalloc, but we still + * end up delaying the task allocation by one second, thereby + * throttling the task dispatch rate. + */ + spin_unlock_irqrestore(&tq->tq_lock, *irqflags); + schedule_timeout(HZ / 100); + spin_lock_irqsave_nested(&tq->tq_lock, *irqflags, + tq->tq_lock_class); + if (count < 100) { + count++; + goto retry; + } + } + + spin_unlock_irqrestore(&tq->tq_lock, *irqflags); + t = kmem_alloc(sizeof (taskq_ent_t), task_km_flags(flags)); + spin_lock_irqsave_nested(&tq->tq_lock, *irqflags, tq->tq_lock_class); + + if (t) { + taskq_init_ent(t); + tq->tq_nalloc++; + } + + return (t); +} + +/* + * NOTE: Must be called with tq->tq_lock held, expects the taskq_ent_t + * to already be removed from the free, work, or pending taskq lists. + */ +static void +task_free(taskq_t *tq, taskq_ent_t *t) +{ + ASSERT(tq); + ASSERT(t); + ASSERT(list_empty(&t->tqent_list)); + ASSERT(!timer_pending(&t->tqent_timer)); + + kmem_free(t, sizeof (taskq_ent_t)); + tq->tq_nalloc--; +} + +/* + * NOTE: Must be called with tq->tq_lock held, either destroys the + * taskq_ent_t if too many exist or moves it to the free list for later use. + */ +static void +task_done(taskq_t *tq, taskq_ent_t *t) +{ + ASSERT(tq); + ASSERT(t); + + /* Wake tasks blocked in taskq_wait_id() */ + wake_up_all(&t->tqent_waitq); + + list_del_init(&t->tqent_list); + + if (tq->tq_nalloc <= tq->tq_minalloc) { + t->tqent_id = TASKQID_INVALID; + t->tqent_func = NULL; + t->tqent_arg = NULL; + t->tqent_flags = 0; + + list_add_tail(&t->tqent_list, &tq->tq_free_list); + } else { + task_free(tq, t); + } +} + +/* + * When a delayed task timer expires remove it from the delay list and + * add it to the priority list in order for immediate processing. + */ +static void +task_expire_impl(taskq_ent_t *t) +{ + taskq_ent_t *w; + taskq_t *tq = t->tqent_taskq; + struct list_head *l; + unsigned long flags; + + spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class); + + if (t->tqent_flags & TQENT_FLAG_CANCEL) { + ASSERT(list_empty(&t->tqent_list)); + spin_unlock_irqrestore(&tq->tq_lock, flags); + return; + } + + t->tqent_birth = jiffies; + /* + * The priority list must be maintained in strict task id order + * from lowest to highest for lowest_id to be easily calculable. + */ + list_del(&t->tqent_list); + list_for_each_prev(l, &tq->tq_prio_list) { + w = list_entry(l, taskq_ent_t, tqent_list); + if (w->tqent_id < t->tqent_id) { + list_add(&t->tqent_list, l); + break; + } + } + if (l == &tq->tq_prio_list) + list_add(&t->tqent_list, &tq->tq_prio_list); + + spin_unlock_irqrestore(&tq->tq_lock, flags); + + wake_up(&tq->tq_work_waitq); +} + +#ifdef HAVE_KERNEL_TIMER_FUNCTION_TIMER_LIST +static void +task_expire(struct timer_list *tl) +{ + taskq_ent_t *t = from_timer(t, tl, tqent_timer); + task_expire_impl(t); +} +#else +static void +task_expire(unsigned long data) +{ + task_expire_impl((taskq_ent_t *)data); +} +#endif + +/* + * Returns the lowest incomplete taskqid_t. The taskqid_t may + * be queued on the pending list, on the priority list, on the + * delay list, or on the work list currently being handled, but + * it is not 100% complete yet. 
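+ *
+ * Hypothetical example: with pending ids {7, 9}, priority ids {5}, no
+ * delayed tasks, and a worker thread currently running id 4, the list
+ * heads give MIN(7, 5, 4) = 4, so 4 is reported as the lowest
+ * incomplete id until that task completes and the value is recalculated.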
+ */ +static taskqid_t +taskq_lowest_id(taskq_t *tq) +{ + taskqid_t lowest_id = tq->tq_next_id; + taskq_ent_t *t; + taskq_thread_t *tqt; + + ASSERT(tq); + + if (!list_empty(&tq->tq_pend_list)) { + t = list_entry(tq->tq_pend_list.next, taskq_ent_t, tqent_list); + lowest_id = MIN(lowest_id, t->tqent_id); + } + + if (!list_empty(&tq->tq_prio_list)) { + t = list_entry(tq->tq_prio_list.next, taskq_ent_t, tqent_list); + lowest_id = MIN(lowest_id, t->tqent_id); + } + + if (!list_empty(&tq->tq_delay_list)) { + t = list_entry(tq->tq_delay_list.next, taskq_ent_t, tqent_list); + lowest_id = MIN(lowest_id, t->tqent_id); + } + + if (!list_empty(&tq->tq_active_list)) { + tqt = list_entry(tq->tq_active_list.next, taskq_thread_t, + tqt_active_list); + ASSERT(tqt->tqt_id != TASKQID_INVALID); + lowest_id = MIN(lowest_id, tqt->tqt_id); + } + + return (lowest_id); +} + +/* + * Insert a task into a list keeping the list sorted by increasing taskqid. + */ +static void +taskq_insert_in_order(taskq_t *tq, taskq_thread_t *tqt) +{ + taskq_thread_t *w; + struct list_head *l; + + ASSERT(tq); + ASSERT(tqt); + + list_for_each_prev(l, &tq->tq_active_list) { + w = list_entry(l, taskq_thread_t, tqt_active_list); + if (w->tqt_id < tqt->tqt_id) { + list_add(&tqt->tqt_active_list, l); + break; + } + } + if (l == &tq->tq_active_list) + list_add(&tqt->tqt_active_list, &tq->tq_active_list); +} + +/* + * Find and return a task from the given list if it exists. The list + * must be in lowest to highest task id order. + */ +static taskq_ent_t * +taskq_find_list(taskq_t *tq, struct list_head *lh, taskqid_t id) +{ + struct list_head *l; + taskq_ent_t *t; + + list_for_each(l, lh) { + t = list_entry(l, taskq_ent_t, tqent_list); + + if (t->tqent_id == id) + return (t); + + if (t->tqent_id > id) + break; + } + + return (NULL); +} + +/* + * Find an already dispatched task given the task id regardless of what + * state it is in. If a task is still pending it will be returned. + * If a task is executing, then -EBUSY will be returned instead. + * If the task has already been run then NULL is returned. + */ +static taskq_ent_t * +taskq_find(taskq_t *tq, taskqid_t id) +{ + taskq_thread_t *tqt; + struct list_head *l; + taskq_ent_t *t; + + t = taskq_find_list(tq, &tq->tq_delay_list, id); + if (t) + return (t); + + t = taskq_find_list(tq, &tq->tq_prio_list, id); + if (t) + return (t); + + t = taskq_find_list(tq, &tq->tq_pend_list, id); + if (t) + return (t); + + list_for_each(l, &tq->tq_active_list) { + tqt = list_entry(l, taskq_thread_t, tqt_active_list); + if (tqt->tqt_id == id) { + /* + * Instead of returning tqt_task, we just return a non + * NULL value to prevent misuse, since tqt_task only + * has two valid fields. + */ + return (ERR_PTR(-EBUSY)); + } + } + + return (NULL); +} + +/* + * Theory for the taskq_wait_id(), taskq_wait_outstanding(), and + * taskq_wait() functions below. + * + * Taskq waiting is accomplished by tracking the lowest outstanding task + * id and the next available task id. As tasks are dispatched they are + * added to the tail of the pending, priority, or delay lists. As worker + * threads become available the tasks are removed from the heads of these + * lists and linked to the worker threads. This ensures the lists are + * kept sorted by lowest to highest task id. + * + * Therefore the lowest outstanding task id can be quickly determined by + * checking the head item from all of these lists. This value is stored + * with the taskq as the lowest id. 
It only needs to be recalculated when + * either the task with the current lowest id completes or is canceled. + * + * By blocking until the lowest task id exceeds the passed task id the + * taskq_wait_outstanding() function can be easily implemented. Similarly, + * by blocking until the lowest task id matches the next task id taskq_wait() + * can be implemented. + * + * Callers should be aware that when there are multiple worked threads it + * is possible for larger task ids to complete before smaller ones. Also + * when the taskq contains delay tasks with small task ids callers may + * block for a considerable length of time waiting for them to expire and + * execute. + */ +static int +taskq_wait_id_check(taskq_t *tq, taskqid_t id) +{ + int rc; + unsigned long flags; + + spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class); + rc = (taskq_find(tq, id) == NULL); + spin_unlock_irqrestore(&tq->tq_lock, flags); + + return (rc); +} + +/* + * The taskq_wait_id() function blocks until the passed task id completes. + * This does not guarantee that all lower task ids have completed. + */ +void +taskq_wait_id(taskq_t *tq, taskqid_t id) +{ + wait_event(tq->tq_wait_waitq, taskq_wait_id_check(tq, id)); +} +EXPORT_SYMBOL(taskq_wait_id); + +static int +taskq_wait_outstanding_check(taskq_t *tq, taskqid_t id) +{ + int rc; + unsigned long flags; + + spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class); + rc = (id < tq->tq_lowest_id); + spin_unlock_irqrestore(&tq->tq_lock, flags); + + return (rc); +} + +/* + * The taskq_wait_outstanding() function will block until all tasks with a + * lower taskqid than the passed 'id' have been completed. Note that all + * task id's are assigned monotonically at dispatch time. Zero may be + * passed for the id to indicate all tasks dispatch up to this point, + * but not after, should be waited for. + */ +void +taskq_wait_outstanding(taskq_t *tq, taskqid_t id) +{ + id = id ? id : tq->tq_next_id - 1; + wait_event(tq->tq_wait_waitq, taskq_wait_outstanding_check(tq, id)); +} +EXPORT_SYMBOL(taskq_wait_outstanding); + +static int +taskq_wait_check(taskq_t *tq) +{ + int rc; + unsigned long flags; + + spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class); + rc = (tq->tq_lowest_id == tq->tq_next_id); + spin_unlock_irqrestore(&tq->tq_lock, flags); + + return (rc); +} + +/* + * The taskq_wait() function will block until the taskq is empty. + * This means that if a taskq re-dispatches work to itself taskq_wait() + * callers will block indefinitely. + */ +void +taskq_wait(taskq_t *tq) +{ + wait_event(tq->tq_wait_waitq, taskq_wait_check(tq)); +} +EXPORT_SYMBOL(taskq_wait); + +int +taskq_member(taskq_t *tq, kthread_t *t) +{ + return (tq == (taskq_t *)tsd_get_by_thread(taskq_tsd, t)); +} +EXPORT_SYMBOL(taskq_member); + +/* + * Cancel an already dispatched task given the task id. Still pending tasks + * will be immediately canceled, and if the task is active the function will + * block until it completes. Preallocated tasks which are canceled must be + * freed by the caller. + */ +int +taskq_cancel_id(taskq_t *tq, taskqid_t id) +{ + taskq_ent_t *t; + int rc = ENOENT; + unsigned long flags; + + ASSERT(tq); + + spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class); + t = taskq_find(tq, id); + if (t && t != ERR_PTR(-EBUSY)) { + list_del_init(&t->tqent_list); + t->tqent_flags |= TQENT_FLAG_CANCEL; + + /* + * When canceling the lowest outstanding task id we + * must recalculate the new lowest outstanding id. 
+ */ + if (tq->tq_lowest_id == t->tqent_id) { + tq->tq_lowest_id = taskq_lowest_id(tq); + ASSERT3S(tq->tq_lowest_id, >, t->tqent_id); + } + + /* + * The task_expire() function takes the tq->tq_lock so drop + * drop the lock before synchronously cancelling the timer. + */ + if (timer_pending(&t->tqent_timer)) { + spin_unlock_irqrestore(&tq->tq_lock, flags); + del_timer_sync(&t->tqent_timer); + spin_lock_irqsave_nested(&tq->tq_lock, flags, + tq->tq_lock_class); + } + + if (!(t->tqent_flags & TQENT_FLAG_PREALLOC)) + task_done(tq, t); + + rc = 0; + } + spin_unlock_irqrestore(&tq->tq_lock, flags); + + if (t == ERR_PTR(-EBUSY)) { + taskq_wait_id(tq, id); + rc = EBUSY; + } + + return (rc); +} +EXPORT_SYMBOL(taskq_cancel_id); + +static int taskq_thread_spawn(taskq_t *tq); + +taskqid_t +taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t flags) +{ + taskq_ent_t *t; + taskqid_t rc = TASKQID_INVALID; + unsigned long irqflags; + + ASSERT(tq); + ASSERT(func); + + spin_lock_irqsave_nested(&tq->tq_lock, irqflags, tq->tq_lock_class); + + /* Taskq being destroyed and all tasks drained */ + if (!(tq->tq_flags & TASKQ_ACTIVE)) + goto out; + + /* Do not queue the task unless there is idle thread for it */ + ASSERT(tq->tq_nactive <= tq->tq_nthreads); + if ((flags & TQ_NOQUEUE) && (tq->tq_nactive == tq->tq_nthreads)) { + /* Dynamic taskq may be able to spawn another thread */ + if (!(tq->tq_flags & TASKQ_DYNAMIC) || + taskq_thread_spawn(tq) == 0) + goto out; + } + + if ((t = task_alloc(tq, flags, &irqflags)) == NULL) + goto out; + + spin_lock(&t->tqent_lock); + + /* Queue to the front of the list to enforce TQ_NOQUEUE semantics */ + if (flags & TQ_NOQUEUE) + list_add(&t->tqent_list, &tq->tq_prio_list); + /* Queue to the priority list instead of the pending list */ + else if (flags & TQ_FRONT) + list_add_tail(&t->tqent_list, &tq->tq_prio_list); + else + list_add_tail(&t->tqent_list, &tq->tq_pend_list); + + t->tqent_id = rc = tq->tq_next_id; + tq->tq_next_id++; + t->tqent_func = func; + t->tqent_arg = arg; + t->tqent_taskq = tq; +#ifndef HAVE_KERNEL_TIMER_FUNCTION_TIMER_LIST + t->tqent_timer.data = 0; +#endif + t->tqent_timer.function = NULL; + t->tqent_timer.expires = 0; + t->tqent_birth = jiffies; + + ASSERT(!(t->tqent_flags & TQENT_FLAG_PREALLOC)); + + spin_unlock(&t->tqent_lock); + + wake_up(&tq->tq_work_waitq); +out: + /* Spawn additional taskq threads if required. 
*/ + if (!(flags & TQ_NOQUEUE) && tq->tq_nactive == tq->tq_nthreads) + (void) taskq_thread_spawn(tq); + + spin_unlock_irqrestore(&tq->tq_lock, irqflags); + return (rc); +} +EXPORT_SYMBOL(taskq_dispatch); + +taskqid_t +taskq_dispatch_delay(taskq_t *tq, task_func_t func, void *arg, + uint_t flags, clock_t expire_time) +{ + taskqid_t rc = TASKQID_INVALID; + taskq_ent_t *t; + unsigned long irqflags; + + ASSERT(tq); + ASSERT(func); + + spin_lock_irqsave_nested(&tq->tq_lock, irqflags, tq->tq_lock_class); + + /* Taskq being destroyed and all tasks drained */ + if (!(tq->tq_flags & TASKQ_ACTIVE)) + goto out; + + if ((t = task_alloc(tq, flags, &irqflags)) == NULL) + goto out; + + spin_lock(&t->tqent_lock); + + /* Queue to the delay list for subsequent execution */ + list_add_tail(&t->tqent_list, &tq->tq_delay_list); + + t->tqent_id = rc = tq->tq_next_id; + tq->tq_next_id++; + t->tqent_func = func; + t->tqent_arg = arg; + t->tqent_taskq = tq; +#ifndef HAVE_KERNEL_TIMER_FUNCTION_TIMER_LIST + t->tqent_timer.data = (unsigned long)t; +#endif + t->tqent_timer.function = task_expire; + t->tqent_timer.expires = (unsigned long)expire_time; + add_timer(&t->tqent_timer); + + ASSERT(!(t->tqent_flags & TQENT_FLAG_PREALLOC)); + + spin_unlock(&t->tqent_lock); +out: + /* Spawn additional taskq threads if required. */ + if (tq->tq_nactive == tq->tq_nthreads) + (void) taskq_thread_spawn(tq); + spin_unlock_irqrestore(&tq->tq_lock, irqflags); + return (rc); +} +EXPORT_SYMBOL(taskq_dispatch_delay); + +void +taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint_t flags, + taskq_ent_t *t) +{ + unsigned long irqflags; + ASSERT(tq); + ASSERT(func); + + spin_lock_irqsave_nested(&tq->tq_lock, irqflags, + tq->tq_lock_class); + + /* Taskq being destroyed and all tasks drained */ + if (!(tq->tq_flags & TASKQ_ACTIVE)) { + t->tqent_id = TASKQID_INVALID; + goto out; + } + + if ((flags & TQ_NOQUEUE) && (tq->tq_nactive == tq->tq_nthreads)) { + /* Dynamic taskq may be able to spawn another thread */ + if (!(tq->tq_flags & TASKQ_DYNAMIC) || + taskq_thread_spawn(tq) == 0) + goto out2; + flags |= TQ_FRONT; + } + + spin_lock(&t->tqent_lock); + + /* + * Make sure the entry is not on some other taskq; it is important to + * ASSERT() under lock + */ + ASSERT(taskq_empty_ent(t)); + + /* + * Mark it as a prealloc'd task. This is important + * to ensure that we don't free it later. + */ + t->tqent_flags |= TQENT_FLAG_PREALLOC; + + /* Queue to the priority list instead of the pending list */ + if (flags & TQ_FRONT) + list_add_tail(&t->tqent_list, &tq->tq_prio_list); + else + list_add_tail(&t->tqent_list, &tq->tq_pend_list); + + t->tqent_id = tq->tq_next_id; + tq->tq_next_id++; + t->tqent_func = func; + t->tqent_arg = arg; + t->tqent_taskq = tq; + t->tqent_birth = jiffies; + + spin_unlock(&t->tqent_lock); + + wake_up(&tq->tq_work_waitq); +out: + /* Spawn additional taskq threads if required. 
*/ + if (tq->tq_nactive == tq->tq_nthreads) + (void) taskq_thread_spawn(tq); +out2: + spin_unlock_irqrestore(&tq->tq_lock, irqflags); +} +EXPORT_SYMBOL(taskq_dispatch_ent); + +int +taskq_empty_ent(taskq_ent_t *t) +{ + return (list_empty(&t->tqent_list)); +} +EXPORT_SYMBOL(taskq_empty_ent); + +void +taskq_init_ent(taskq_ent_t *t) +{ + spin_lock_init(&t->tqent_lock); + init_waitqueue_head(&t->tqent_waitq); +#ifdef HAVE_KERNEL_TIMER_FUNCTION_TIMER_LIST + timer_setup(&t->tqent_timer, NULL, 0); +#else + init_timer(&t->tqent_timer); +#endif + INIT_LIST_HEAD(&t->tqent_list); + t->tqent_id = 0; + t->tqent_func = NULL; + t->tqent_arg = NULL; + t->tqent_flags = 0; + t->tqent_taskq = NULL; +} +EXPORT_SYMBOL(taskq_init_ent); + +/* + * Return the next pending task, preference is given to tasks on the + * priority list which were dispatched with TQ_FRONT. + */ +static taskq_ent_t * +taskq_next_ent(taskq_t *tq) +{ + struct list_head *list; + + if (!list_empty(&tq->tq_prio_list)) + list = &tq->tq_prio_list; + else if (!list_empty(&tq->tq_pend_list)) + list = &tq->tq_pend_list; + else + return (NULL); + + return (list_entry(list->next, taskq_ent_t, tqent_list)); +} + +/* + * Spawns a new thread for the specified taskq. + */ +static void +taskq_thread_spawn_task(void *arg) +{ + taskq_t *tq = (taskq_t *)arg; + unsigned long flags; + + if (taskq_thread_create(tq) == NULL) { + /* restore spawning count if failed */ + spin_lock_irqsave_nested(&tq->tq_lock, flags, + tq->tq_lock_class); + tq->tq_nspawn--; + spin_unlock_irqrestore(&tq->tq_lock, flags); + } +} + +/* + * Spawn addition threads for dynamic taskqs (TASKQ_DYNAMIC) the current + * number of threads is insufficient to handle the pending tasks. These + * new threads must be created by the dedicated dynamic_taskq to avoid + * deadlocks between thread creation and memory reclaim. The system_taskq + * which is also a dynamic taskq cannot be safely used for this. + */ +static int +taskq_thread_spawn(taskq_t *tq) +{ + int spawning = 0; + + if (!(tq->tq_flags & TASKQ_DYNAMIC)) + return (0); + + if ((tq->tq_nthreads + tq->tq_nspawn < tq->tq_maxthreads) && + (tq->tq_flags & TASKQ_ACTIVE)) { + spawning = (++tq->tq_nspawn); + taskq_dispatch(dynamic_taskq, taskq_thread_spawn_task, + tq, TQ_NOSLEEP); + } + + return (spawning); +} + +/* + * Threads in a dynamic taskq should only exit once it has been completely + * drained and no other threads are actively servicing tasks. This prevents + * threads from being created and destroyed more than is required. + * + * The first thread is the thread list is treated as the primary thread. + * There is nothing special about the primary thread but in order to avoid + * all the taskq pids from changing we opt to make it long running. 
+ */ +static int +taskq_thread_should_stop(taskq_t *tq, taskq_thread_t *tqt) +{ + if (!(tq->tq_flags & TASKQ_DYNAMIC)) + return (0); + + if (list_first_entry(&(tq->tq_thread_list), taskq_thread_t, + tqt_thread_list) == tqt) + return (0); + + return + ((tq->tq_nspawn == 0) && /* No threads are being spawned */ + (tq->tq_nactive == 0) && /* No threads are handling tasks */ + (tq->tq_nthreads > 1) && /* More than 1 thread is running */ + (!taskq_next_ent(tq)) && /* There are no pending tasks */ + (spl_taskq_thread_dynamic)); /* Dynamic taskqs are allowed */ +} + +static int +taskq_thread(void *args) +{ + DECLARE_WAITQUEUE(wait, current); + sigset_t blocked; + taskq_thread_t *tqt = args; + taskq_t *tq; + taskq_ent_t *t; + int seq_tasks = 0; + unsigned long flags; + taskq_ent_t dup_task = {}; + + ASSERT(tqt); + ASSERT(tqt->tqt_tq); + tq = tqt->tqt_tq; + current->flags |= PF_NOFREEZE; + + (void) spl_fstrans_mark(); + + sigfillset(&blocked); + sigprocmask(SIG_BLOCK, &blocked, NULL); + flush_signals(current); + + tsd_set(taskq_tsd, tq); + spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class); + /* + * If we are dynamically spawned, decrease spawning count. Note that + * we could be created during taskq_create, in which case we shouldn't + * do the decrement. But it's fine because taskq_create will reset + * tq_nspawn later. + */ + if (tq->tq_flags & TASKQ_DYNAMIC) + tq->tq_nspawn--; + + /* Immediately exit if more threads than allowed were created. */ + if (tq->tq_nthreads >= tq->tq_maxthreads) + goto error; + + tq->tq_nthreads++; + list_add_tail(&tqt->tqt_thread_list, &tq->tq_thread_list); + wake_up(&tq->tq_wait_waitq); + set_current_state(TASK_INTERRUPTIBLE); + + while (!kthread_should_stop()) { + + if (list_empty(&tq->tq_pend_list) && + list_empty(&tq->tq_prio_list)) { + + if (taskq_thread_should_stop(tq, tqt)) { + wake_up_all(&tq->tq_wait_waitq); + break; + } + + add_wait_queue_exclusive(&tq->tq_work_waitq, &wait); + spin_unlock_irqrestore(&tq->tq_lock, flags); + + schedule(); + seq_tasks = 0; + + spin_lock_irqsave_nested(&tq->tq_lock, flags, + tq->tq_lock_class); + remove_wait_queue(&tq->tq_work_waitq, &wait); + } else { + __set_current_state(TASK_RUNNING); + } + + if ((t = taskq_next_ent(tq)) != NULL) { + list_del_init(&t->tqent_list); + + /* + * A TQENT_FLAG_PREALLOC task may be reused or freed + * during the task function call. Store tqent_id and + * tqent_flags here. + * + * Also use an on stack taskq_ent_t for tqt_task + * assignment in this case. We only populate the two + * fields used by the only user in taskq proc file. + */ + tqt->tqt_id = t->tqent_id; + tqt->tqt_flags = t->tqent_flags; + + if (t->tqent_flags & TQENT_FLAG_PREALLOC) { + dup_task.tqent_func = t->tqent_func; + dup_task.tqent_arg = t->tqent_arg; + t = &dup_task; + } + tqt->tqt_task = t; + + taskq_insert_in_order(tq, tqt); + tq->tq_nactive++; + spin_unlock_irqrestore(&tq->tq_lock, flags); + + /* Perform the requested task */ + t->tqent_func(t->tqent_arg); + + spin_lock_irqsave_nested(&tq->tq_lock, flags, + tq->tq_lock_class); + tq->tq_nactive--; + list_del_init(&tqt->tqt_active_list); + tqt->tqt_task = NULL; + + /* For prealloc'd tasks, we don't free anything. 
*/ + if (!(tqt->tqt_flags & TQENT_FLAG_PREALLOC)) + task_done(tq, t); + + /* + * When the current lowest outstanding taskqid is + * done calculate the new lowest outstanding id + */ + if (tq->tq_lowest_id == tqt->tqt_id) { + tq->tq_lowest_id = taskq_lowest_id(tq); + ASSERT3S(tq->tq_lowest_id, >, tqt->tqt_id); + } + + /* Spawn additional taskq threads if required. */ + if ((++seq_tasks) > spl_taskq_thread_sequential && + taskq_thread_spawn(tq)) + seq_tasks = 0; + + tqt->tqt_id = TASKQID_INVALID; + tqt->tqt_flags = 0; + wake_up_all(&tq->tq_wait_waitq); + } else { + if (taskq_thread_should_stop(tq, tqt)) + break; + } + + set_current_state(TASK_INTERRUPTIBLE); + + } + + __set_current_state(TASK_RUNNING); + tq->tq_nthreads--; + list_del_init(&tqt->tqt_thread_list); +error: + kmem_free(tqt, sizeof (taskq_thread_t)); + spin_unlock_irqrestore(&tq->tq_lock, flags); + + tsd_set(taskq_tsd, NULL); + + return (0); +} + +static taskq_thread_t * +taskq_thread_create(taskq_t *tq) +{ + static int last_used_cpu = 0; + taskq_thread_t *tqt; + + tqt = kmem_alloc(sizeof (*tqt), KM_PUSHPAGE); + INIT_LIST_HEAD(&tqt->tqt_thread_list); + INIT_LIST_HEAD(&tqt->tqt_active_list); + tqt->tqt_tq = tq; + tqt->tqt_id = TASKQID_INVALID; + + tqt->tqt_thread = spl_kthread_create(taskq_thread, tqt, + "%s", tq->tq_name); + if (tqt->tqt_thread == NULL) { + kmem_free(tqt, sizeof (taskq_thread_t)); + return (NULL); + } + + if (spl_taskq_thread_bind) { + last_used_cpu = (last_used_cpu + 1) % num_online_cpus(); + kthread_bind(tqt->tqt_thread, last_used_cpu); + } + + if (spl_taskq_thread_priority) + set_user_nice(tqt->tqt_thread, PRIO_TO_NICE(tq->tq_pri)); + + wake_up_process(tqt->tqt_thread); + + return (tqt); +} + +taskq_t * +taskq_create(const char *name, int nthreads, pri_t pri, + int minalloc, int maxalloc, uint_t flags) +{ + taskq_t *tq; + taskq_thread_t *tqt; + int count = 0, rc = 0, i; + unsigned long irqflags; + + ASSERT(name != NULL); + ASSERT(minalloc >= 0); + ASSERT(maxalloc <= INT_MAX); + ASSERT(!(flags & (TASKQ_CPR_SAFE))); /* Unsupported */ + + /* Scale the number of threads using nthreads as a percentage */ + if (flags & TASKQ_THREADS_CPU_PCT) { + ASSERT(nthreads <= 100); + ASSERT(nthreads >= 0); + nthreads = MIN(nthreads, 100); + nthreads = MAX(nthreads, 0); + nthreads = MAX((num_online_cpus() * nthreads) / 100, 1); + } + + tq = kmem_alloc(sizeof (*tq), KM_PUSHPAGE); + if (tq == NULL) + return (NULL); + + spin_lock_init(&tq->tq_lock); + INIT_LIST_HEAD(&tq->tq_thread_list); + INIT_LIST_HEAD(&tq->tq_active_list); + tq->tq_name = strdup(name); + tq->tq_nactive = 0; + tq->tq_nthreads = 0; + tq->tq_nspawn = 0; + tq->tq_maxthreads = nthreads; + tq->tq_pri = pri; + tq->tq_minalloc = minalloc; + tq->tq_maxalloc = maxalloc; + tq->tq_nalloc = 0; + tq->tq_flags = (flags | TASKQ_ACTIVE); + tq->tq_next_id = TASKQID_INITIAL; + tq->tq_lowest_id = TASKQID_INITIAL; + INIT_LIST_HEAD(&tq->tq_free_list); + INIT_LIST_HEAD(&tq->tq_pend_list); + INIT_LIST_HEAD(&tq->tq_prio_list); + INIT_LIST_HEAD(&tq->tq_delay_list); + init_waitqueue_head(&tq->tq_work_waitq); + init_waitqueue_head(&tq->tq_wait_waitq); + tq->tq_lock_class = TQ_LOCK_GENERAL; + INIT_LIST_HEAD(&tq->tq_taskqs); + + if (flags & TASKQ_PREPOPULATE) { + spin_lock_irqsave_nested(&tq->tq_lock, irqflags, + tq->tq_lock_class); + + for (i = 0; i < minalloc; i++) + task_done(tq, task_alloc(tq, TQ_PUSHPAGE | TQ_NEW, + &irqflags)); + + spin_unlock_irqrestore(&tq->tq_lock, irqflags); + } + + if ((flags & TASKQ_DYNAMIC) && spl_taskq_thread_dynamic) + nthreads = 1; + + for (i = 0; i < 
nthreads; i++) { + tqt = taskq_thread_create(tq); + if (tqt == NULL) + rc = 1; + else + count++; + } + + /* Wait for all threads to be started before potential destroy */ + wait_event(tq->tq_wait_waitq, tq->tq_nthreads == count); + /* + * taskq_thread might have touched nspawn, but we don't want them to + * because they're not dynamically spawned. So we reset it to 0 + */ + tq->tq_nspawn = 0; + + if (rc) { + taskq_destroy(tq); + tq = NULL; + } else { + down_write(&tq_list_sem); + tq->tq_instance = taskq_find_by_name(name) + 1; + list_add_tail(&tq->tq_taskqs, &tq_list); + up_write(&tq_list_sem); + } + + return (tq); +} +EXPORT_SYMBOL(taskq_create); + +void +taskq_destroy(taskq_t *tq) +{ + struct task_struct *thread; + taskq_thread_t *tqt; + taskq_ent_t *t; + unsigned long flags; + + ASSERT(tq); + spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class); + tq->tq_flags &= ~TASKQ_ACTIVE; + spin_unlock_irqrestore(&tq->tq_lock, flags); + + /* + * When TASKQ_ACTIVE is clear new tasks may not be added nor may + * new worker threads be spawned for dynamic taskq. + */ + if (dynamic_taskq != NULL) + taskq_wait_outstanding(dynamic_taskq, 0); + + taskq_wait(tq); + + /* remove taskq from global list used by the kstats */ + down_write(&tq_list_sem); + list_del(&tq->tq_taskqs); + up_write(&tq_list_sem); + + spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class); + /* wait for spawning threads to insert themselves to the list */ + while (tq->tq_nspawn) { + spin_unlock_irqrestore(&tq->tq_lock, flags); + schedule_timeout_interruptible(1); + spin_lock_irqsave_nested(&tq->tq_lock, flags, + tq->tq_lock_class); + } + + /* + * Signal each thread to exit and block until it does. Each thread + * is responsible for removing itself from the list and freeing its + * taskq_thread_t. This allows for idle threads to opt to remove + * themselves from the taskq. They can be recreated as needed. + */ + while (!list_empty(&tq->tq_thread_list)) { + tqt = list_entry(tq->tq_thread_list.next, + taskq_thread_t, tqt_thread_list); + thread = tqt->tqt_thread; + spin_unlock_irqrestore(&tq->tq_lock, flags); + + kthread_stop(thread); + + spin_lock_irqsave_nested(&tq->tq_lock, flags, + tq->tq_lock_class); + } + + while (!list_empty(&tq->tq_free_list)) { + t = list_entry(tq->tq_free_list.next, taskq_ent_t, tqent_list); + + ASSERT(!(t->tqent_flags & TQENT_FLAG_PREALLOC)); + + list_del_init(&t->tqent_list); + task_free(tq, t); + } + + ASSERT0(tq->tq_nthreads); + ASSERT0(tq->tq_nalloc); + ASSERT0(tq->tq_nspawn); + ASSERT(list_empty(&tq->tq_thread_list)); + ASSERT(list_empty(&tq->tq_active_list)); + ASSERT(list_empty(&tq->tq_free_list)); + ASSERT(list_empty(&tq->tq_pend_list)); + ASSERT(list_empty(&tq->tq_prio_list)); + ASSERT(list_empty(&tq->tq_delay_list)); + + spin_unlock_irqrestore(&tq->tq_lock, flags); + + strfree(tq->tq_name); + kmem_free(tq, sizeof (taskq_t)); +} +EXPORT_SYMBOL(taskq_destroy); + + +static unsigned int spl_taskq_kick = 0; + +/* + * 2.6.36 API Change + * module_param_cb is introduced to take kernel_param_ops and + * module_param_call is marked as obsolete. Also set and get operations + * were changed to take a 'const struct kernel_param *'. 
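+ *
+ * In either case the parameter is registered read/write (0644), so a
+ * stuck taskq can typically be kicked from user space along the lines of
+ * (path follows the usual module parameter convention):
+ *
+ *   echo 1 > /sys/module/spl/parameters/spl_taskq_kick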
+ */ +static int +#ifdef module_param_cb +param_set_taskq_kick(const char *val, const struct kernel_param *kp) +#else +param_set_taskq_kick(const char *val, struct kernel_param *kp) +#endif +{ + int ret; + taskq_t *tq; + taskq_ent_t *t; + unsigned long flags; + + ret = param_set_uint(val, kp); + if (ret < 0 || !spl_taskq_kick) + return (ret); + /* reset value */ + spl_taskq_kick = 0; + + down_read(&tq_list_sem); + list_for_each_entry(tq, &tq_list, tq_taskqs) { + spin_lock_irqsave_nested(&tq->tq_lock, flags, + tq->tq_lock_class); + /* Check if the first pending is older than 5 seconds */ + t = taskq_next_ent(tq); + if (t && time_after(jiffies, t->tqent_birth + 5*HZ)) { + (void) taskq_thread_spawn(tq); + printk(KERN_INFO "spl: Kicked taskq %s/%d\n", + tq->tq_name, tq->tq_instance); + } + spin_unlock_irqrestore(&tq->tq_lock, flags); + } + up_read(&tq_list_sem); + return (ret); +} + +#ifdef module_param_cb +static const struct kernel_param_ops param_ops_taskq_kick = { + .set = param_set_taskq_kick, + .get = param_get_uint, +}; +module_param_cb(spl_taskq_kick, ¶m_ops_taskq_kick, &spl_taskq_kick, 0644); +#else +module_param_call(spl_taskq_kick, param_set_taskq_kick, param_get_uint, + &spl_taskq_kick, 0644); +#endif +MODULE_PARM_DESC(spl_taskq_kick, + "Write nonzero to kick stuck taskqs to spawn more threads"); + +int +spl_taskq_init(void) +{ + tsd_create(&taskq_tsd, NULL); + + system_taskq = taskq_create("spl_system_taskq", MAX(boot_ncpus, 64), + maxclsyspri, boot_ncpus, INT_MAX, TASKQ_PREPOPULATE|TASKQ_DYNAMIC); + if (system_taskq == NULL) + return (1); + + system_delay_taskq = taskq_create("spl_delay_taskq", MAX(boot_ncpus, 4), + maxclsyspri, boot_ncpus, INT_MAX, TASKQ_PREPOPULATE|TASKQ_DYNAMIC); + if (system_delay_taskq == NULL) { + taskq_destroy(system_taskq); + return (1); + } + + dynamic_taskq = taskq_create("spl_dynamic_taskq", 1, + maxclsyspri, boot_ncpus, INT_MAX, TASKQ_PREPOPULATE); + if (dynamic_taskq == NULL) { + taskq_destroy(system_taskq); + taskq_destroy(system_delay_taskq); + return (1); + } + + /* + * This is used to annotate tq_lock, so + * taskq_dispatch -> taskq_thread_spawn -> taskq_dispatch + * does not trigger a lockdep warning re: possible recursive locking + */ + dynamic_taskq->tq_lock_class = TQ_LOCK_DYNAMIC; + + return (0); +} + +void +spl_taskq_fini(void) +{ + taskq_destroy(dynamic_taskq); + dynamic_taskq = NULL; + + taskq_destroy(system_delay_taskq); + system_delay_taskq = NULL; + + taskq_destroy(system_taskq); + system_taskq = NULL; + + tsd_destroy(&taskq_tsd); +} diff --git a/module/spl/spl-thread.c b/module/spl/spl-thread.c new file mode 100644 index 000000000..d441ad65f --- /dev/null +++ b/module/spl/spl-thread.c @@ -0,0 +1,160 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf <[email protected]>. + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see <http://zfsonlinux.org/>. + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see <http://www.gnu.org/licenses/>. + * + * Solaris Porting Layer (SPL) Thread Implementation. + */ + +#include <sys/thread.h> +#include <sys/kmem.h> +#include <sys/tsd.h> + +/* + * Thread interfaces + */ +typedef struct thread_priv_s { + unsigned long tp_magic; /* Magic */ + int tp_name_size; /* Name size */ + char *tp_name; /* Name (without _thread suffix) */ + void (*tp_func)(void *); /* Registered function */ + void *tp_args; /* Args to be passed to function */ + size_t tp_len; /* Len to be passed to function */ + int tp_state; /* State to start thread at */ + pri_t tp_pri; /* Priority to start threat at */ +} thread_priv_t; + +static int +thread_generic_wrapper(void *arg) +{ + thread_priv_t *tp = (thread_priv_t *)arg; + void (*func)(void *); + void *args; + + ASSERT(tp->tp_magic == TP_MAGIC); + func = tp->tp_func; + args = tp->tp_args; + set_current_state(tp->tp_state); + set_user_nice((kthread_t *)current, PRIO_TO_NICE(tp->tp_pri)); + kmem_free(tp->tp_name, tp->tp_name_size); + kmem_free(tp, sizeof (thread_priv_t)); + + if (func) + func(args); + + return (0); +} + +void +__thread_exit(void) +{ + tsd_exit(); + complete_and_exit(NULL, 0); + /* Unreachable */ +} +EXPORT_SYMBOL(__thread_exit); + +/* + * thread_create() may block forever if it cannot create a thread or + * allocate memory. This is preferable to returning a NULL which Solaris + * style callers likely never check for... since it can't fail. + */ +kthread_t * +__thread_create(caddr_t stk, size_t stksize, thread_func_t func, + const char *name, void *args, size_t len, proc_t *pp, int state, pri_t pri) +{ + thread_priv_t *tp; + struct task_struct *tsk; + char *p; + + /* Option pp is simply ignored */ + /* Variable stack size unsupported */ + ASSERT(stk == NULL); + + tp = kmem_alloc(sizeof (thread_priv_t), KM_PUSHPAGE); + if (tp == NULL) + return (NULL); + + tp->tp_magic = TP_MAGIC; + tp->tp_name_size = strlen(name) + 1; + + tp->tp_name = kmem_alloc(tp->tp_name_size, KM_PUSHPAGE); + if (tp->tp_name == NULL) { + kmem_free(tp, sizeof (thread_priv_t)); + return (NULL); + } + + strncpy(tp->tp_name, name, tp->tp_name_size); + + /* + * Strip trailing "_thread" from passed name which will be the func + * name since the exposed API has no parameter for passing a name. + */ + p = strstr(tp->tp_name, "_thread"); + if (p) + p[0] = '\0'; + + tp->tp_func = func; + tp->tp_args = args; + tp->tp_len = len; + tp->tp_state = state; + tp->tp_pri = pri; + + tsk = spl_kthread_create(thread_generic_wrapper, (void *)tp, + "%s", tp->tp_name); + if (IS_ERR(tsk)) + return (NULL); + + wake_up_process(tsk); + return ((kthread_t *)tsk); +} +EXPORT_SYMBOL(__thread_create); + +/* + * spl_kthread_create - Wrapper providing pre-3.13 semantics for + * kthread_create() in which it is not killable and less likely + * to return -ENOMEM. + */ +struct task_struct * +spl_kthread_create(int (*func)(void *), void *data, const char namefmt[], ...) 
+{
+	struct task_struct *tsk;
+	va_list args;
+	char name[TASK_COMM_LEN];
+
+	va_start(args, namefmt);
+	vsnprintf(name, sizeof (name), namefmt, args);
+	va_end(args);
+	do {
+		tsk = kthread_create(func, data, "%s", name);
+		if (IS_ERR(tsk)) {
+			if (signal_pending(current)) {
+				clear_thread_flag(TIF_SIGPENDING);
+				continue;
+			}
+			if (PTR_ERR(tsk) == -ENOMEM)
+				continue;
+			return (NULL);
+		} else
+			return (tsk);
+	} while (1);
+}
+EXPORT_SYMBOL(spl_kthread_create);
diff --git a/module/spl/spl-tsd.c b/module/spl/spl-tsd.c
new file mode 100644
index 000000000..4c800292a
--- /dev/null
+++ b/module/spl/spl-tsd.c
@@ -0,0 +1,720 @@
+/*
+ * Copyright (C) 2010 Lawrence Livermore National Security, LLC.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <[email protected]>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ *
+ *
+ * Solaris Porting Layer (SPL) Thread Specific Data Implementation.
+ *
+ * Thread specific data has been implemented using a hash table, which
+ * avoids the need to add a member to the task structure and allows maximum
+ * portability between kernels. This implementation has been optimized
+ * to keep the tsd_set() and tsd_get() times as small as possible.
+ *
+ * The majority of the entries in the hash table are for specific tsd
+ * entries. These entries are hashed by the product of their key and
+ * pid because by design the key and pid are guaranteed to be unique.
+ * Their product also has the desirable property that it will be uniformly
+ * distributed over the hash bins provided neither the pid nor the key is
+ * zero. Under Linux the zero pid is always the init process and thus won't
+ * be used, and this implementation is careful never to assign a zero key.
+ * By default the hash table is sized to 512 bins which is expected to
+ * be sufficient for light to moderate usage of thread specific data.
+ *
+ * The hash table contains two additional types of entries. The first
+ * type is called a 'key' entry and it is added to the hash during
+ * tsd_create(). It is used to store the address of the destructor function
+ * and it is used as an anchor point. All tsd entries which use the same
+ * key will be linked to this entry. This is used during tsd_destroy() to
+ * quickly call the destructor function for all tsd associated with the key.
+ * The 'key' entry may be looked up with tsd_hash_search() by passing the
+ * key you wish to look up and the DTOR_PID constant as the pid.
+ *
+ * The second type of entry is called a 'pid' entry and it is added to the
+ * hash the first time a process sets a key. The 'pid' entry is also used
+ * as an anchor and all tsd for the process will be linked to it. This
+ * list is used during tsd_exit() to ensure all registered destructors
+ * are run for the process.
The 'pid' entry may be looked up with + * tsd_hash_search() by passing the PID_KEY constant as the key, and + * the process pid. Note that tsd_exit() is called by thread_exit() + * so if your using the Solaris thread API you should not need to call + * tsd_exit() directly. + * + */ + +#include <sys/kmem.h> +#include <sys/thread.h> +#include <sys/tsd.h> +#include <linux/hash.h> + +typedef struct tsd_hash_bin { + spinlock_t hb_lock; + struct hlist_head hb_head; +} tsd_hash_bin_t; + +typedef struct tsd_hash_table { + spinlock_t ht_lock; + uint_t ht_bits; + uint_t ht_key; + tsd_hash_bin_t *ht_bins; +} tsd_hash_table_t; + +typedef struct tsd_hash_entry { + uint_t he_key; + pid_t he_pid; + dtor_func_t he_dtor; + void *he_value; + struct hlist_node he_list; + struct list_head he_key_list; + struct list_head he_pid_list; +} tsd_hash_entry_t; + +static tsd_hash_table_t *tsd_hash_table = NULL; + + +/* + * tsd_hash_search - searches hash table for tsd_hash_entry + * @table: hash table + * @key: search key + * @pid: search pid + */ +static tsd_hash_entry_t * +tsd_hash_search(tsd_hash_table_t *table, uint_t key, pid_t pid) +{ + struct hlist_node *node; + tsd_hash_entry_t *entry; + tsd_hash_bin_t *bin; + ulong_t hash; + + hash = hash_long((ulong_t)key * (ulong_t)pid, table->ht_bits); + bin = &table->ht_bins[hash]; + spin_lock(&bin->hb_lock); + hlist_for_each(node, &bin->hb_head) { + entry = list_entry(node, tsd_hash_entry_t, he_list); + if ((entry->he_key == key) && (entry->he_pid == pid)) { + spin_unlock(&bin->hb_lock); + return (entry); + } + } + + spin_unlock(&bin->hb_lock); + return (NULL); +} + +/* + * tsd_hash_dtor - call the destructor and free all entries on the list + * @work: list of hash entries + * + * For a list of entries which have all already been removed from the + * hash call their registered destructor then free the associated memory. + */ +static void +tsd_hash_dtor(struct hlist_head *work) +{ + tsd_hash_entry_t *entry; + + while (!hlist_empty(work)) { + entry = hlist_entry(work->first, tsd_hash_entry_t, he_list); + hlist_del(&entry->he_list); + + if (entry->he_dtor && entry->he_pid != DTOR_PID) + entry->he_dtor(entry->he_value); + + kmem_free(entry, sizeof (tsd_hash_entry_t)); + } +} + +/* + * tsd_hash_add - adds an entry to hash table + * @table: hash table + * @key: search key + * @pid: search pid + * + * The caller is responsible for ensuring the unique key/pid do not + * already exist in the hash table. This possible because all entries + * are thread specific thus a concurrent thread will never attempt to + * add this key/pid. Because multiple bins must be checked to add + * links to the dtor and pid entries the entire table is locked. 
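+ * Holding the table lock across the whole operation also guarantees the
+ * two anchor entries cannot disappear between being looked up and being
+ * linked to.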
+ */ +static int +tsd_hash_add(tsd_hash_table_t *table, uint_t key, pid_t pid, void *value) +{ + tsd_hash_entry_t *entry, *dtor_entry, *pid_entry; + tsd_hash_bin_t *bin; + ulong_t hash; + int rc = 0; + + ASSERT3P(tsd_hash_search(table, key, pid), ==, NULL); + + /* New entry allocate structure, set value, and add to hash */ + entry = kmem_alloc(sizeof (tsd_hash_entry_t), KM_PUSHPAGE); + if (entry == NULL) + return (ENOMEM); + + entry->he_key = key; + entry->he_pid = pid; + entry->he_value = value; + INIT_HLIST_NODE(&entry->he_list); + INIT_LIST_HEAD(&entry->he_key_list); + INIT_LIST_HEAD(&entry->he_pid_list); + + spin_lock(&table->ht_lock); + + /* Destructor entry must exist for all valid keys */ + dtor_entry = tsd_hash_search(table, entry->he_key, DTOR_PID); + ASSERT3P(dtor_entry, !=, NULL); + entry->he_dtor = dtor_entry->he_dtor; + + /* Process entry must exist for all valid processes */ + pid_entry = tsd_hash_search(table, PID_KEY, entry->he_pid); + ASSERT3P(pid_entry, !=, NULL); + + hash = hash_long((ulong_t)key * (ulong_t)pid, table->ht_bits); + bin = &table->ht_bins[hash]; + spin_lock(&bin->hb_lock); + + /* Add to the hash, key, and pid lists */ + hlist_add_head(&entry->he_list, &bin->hb_head); + list_add(&entry->he_key_list, &dtor_entry->he_key_list); + list_add(&entry->he_pid_list, &pid_entry->he_pid_list); + + spin_unlock(&bin->hb_lock); + spin_unlock(&table->ht_lock); + + return (rc); +} + +/* + * tsd_hash_add_key - adds a destructor entry to the hash table + * @table: hash table + * @keyp: search key + * @dtor: key destructor + * + * For every unique key there is a single entry in the hash which is used + * as anchor. All other thread specific entries for this key are linked + * to this anchor via the 'he_key_list' list head. On return they keyp + * will be set to the next available key for the hash table. + */ +static int +tsd_hash_add_key(tsd_hash_table_t *table, uint_t *keyp, dtor_func_t dtor) +{ + tsd_hash_entry_t *tmp_entry, *entry; + tsd_hash_bin_t *bin; + ulong_t hash; + int keys_checked = 0; + + ASSERT3P(table, !=, NULL); + + /* Allocate entry to be used as a destructor for this key */ + entry = kmem_alloc(sizeof (tsd_hash_entry_t), KM_PUSHPAGE); + if (entry == NULL) + return (ENOMEM); + + /* Determine next available key value */ + spin_lock(&table->ht_lock); + do { + /* Limited to TSD_KEYS_MAX concurrent unique keys */ + if (table->ht_key++ > TSD_KEYS_MAX) + table->ht_key = 1; + + /* Ensure failure when all TSD_KEYS_MAX keys are in use */ + if (keys_checked++ >= TSD_KEYS_MAX) { + spin_unlock(&table->ht_lock); + return (ENOENT); + } + + tmp_entry = tsd_hash_search(table, table->ht_key, DTOR_PID); + } while (tmp_entry); + + /* Add destructor entry in to hash table */ + entry->he_key = *keyp = table->ht_key; + entry->he_pid = DTOR_PID; + entry->he_dtor = dtor; + entry->he_value = NULL; + INIT_HLIST_NODE(&entry->he_list); + INIT_LIST_HEAD(&entry->he_key_list); + INIT_LIST_HEAD(&entry->he_pid_list); + + hash = hash_long((ulong_t)*keyp * (ulong_t)DTOR_PID, table->ht_bits); + bin = &table->ht_bins[hash]; + spin_lock(&bin->hb_lock); + + hlist_add_head(&entry->he_list, &bin->hb_head); + + spin_unlock(&bin->hb_lock); + spin_unlock(&table->ht_lock); + + return (0); +} + +/* + * tsd_hash_add_pid - adds a process entry to the hash table + * @table: hash table + * @pid: search pid + * + * For every process these is a single entry in the hash which is used + * as anchor. All other thread specific entries for this process are + * linked to this anchor via the 'he_pid_list' list head. 
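+ * The entry is created lazily by tsd_set() the first time a process
+ * stores a value, and is removed again by tsd_exit(), or by tsd_set()
+ * when the last value for the process is cleared.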
+ */ +static int +tsd_hash_add_pid(tsd_hash_table_t *table, pid_t pid) +{ + tsd_hash_entry_t *entry; + tsd_hash_bin_t *bin; + ulong_t hash; + + /* Allocate entry to be used as the process reference */ + entry = kmem_alloc(sizeof (tsd_hash_entry_t), KM_PUSHPAGE); + if (entry == NULL) + return (ENOMEM); + + spin_lock(&table->ht_lock); + entry->he_key = PID_KEY; + entry->he_pid = pid; + entry->he_dtor = NULL; + entry->he_value = NULL; + INIT_HLIST_NODE(&entry->he_list); + INIT_LIST_HEAD(&entry->he_key_list); + INIT_LIST_HEAD(&entry->he_pid_list); + + hash = hash_long((ulong_t)PID_KEY * (ulong_t)pid, table->ht_bits); + bin = &table->ht_bins[hash]; + spin_lock(&bin->hb_lock); + + hlist_add_head(&entry->he_list, &bin->hb_head); + + spin_unlock(&bin->hb_lock); + spin_unlock(&table->ht_lock); + + return (0); +} + +/* + * tsd_hash_del - delete an entry from hash table, key, and pid lists + * @table: hash table + * @key: search key + * @pid: search pid + */ +static void +tsd_hash_del(tsd_hash_table_t *table, tsd_hash_entry_t *entry) +{ + hlist_del(&entry->he_list); + list_del_init(&entry->he_key_list); + list_del_init(&entry->he_pid_list); +} + +/* + * tsd_hash_table_init - allocate a hash table + * @bits: hash table size + * + * A hash table with 2^bits bins will be created, it may not be resized + * after the fact and must be free'd with tsd_hash_table_fini(). + */ +static tsd_hash_table_t * +tsd_hash_table_init(uint_t bits) +{ + tsd_hash_table_t *table; + int hash, size = (1 << bits); + + table = kmem_zalloc(sizeof (tsd_hash_table_t), KM_SLEEP); + if (table == NULL) + return (NULL); + + table->ht_bins = kmem_zalloc(sizeof (tsd_hash_bin_t) * size, KM_SLEEP); + if (table->ht_bins == NULL) { + kmem_free(table, sizeof (tsd_hash_table_t)); + return (NULL); + } + + for (hash = 0; hash < size; hash++) { + spin_lock_init(&table->ht_bins[hash].hb_lock); + INIT_HLIST_HEAD(&table->ht_bins[hash].hb_head); + } + + spin_lock_init(&table->ht_lock); + table->ht_bits = bits; + table->ht_key = 1; + + return (table); +} + +/* + * tsd_hash_table_fini - free a hash table + * @table: hash table + * + * Free a hash table allocated by tsd_hash_table_init(). If the hash + * table is not empty this function will call the proper destructor for + * all remaining entries before freeing the memory used by those entries. + */ +static void +tsd_hash_table_fini(tsd_hash_table_t *table) +{ + HLIST_HEAD(work); + tsd_hash_bin_t *bin; + tsd_hash_entry_t *entry; + int size, i; + + ASSERT3P(table, !=, NULL); + spin_lock(&table->ht_lock); + for (i = 0, size = (1 << table->ht_bits); i < size; i++) { + bin = &table->ht_bins[i]; + spin_lock(&bin->hb_lock); + while (!hlist_empty(&bin->hb_head)) { + entry = hlist_entry(bin->hb_head.first, + tsd_hash_entry_t, he_list); + tsd_hash_del(table, entry); + hlist_add_head(&entry->he_list, &work); + } + spin_unlock(&bin->hb_lock); + } + spin_unlock(&table->ht_lock); + + tsd_hash_dtor(&work); + kmem_free(table->ht_bins, sizeof (tsd_hash_bin_t)*(1<<table->ht_bits)); + kmem_free(table, sizeof (tsd_hash_table_t)); +} + +/* + * tsd_remove_entry - remove a tsd entry for this thread + * @entry: entry to remove + * + * Remove the thread specific data @entry for this thread. + * If this is the last entry for this thread, also remove the PID entry. 
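+ * This path is taken when a caller clears a value by passing NULL to
+ * tsd_set(); see below.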
+ */ +static void +tsd_remove_entry(tsd_hash_entry_t *entry) +{ + HLIST_HEAD(work); + tsd_hash_table_t *table; + tsd_hash_entry_t *pid_entry; + tsd_hash_bin_t *pid_entry_bin, *entry_bin; + ulong_t hash; + + table = tsd_hash_table; + ASSERT3P(table, !=, NULL); + ASSERT3P(entry, !=, NULL); + + spin_lock(&table->ht_lock); + + hash = hash_long((ulong_t)entry->he_key * + (ulong_t)entry->he_pid, table->ht_bits); + entry_bin = &table->ht_bins[hash]; + + /* save the possible pid_entry */ + pid_entry = list_entry(entry->he_pid_list.next, tsd_hash_entry_t, + he_pid_list); + + /* remove entry */ + spin_lock(&entry_bin->hb_lock); + tsd_hash_del(table, entry); + hlist_add_head(&entry->he_list, &work); + spin_unlock(&entry_bin->hb_lock); + + /* if pid_entry is indeed pid_entry, then remove it if it's empty */ + if (pid_entry->he_key == PID_KEY && + list_empty(&pid_entry->he_pid_list)) { + hash = hash_long((ulong_t)pid_entry->he_key * + (ulong_t)pid_entry->he_pid, table->ht_bits); + pid_entry_bin = &table->ht_bins[hash]; + + spin_lock(&pid_entry_bin->hb_lock); + tsd_hash_del(table, pid_entry); + hlist_add_head(&pid_entry->he_list, &work); + spin_unlock(&pid_entry_bin->hb_lock); + } + + spin_unlock(&table->ht_lock); + + tsd_hash_dtor(&work); +} + +/* + * tsd_set - set thread specific data + * @key: lookup key + * @value: value to set + * + * Caller must prevent racing tsd_create() or tsd_destroy(), protected + * from racing tsd_get() or tsd_set() because it is thread specific. + * This function has been optimized to be fast for the update case. + * When setting the tsd initially it will be slower due to additional + * required locking and potential memory allocations. + */ +int +tsd_set(uint_t key, void *value) +{ + tsd_hash_table_t *table; + tsd_hash_entry_t *entry; + pid_t pid; + int rc; + /* mark remove if value is NULL */ + boolean_t remove = (value == NULL); + + table = tsd_hash_table; + pid = curthread->pid; + ASSERT3P(table, !=, NULL); + + if ((key == 0) || (key > TSD_KEYS_MAX)) + return (EINVAL); + + /* Entry already exists in hash table update value */ + entry = tsd_hash_search(table, key, pid); + if (entry) { + entry->he_value = value; + /* remove the entry */ + if (remove) + tsd_remove_entry(entry); + return (0); + } + + /* don't create entry if value is NULL */ + if (remove) + return (0); + + /* Add a process entry to the hash if not yet exists */ + entry = tsd_hash_search(table, PID_KEY, pid); + if (entry == NULL) { + rc = tsd_hash_add_pid(table, pid); + if (rc) + return (rc); + } + + rc = tsd_hash_add(table, key, pid, value); + return (rc); +} +EXPORT_SYMBOL(tsd_set); + +/* + * tsd_get - get thread specific data + * @key: lookup key + * + * Caller must prevent racing tsd_create() or tsd_destroy(). This + * implementation is designed to be fast and scalable, it does not + * lock the entire table only a single hash bin. + */ +void * +tsd_get(uint_t key) +{ + tsd_hash_entry_t *entry; + + ASSERT3P(tsd_hash_table, !=, NULL); + + if ((key == 0) || (key > TSD_KEYS_MAX)) + return (NULL); + + entry = tsd_hash_search(tsd_hash_table, key, curthread->pid); + if (entry == NULL) + return (NULL); + + return (entry->he_value); +} +EXPORT_SYMBOL(tsd_get); + +/* + * tsd_get_by_thread - get thread specific data for specified thread + * @key: lookup key + * @thread: thread to lookup + * + * Caller must prevent racing tsd_create() or tsd_destroy(). This + * implementation is designed to be fast and scalable, it does not + * lock the entire table only a single hash bin. 
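+ * Unlike tsd_get(), the lookup is keyed on the pid of the passed
+ * thread rather than on curthread.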
+ */ +void * +tsd_get_by_thread(uint_t key, kthread_t *thread) +{ + tsd_hash_entry_t *entry; + + ASSERT3P(tsd_hash_table, !=, NULL); + + if ((key == 0) || (key > TSD_KEYS_MAX)) + return (NULL); + + entry = tsd_hash_search(tsd_hash_table, key, thread->pid); + if (entry == NULL) + return (NULL); + + return (entry->he_value); +} +EXPORT_SYMBOL(tsd_get_by_thread); + +/* + * tsd_create - create thread specific data key + * @keyp: lookup key address + * @dtor: destructor called during tsd_destroy() or tsd_exit() + * + * Provided key must be set to 0 or it assumed to be already in use. + * The dtor is allowed to be NULL in which case no additional cleanup + * for the data is performed during tsd_destroy() or tsd_exit(). + * + * Caller must prevent racing tsd_set() or tsd_get(), this function is + * safe from racing tsd_create(), tsd_destroy(), and tsd_exit(). + */ +void +tsd_create(uint_t *keyp, dtor_func_t dtor) +{ + ASSERT3P(keyp, !=, NULL); + if (*keyp) + return; + + (void) tsd_hash_add_key(tsd_hash_table, keyp, dtor); +} +EXPORT_SYMBOL(tsd_create); + +/* + * tsd_destroy - destroy thread specific data + * @keyp: lookup key address + * + * Destroys the thread specific data on all threads which use this key. + * + * Caller must prevent racing tsd_set() or tsd_get(), this function is + * safe from racing tsd_create(), tsd_destroy(), and tsd_exit(). + */ +void +tsd_destroy(uint_t *keyp) +{ + HLIST_HEAD(work); + tsd_hash_table_t *table; + tsd_hash_entry_t *dtor_entry, *entry; + tsd_hash_bin_t *dtor_entry_bin, *entry_bin; + ulong_t hash; + + table = tsd_hash_table; + ASSERT3P(table, !=, NULL); + + spin_lock(&table->ht_lock); + dtor_entry = tsd_hash_search(table, *keyp, DTOR_PID); + if (dtor_entry == NULL) { + spin_unlock(&table->ht_lock); + return; + } + + /* + * All threads which use this key must be linked off of the + * DTOR_PID entry. They are removed from the hash table and + * linked in to a private working list to be destroyed. + */ + while (!list_empty(&dtor_entry->he_key_list)) { + entry = list_entry(dtor_entry->he_key_list.next, + tsd_hash_entry_t, he_key_list); + ASSERT3U(dtor_entry->he_key, ==, entry->he_key); + ASSERT3P(dtor_entry->he_dtor, ==, entry->he_dtor); + + hash = hash_long((ulong_t)entry->he_key * + (ulong_t)entry->he_pid, table->ht_bits); + entry_bin = &table->ht_bins[hash]; + + spin_lock(&entry_bin->hb_lock); + tsd_hash_del(table, entry); + hlist_add_head(&entry->he_list, &work); + spin_unlock(&entry_bin->hb_lock); + } + + hash = hash_long((ulong_t)dtor_entry->he_key * + (ulong_t)dtor_entry->he_pid, table->ht_bits); + dtor_entry_bin = &table->ht_bins[hash]; + + spin_lock(&dtor_entry_bin->hb_lock); + tsd_hash_del(table, dtor_entry); + hlist_add_head(&dtor_entry->he_list, &work); + spin_unlock(&dtor_entry_bin->hb_lock); + spin_unlock(&table->ht_lock); + + tsd_hash_dtor(&work); + *keyp = 0; +} +EXPORT_SYMBOL(tsd_destroy); + +/* + * tsd_exit - destroys all thread specific data for this thread + * + * Destroys all the thread specific data for this thread. + * + * Caller must prevent racing tsd_set() or tsd_get(), this function is + * safe from racing tsd_create(), tsd_destroy(), and tsd_exit(). 
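+ *
+ * Threads created with thread_create() get this for free because
+ * __thread_exit() calls tsd_exit() before completing; other threads
+ * which have used tsd_set() must call it explicitly before exiting.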
+ */ +void +tsd_exit(void) +{ + HLIST_HEAD(work); + tsd_hash_table_t *table; + tsd_hash_entry_t *pid_entry, *entry; + tsd_hash_bin_t *pid_entry_bin, *entry_bin; + ulong_t hash; + + table = tsd_hash_table; + ASSERT3P(table, !=, NULL); + + spin_lock(&table->ht_lock); + pid_entry = tsd_hash_search(table, PID_KEY, curthread->pid); + if (pid_entry == NULL) { + spin_unlock(&table->ht_lock); + return; + } + + /* + * All keys associated with this pid must be linked off of the + * PID_KEY entry. They are removed from the hash table and + * linked in to a private working list to be destroyed. + */ + + while (!list_empty(&pid_entry->he_pid_list)) { + entry = list_entry(pid_entry->he_pid_list.next, + tsd_hash_entry_t, he_pid_list); + ASSERT3U(pid_entry->he_pid, ==, entry->he_pid); + + hash = hash_long((ulong_t)entry->he_key * + (ulong_t)entry->he_pid, table->ht_bits); + entry_bin = &table->ht_bins[hash]; + + spin_lock(&entry_bin->hb_lock); + tsd_hash_del(table, entry); + hlist_add_head(&entry->he_list, &work); + spin_unlock(&entry_bin->hb_lock); + } + + hash = hash_long((ulong_t)pid_entry->he_key * + (ulong_t)pid_entry->he_pid, table->ht_bits); + pid_entry_bin = &table->ht_bins[hash]; + + spin_lock(&pid_entry_bin->hb_lock); + tsd_hash_del(table, pid_entry); + hlist_add_head(&pid_entry->he_list, &work); + spin_unlock(&pid_entry_bin->hb_lock); + spin_unlock(&table->ht_lock); + + tsd_hash_dtor(&work); +} +EXPORT_SYMBOL(tsd_exit); + +int +spl_tsd_init(void) +{ + tsd_hash_table = tsd_hash_table_init(TSD_HASH_TABLE_BITS_DEFAULT); + if (tsd_hash_table == NULL) + return (1); + + return (0); +} + +void +spl_tsd_fini(void) +{ + tsd_hash_table_fini(tsd_hash_table); + tsd_hash_table = NULL; +} diff --git a/module/spl/spl-vmem.c b/module/spl/spl-vmem.c new file mode 100644 index 000000000..e1a84a911 --- /dev/null +++ b/module/spl/spl-vmem.c @@ -0,0 +1,135 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf <[email protected]>. + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see <http://zfsonlinux.org/>. + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <sys/debug.h> +#include <sys/vmem.h> +#include <sys/kmem_cache.h> +#include <sys/shrinker.h> +#include <linux/module.h> + +vmem_t *heap_arena = NULL; +EXPORT_SYMBOL(heap_arena); + +vmem_t *zio_alloc_arena = NULL; +EXPORT_SYMBOL(zio_alloc_arena); + +vmem_t *zio_arena = NULL; +EXPORT_SYMBOL(zio_arena); + +#define VMEM_FLOOR_SIZE (4 * 1024 * 1024) /* 4MB floor */ + +/* + * Return approximate virtual memory usage based on these assumptions: + * + * 1) The major SPL consumer of virtual memory is the kmem cache. + * 2) Memory allocated with vmem_alloc() is short lived and can be ignored. 
+ * 3) Allow a 4MB floor as a generous pad given normal consumption. + * 4) The spl_kmem_cache_sem only contends with cache create/destroy. + */ +size_t +vmem_size(vmem_t *vmp, int typemask) +{ + spl_kmem_cache_t *skc; + size_t alloc = VMEM_FLOOR_SIZE; + + if ((typemask & VMEM_ALLOC) && (typemask & VMEM_FREE)) + return (VMALLOC_TOTAL); + + + down_read(&spl_kmem_cache_sem); + list_for_each_entry(skc, &spl_kmem_cache_list, skc_list) { + if (skc->skc_flags & KMC_VMEM) + alloc += skc->skc_slab_size * skc->skc_slab_total; + } + up_read(&spl_kmem_cache_sem); + + if (typemask & VMEM_ALLOC) + return (MIN(alloc, VMALLOC_TOTAL)); + else if (typemask & VMEM_FREE) + return (MAX(VMALLOC_TOTAL - alloc, 0)); + else + return (0); +} +EXPORT_SYMBOL(vmem_size); + +/* + * Public vmem_alloc(), vmem_zalloc() and vmem_free() interfaces. + */ +void * +spl_vmem_alloc(size_t size, int flags, const char *func, int line) +{ + ASSERT0(flags & ~KM_PUBLIC_MASK); + + flags |= KM_VMEM; + +#if !defined(DEBUG_KMEM) + return (spl_kmem_alloc_impl(size, flags, NUMA_NO_NODE)); +#elif !defined(DEBUG_KMEM_TRACKING) + return (spl_kmem_alloc_debug(size, flags, NUMA_NO_NODE)); +#else + return (spl_kmem_alloc_track(size, flags, func, line, NUMA_NO_NODE)); +#endif +} +EXPORT_SYMBOL(spl_vmem_alloc); + +void * +spl_vmem_zalloc(size_t size, int flags, const char *func, int line) +{ + ASSERT0(flags & ~KM_PUBLIC_MASK); + + flags |= (KM_VMEM | KM_ZERO); + +#if !defined(DEBUG_KMEM) + return (spl_kmem_alloc_impl(size, flags, NUMA_NO_NODE)); +#elif !defined(DEBUG_KMEM_TRACKING) + return (spl_kmem_alloc_debug(size, flags, NUMA_NO_NODE)); +#else + return (spl_kmem_alloc_track(size, flags, func, line, NUMA_NO_NODE)); +#endif +} +EXPORT_SYMBOL(spl_vmem_zalloc); + +void +spl_vmem_free(const void *buf, size_t size) +{ +#if !defined(DEBUG_KMEM) + return (spl_kmem_free_impl(buf, size)); +#elif !defined(DEBUG_KMEM_TRACKING) + return (spl_kmem_free_debug(buf, size)); +#else + return (spl_kmem_free_track(buf, size)); +#endif +} +EXPORT_SYMBOL(spl_vmem_free); + +int +spl_vmem_init(void) +{ + return (0); +} + +void +spl_vmem_fini(void) +{ +} diff --git a/module/spl/spl-vnode.c b/module/spl/spl-vnode.c new file mode 100644 index 000000000..28ce21276 --- /dev/null +++ b/module/spl/spl-vnode.c @@ -0,0 +1,779 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf <[email protected]>. + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see <http://zfsonlinux.org/>. + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see <http://www.gnu.org/licenses/>. + * + * Solaris Porting Layer (SPL) Vnode Implementation. 
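+ *
+ * These interfaces provide just enough of the Solaris vnode and file_t
+ * semantics (vn_open(), vn_rdwr(), vn_getattr(), getf()/releasef(), ...)
+ * for the ported code, implemented on top of the Linux struct file API.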
+ */ + +#include <sys/cred.h> +#include <sys/vnode.h> +#include <sys/kmem_cache.h> +#include <linux/falloc.h> +#include <linux/fs.h> +#include <linux/uaccess.h> +#ifdef HAVE_FDTABLE_HEADER +#include <linux/fdtable.h> +#endif + +vnode_t *rootdir = (vnode_t *)0xabcd1234; +EXPORT_SYMBOL(rootdir); + +static spl_kmem_cache_t *vn_cache; +static spl_kmem_cache_t *vn_file_cache; + +static DEFINE_SPINLOCK(vn_file_lock); +static LIST_HEAD(vn_file_list); + +static int +spl_filp_fallocate(struct file *fp, int mode, loff_t offset, loff_t len) +{ + int error = -EOPNOTSUPP; + +#ifdef HAVE_FILE_FALLOCATE + if (fp->f_op->fallocate) + error = fp->f_op->fallocate(fp, mode, offset, len); +#else +#ifdef HAVE_INODE_FALLOCATE + if (fp->f_dentry && fp->f_dentry->d_inode && + fp->f_dentry->d_inode->i_op->fallocate) + error = fp->f_dentry->d_inode->i_op->fallocate( + fp->f_dentry->d_inode, mode, offset, len); +#endif /* HAVE_INODE_FALLOCATE */ +#endif /* HAVE_FILE_FALLOCATE */ + + return (error); +} + +static int +spl_filp_fsync(struct file *fp, int sync) +{ +#ifdef HAVE_2ARGS_VFS_FSYNC + return (vfs_fsync(fp, sync)); +#else + return (vfs_fsync(fp, (fp)->f_dentry, sync)); +#endif /* HAVE_2ARGS_VFS_FSYNC */ +} + +static ssize_t +spl_kernel_write(struct file *file, const void *buf, size_t count, loff_t *pos) +{ +#if defined(HAVE_KERNEL_WRITE_PPOS) + return (kernel_write(file, buf, count, pos)); +#else + mm_segment_t saved_fs; + ssize_t ret; + + saved_fs = get_fs(); + set_fs(get_ds()); + + ret = vfs_write(file, (__force const char __user *)buf, count, pos); + + set_fs(saved_fs); + + return (ret); +#endif +} + +static ssize_t +spl_kernel_read(struct file *file, void *buf, size_t count, loff_t *pos) +{ +#if defined(HAVE_KERNEL_READ_PPOS) + return (kernel_read(file, buf, count, pos)); +#else + mm_segment_t saved_fs; + ssize_t ret; + + saved_fs = get_fs(); + set_fs(get_ds()); + + ret = vfs_read(file, (void __user *)buf, count, pos); + + set_fs(saved_fs); + + return (ret); +#endif +} + +vtype_t +vn_mode_to_vtype(mode_t mode) +{ + if (S_ISREG(mode)) + return (VREG); + + if (S_ISDIR(mode)) + return (VDIR); + + if (S_ISCHR(mode)) + return (VCHR); + + if (S_ISBLK(mode)) + return (VBLK); + + if (S_ISFIFO(mode)) + return (VFIFO); + + if (S_ISLNK(mode)) + return (VLNK); + + if (S_ISSOCK(mode)) + return (VSOCK); + + return (VNON); +} /* vn_mode_to_vtype() */ +EXPORT_SYMBOL(vn_mode_to_vtype); + +mode_t +vn_vtype_to_mode(vtype_t vtype) +{ + if (vtype == VREG) + return (S_IFREG); + + if (vtype == VDIR) + return (S_IFDIR); + + if (vtype == VCHR) + return (S_IFCHR); + + if (vtype == VBLK) + return (S_IFBLK); + + if (vtype == VFIFO) + return (S_IFIFO); + + if (vtype == VLNK) + return (S_IFLNK); + + if (vtype == VSOCK) + return (S_IFSOCK); + + return (VNON); +} /* vn_vtype_to_mode() */ +EXPORT_SYMBOL(vn_vtype_to_mode); + +vnode_t * +vn_alloc(int flag) +{ + vnode_t *vp; + + vp = kmem_cache_alloc(vn_cache, flag); + if (vp != NULL) { + vp->v_file = NULL; + vp->v_type = 0; + } + + return (vp); +} /* vn_alloc() */ +EXPORT_SYMBOL(vn_alloc); + +void +vn_free(vnode_t *vp) +{ + kmem_cache_free(vn_cache, vp); +} /* vn_free() */ +EXPORT_SYMBOL(vn_free); + +int +vn_open(const char *path, uio_seg_t seg, int flags, int mode, vnode_t **vpp, + int x1, void *x2) +{ + struct file *fp; + struct kstat stat; + int rc, saved_umask = 0; + gfp_t saved_gfp; + vnode_t *vp; + + ASSERT(flags & (FWRITE | FREAD)); + ASSERT(seg == UIO_SYSSPACE); + ASSERT(vpp); + *vpp = NULL; + + if (!(flags & FCREAT) && (flags & FWRITE)) + flags |= FEXCL; + + /* + * Note for 
filp_open() the two low bits must be remapped to mean: + * 01 - read-only -> 00 read-only + * 10 - write-only -> 01 write-only + * 11 - read-write -> 10 read-write + */ + flags--; + + if (flags & FCREAT) + saved_umask = xchg(¤t->fs->umask, 0); + + fp = filp_open(path, flags, mode); + + if (flags & FCREAT) + (void) xchg(¤t->fs->umask, saved_umask); + + if (IS_ERR(fp)) + return (-PTR_ERR(fp)); + +#if defined(HAVE_4ARGS_VFS_GETATTR) + rc = vfs_getattr(&fp->f_path, &stat, STATX_TYPE, AT_STATX_SYNC_AS_STAT); +#elif defined(HAVE_2ARGS_VFS_GETATTR) + rc = vfs_getattr(&fp->f_path, &stat); +#else + rc = vfs_getattr(fp->f_path.mnt, fp->f_dentry, &stat); +#endif + if (rc) { + filp_close(fp, 0); + return (-rc); + } + + vp = vn_alloc(KM_SLEEP); + if (!vp) { + filp_close(fp, 0); + return (ENOMEM); + } + + saved_gfp = mapping_gfp_mask(fp->f_mapping); + mapping_set_gfp_mask(fp->f_mapping, saved_gfp & ~(__GFP_IO|__GFP_FS)); + + mutex_enter(&vp->v_lock); + vp->v_type = vn_mode_to_vtype(stat.mode); + vp->v_file = fp; + vp->v_gfp_mask = saved_gfp; + *vpp = vp; + mutex_exit(&vp->v_lock); + + return (0); +} /* vn_open() */ +EXPORT_SYMBOL(vn_open); + +int +vn_openat(const char *path, uio_seg_t seg, int flags, int mode, + vnode_t **vpp, int x1, void *x2, vnode_t *vp, int fd) +{ + char *realpath; + int len, rc; + + ASSERT(vp == rootdir); + + len = strlen(path) + 2; + realpath = kmalloc(len, kmem_flags_convert(KM_SLEEP)); + if (!realpath) + return (ENOMEM); + + (void) snprintf(realpath, len, "/%s", path); + rc = vn_open(realpath, seg, flags, mode, vpp, x1, x2); + kfree(realpath); + + return (rc); +} /* vn_openat() */ +EXPORT_SYMBOL(vn_openat); + +int +vn_rdwr(uio_rw_t uio, vnode_t *vp, void *addr, ssize_t len, offset_t off, + uio_seg_t seg, int ioflag, rlim64_t x2, void *x3, ssize_t *residp) +{ + struct file *fp = vp->v_file; + loff_t offset = off; + int rc; + + ASSERT(uio == UIO_WRITE || uio == UIO_READ); + ASSERT(seg == UIO_SYSSPACE); + ASSERT((ioflag & ~FAPPEND) == 0); + + if (ioflag & FAPPEND) + offset = fp->f_pos; + + if (uio & UIO_WRITE) + rc = spl_kernel_write(fp, addr, len, &offset); + else + rc = spl_kernel_read(fp, addr, len, &offset); + + fp->f_pos = offset; + + if (rc < 0) + return (-rc); + + if (residp) { + *residp = len - rc; + } else { + if (rc != len) + return (EIO); + } + + return (0); +} /* vn_rdwr() */ +EXPORT_SYMBOL(vn_rdwr); + +int +vn_close(vnode_t *vp, int flags, int x1, int x2, void *x3, void *x4) +{ + int rc; + + ASSERT(vp); + ASSERT(vp->v_file); + + mapping_set_gfp_mask(vp->v_file->f_mapping, vp->v_gfp_mask); + rc = filp_close(vp->v_file, 0); + vn_free(vp); + + return (-rc); +} /* vn_close() */ +EXPORT_SYMBOL(vn_close); + +/* + * vn_seek() does not actually seek it only performs bounds checking on the + * proposed seek. We perform minimal checking and allow vn_rdwr() to catch + * anything more serious. + */ +int +vn_seek(vnode_t *vp, offset_t ooff, offset_t *noffp, void *ct) +{ + return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? 
EINVAL : 0); +} +EXPORT_SYMBOL(vn_seek); + +int +vn_getattr(vnode_t *vp, vattr_t *vap, int flags, void *x3, void *x4) +{ + struct file *fp; + struct kstat stat; + int rc; + + ASSERT(vp); + ASSERT(vp->v_file); + ASSERT(vap); + + fp = vp->v_file; + +#if defined(HAVE_4ARGS_VFS_GETATTR) + rc = vfs_getattr(&fp->f_path, &stat, STATX_BASIC_STATS, + AT_STATX_SYNC_AS_STAT); +#elif defined(HAVE_2ARGS_VFS_GETATTR) + rc = vfs_getattr(&fp->f_path, &stat); +#else + rc = vfs_getattr(fp->f_path.mnt, fp->f_dentry, &stat); +#endif + if (rc) + return (-rc); + + vap->va_type = vn_mode_to_vtype(stat.mode); + vap->va_mode = stat.mode; + vap->va_uid = KUID_TO_SUID(stat.uid); + vap->va_gid = KGID_TO_SGID(stat.gid); + vap->va_fsid = 0; + vap->va_nodeid = stat.ino; + vap->va_nlink = stat.nlink; + vap->va_size = stat.size; + vap->va_blksize = stat.blksize; + vap->va_atime = stat.atime; + vap->va_mtime = stat.mtime; + vap->va_ctime = stat.ctime; + vap->va_rdev = stat.rdev; + vap->va_nblocks = stat.blocks; + + return (0); +} +EXPORT_SYMBOL(vn_getattr); + +int +vn_fsync(vnode_t *vp, int flags, void *x3, void *x4) +{ + int datasync = 0; + int error; + int fstrans; + + ASSERT(vp); + ASSERT(vp->v_file); + + if (flags & FDSYNC) + datasync = 1; + + /* + * May enter XFS which generates a warning when PF_FSTRANS is set. + * To avoid this the flag is cleared over vfs_sync() and then reset. + */ + fstrans = __spl_pf_fstrans_check(); + if (fstrans) + current->flags &= ~(__SPL_PF_FSTRANS); + + error = -spl_filp_fsync(vp->v_file, datasync); + if (fstrans) + current->flags |= __SPL_PF_FSTRANS; + + return (error); +} /* vn_fsync() */ +EXPORT_SYMBOL(vn_fsync); + +int vn_space(vnode_t *vp, int cmd, struct flock *bfp, int flag, + offset_t offset, void *x6, void *x7) +{ + int error = EOPNOTSUPP; +#ifdef FALLOC_FL_PUNCH_HOLE + int fstrans; +#endif + + if (cmd != F_FREESP || bfp->l_whence != 0) + return (EOPNOTSUPP); + + ASSERT(vp); + ASSERT(vp->v_file); + ASSERT(bfp->l_start >= 0 && bfp->l_len > 0); + +#ifdef FALLOC_FL_PUNCH_HOLE + /* + * May enter XFS which generates a warning when PF_FSTRANS is set. + * To avoid this the flag is cleared over vfs_sync() and then reset. + */ + fstrans = __spl_pf_fstrans_check(); + if (fstrans) + current->flags &= ~(__SPL_PF_FSTRANS); + + /* + * When supported by the underlying file system preferentially + * use the fallocate() callback to preallocate the space. + */ + error = -spl_filp_fallocate(vp->v_file, + FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, + bfp->l_start, bfp->l_len); + + if (fstrans) + current->flags |= __SPL_PF_FSTRANS; + + if (error == 0) + return (0); +#endif + +#ifdef HAVE_INODE_TRUNCATE_RANGE + if (vp->v_file->f_dentry && vp->v_file->f_dentry->d_inode && + vp->v_file->f_dentry->d_inode->i_op && + vp->v_file->f_dentry->d_inode->i_op->truncate_range) { + off_t end = bfp->l_start + bfp->l_len; + /* + * Judging from the code in shmem_truncate_range(), + * it seems the kernel expects the end offset to be + * inclusive and aligned to the end of a page. 
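+		 * For example (assuming 4 KiB pages and an l_start of 0), a
+		 * request with an l_len of 6144 yields end == 6144, which is
+		 * first clipped down to 4096 and then decremented to the
+		 * inclusive offset 4095.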
+ */ + if (end % PAGE_SIZE != 0) { + end &= ~(off_t)(PAGE_SIZE - 1); + if (end <= bfp->l_start) + return (0); + } + --end; + + vp->v_file->f_dentry->d_inode->i_op->truncate_range( + vp->v_file->f_dentry->d_inode, bfp->l_start, end); + + return (0); + } +#endif + + return (error); +} +EXPORT_SYMBOL(vn_space); + +/* Function must be called while holding the vn_file_lock */ +static file_t * +file_find(int fd, struct task_struct *task) +{ + file_t *fp; + + list_for_each_entry(fp, &vn_file_list, f_list) { + if (fd == fp->f_fd && fp->f_task == task) { + ASSERT(atomic_read(&fp->f_ref) != 0); + return (fp); + } + } + + return (NULL); +} /* file_find() */ + +file_t * +vn_getf(int fd) +{ + struct kstat stat; + struct file *lfp; + file_t *fp; + vnode_t *vp; + int rc = 0; + + if (fd < 0) + return (NULL); + + /* Already open just take an extra reference */ + spin_lock(&vn_file_lock); + + fp = file_find(fd, current); + if (fp) { + lfp = fget(fd); + fput(fp->f_file); + /* + * areleasef() can cause us to see a stale reference when + * userspace has reused a file descriptor before areleasef() + * has run. fput() the stale reference and replace it. We + * retain the original reference count such that the concurrent + * areleasef() will decrement its reference and terminate. + */ + if (lfp != fp->f_file) { + fp->f_file = lfp; + fp->f_vnode->v_file = lfp; + } + atomic_inc(&fp->f_ref); + spin_unlock(&vn_file_lock); + return (fp); + } + + spin_unlock(&vn_file_lock); + + /* File was not yet opened create the object and setup */ + fp = kmem_cache_alloc(vn_file_cache, KM_SLEEP); + if (fp == NULL) + goto out; + + mutex_enter(&fp->f_lock); + + fp->f_fd = fd; + fp->f_task = current; + fp->f_offset = 0; + atomic_inc(&fp->f_ref); + + lfp = fget(fd); + if (lfp == NULL) + goto out_mutex; + + vp = vn_alloc(KM_SLEEP); + if (vp == NULL) + goto out_fget; + +#if defined(HAVE_4ARGS_VFS_GETATTR) + rc = vfs_getattr(&lfp->f_path, &stat, STATX_TYPE, + AT_STATX_SYNC_AS_STAT); +#elif defined(HAVE_2ARGS_VFS_GETATTR) + rc = vfs_getattr(&lfp->f_path, &stat); +#else + rc = vfs_getattr(lfp->f_path.mnt, lfp->f_dentry, &stat); +#endif + if (rc) + goto out_vnode; + + mutex_enter(&vp->v_lock); + vp->v_type = vn_mode_to_vtype(stat.mode); + vp->v_file = lfp; + mutex_exit(&vp->v_lock); + + fp->f_vnode = vp; + fp->f_file = lfp; + + /* Put it on the tracking list */ + spin_lock(&vn_file_lock); + list_add(&fp->f_list, &vn_file_list); + spin_unlock(&vn_file_lock); + + mutex_exit(&fp->f_lock); + return (fp); + +out_vnode: + vn_free(vp); +out_fget: + fput(lfp); +out_mutex: + mutex_exit(&fp->f_lock); + kmem_cache_free(vn_file_cache, fp); +out: + return (NULL); +} /* getf() */ +EXPORT_SYMBOL(getf); + +static void releasef_locked(file_t *fp) +{ + ASSERT(fp->f_file); + ASSERT(fp->f_vnode); + + /* Unlinked from list, no refs, safe to free outside mutex */ + fput(fp->f_file); + vn_free(fp->f_vnode); + + kmem_cache_free(vn_file_cache, fp); +} + +void +vn_releasef(int fd) +{ + areleasef(fd, P_FINFO(current)); +} +EXPORT_SYMBOL(releasef); + +void +vn_areleasef(int fd, uf_info_t *fip) +{ + file_t *fp; + struct task_struct *task = (struct task_struct *)fip; + + if (fd < 0) + return; + + spin_lock(&vn_file_lock); + fp = file_find(fd, task); + if (fp) { + atomic_dec(&fp->f_ref); + if (atomic_read(&fp->f_ref) > 0) { + spin_unlock(&vn_file_lock); + return; + } + + list_del(&fp->f_list); + releasef_locked(fp); + } + spin_unlock(&vn_file_lock); +} /* releasef() */ +EXPORT_SYMBOL(areleasef); + + +static void +#ifdef HAVE_SET_FS_PWD_WITH_CONST +vn_set_fs_pwd(struct 
fs_struct *fs, const struct path *path) +#else +vn_set_fs_pwd(struct fs_struct *fs, struct path *path) +#endif /* HAVE_SET_FS_PWD_WITH_CONST */ +{ + struct path old_pwd; + +#ifdef HAVE_FS_STRUCT_SPINLOCK + spin_lock(&fs->lock); + old_pwd = fs->pwd; + fs->pwd = *path; + path_get(path); + spin_unlock(&fs->lock); +#else + write_lock(&fs->lock); + old_pwd = fs->pwd; + fs->pwd = *path; + path_get(path); + write_unlock(&fs->lock); +#endif /* HAVE_FS_STRUCT_SPINLOCK */ + + if (old_pwd.dentry) + path_put(&old_pwd); +} + +int +vn_set_pwd(const char *filename) +{ + struct path path; + mm_segment_t saved_fs; + int rc; + + /* + * user_path_dir() and __user_walk() both expect 'filename' to be + * a user space address so we must briefly increase the data segment + * size to ensure strncpy_from_user() does not fail with -EFAULT. + */ + saved_fs = get_fs(); + set_fs(get_ds()); + + rc = user_path_dir(filename, &path); + if (rc) + goto out; + + rc = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_ACCESS); + if (rc) + goto dput_and_out; + + vn_set_fs_pwd(current->fs, &path); + +dput_and_out: + path_put(&path); +out: + set_fs(saved_fs); + + return (-rc); +} /* vn_set_pwd() */ +EXPORT_SYMBOL(vn_set_pwd); + +static int +vn_cache_constructor(void *buf, void *cdrarg, int kmflags) +{ + struct vnode *vp = buf; + + mutex_init(&vp->v_lock, NULL, MUTEX_DEFAULT, NULL); + + return (0); +} /* vn_cache_constructor() */ + +static void +vn_cache_destructor(void *buf, void *cdrarg) +{ + struct vnode *vp = buf; + + mutex_destroy(&vp->v_lock); +} /* vn_cache_destructor() */ + +static int +vn_file_cache_constructor(void *buf, void *cdrarg, int kmflags) +{ + file_t *fp = buf; + + atomic_set(&fp->f_ref, 0); + mutex_init(&fp->f_lock, NULL, MUTEX_DEFAULT, NULL); + INIT_LIST_HEAD(&fp->f_list); + + return (0); +} /* vn_file_cache_constructor() */ + +static void +vn_file_cache_destructor(void *buf, void *cdrarg) +{ + file_t *fp = buf; + + mutex_destroy(&fp->f_lock); +} /* vn_file_cache_destructor() */ + +int +spl_vn_init(void) +{ + vn_cache = kmem_cache_create("spl_vn_cache", + sizeof (struct vnode), 64, vn_cache_constructor, + vn_cache_destructor, NULL, NULL, NULL, 0); + + vn_file_cache = kmem_cache_create("spl_vn_file_cache", + sizeof (file_t), 64, vn_file_cache_constructor, + vn_file_cache_destructor, NULL, NULL, NULL, 0); + + return (0); +} /* spl_vn_init() */ + +void +spl_vn_fini(void) +{ + file_t *fp, *next_fp; + int leaked = 0; + + spin_lock(&vn_file_lock); + + list_for_each_entry_safe(fp, next_fp, &vn_file_list, f_list) { + list_del(&fp->f_list); + releasef_locked(fp); + leaked++; + } + + spin_unlock(&vn_file_lock); + + if (leaked > 0) + printk(KERN_WARNING "WARNING: %d vnode files leaked\n", leaked); + + kmem_cache_destroy(vn_file_cache); + kmem_cache_destroy(vn_cache); +} /* spl_vn_fini() */ diff --git a/module/spl/spl-xdr.c b/module/spl/spl-xdr.c new file mode 100644 index 000000000..2cc3e2a03 --- /dev/null +++ b/module/spl/spl-xdr.c @@ -0,0 +1,515 @@ +/* + * Copyright (c) 2008-2010 Sun Microsystems, Inc. + * Written by Ricardo Correia <[email protected]> + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see <http://zfsonlinux.org/>. + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. 
+ * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see <http://www.gnu.org/licenses/>. + * + * Solaris Porting Layer (SPL) XDR Implementation. + */ + +#include <linux/string.h> +#include <sys/kmem.h> +#include <sys/debug.h> +#include <sys/types.h> +#include <sys/sysmacros.h> +#include <rpc/xdr.h> + +/* + * SPL's XDR mem implementation. + * + * This is used by libnvpair to serialize/deserialize the name-value pair data + * structures into byte arrays in a well-defined and portable manner. + * + * These data structures are used by the DMU/ZFS to flexibly manipulate various + * information in memory and later serialize it/deserialize it to disk. + * Examples of usages include the pool configuration, lists of pool and dataset + * properties, etc. + * + * Reference documentation for the XDR representation and XDR operations can be + * found in RFC 1832 and xdr(3), respectively. + * + * === Implementation shortcomings === + * + * It is assumed that the following C types have the following sizes: + * + * char/unsigned char: 1 byte + * short/unsigned short: 2 bytes + * int/unsigned int: 4 bytes + * longlong_t/u_longlong_t: 8 bytes + * + * The C standard allows these types to be larger (and in the case of ints, + * shorter), so if that is the case on some compiler/architecture, the build + * will fail (on purpose). + * + * If someone wants to fix the code to work properly on such environments, then: + * + * 1) Preconditions should be added to xdrmem_enc functions to make sure the + * caller doesn't pass arguments which exceed the expected range. + * 2) Functions which take signed integers should be changed to properly do + * sign extension. + * 3) For ints with less than 32 bits, well.. I suspect you'll have bigger + * problems than this implementation. + * + * It is also assumed that: + * + * 1) Chars have 8 bits. + * 2) We can always do 32-bit-aligned int memory accesses and byte-aligned + * memcpy, memset and memcmp. + * 3) Arrays passed to xdr_array() are packed and the compiler/architecture + * supports element-sized-aligned memory accesses. + * 4) Negative integers are natively stored in two's complement binary + * representation. + * + * No checks are done for the 4 assumptions above, though. + * + * === Caller expectations === + * + * Existing documentation does not describe the semantics of XDR operations very + * well. Therefore, some assumptions about failure semantics will be made and + * will be described below: + * + * 1) If any encoding operation fails (e.g., due to lack of buffer space), the + * the stream should be considered valid only up to the encoding operation + * previous to the one that first failed. However, the stream size as returned + * by xdr_control() cannot be considered to be strictly correct (it may be + * bigger). + * + * Putting it another way, if there is an encoding failure it's undefined + * whether anything is added to the stream in that operation and therefore + * neither xdr_control() nor future encoding operations on the same stream can + * be relied upon to produce correct results. 
+ * + * 2) If a decoding operation fails, it's undefined whether anything will be + * decoded into passed buffers/pointers during that operation, or what the + * values on those buffers will look like. + * + * Future decoding operations on the same stream will also have similar + * undefined behavior. + * + * 3) When the first decoding operation fails it is OK to trust the results of + * previous decoding operations on the same stream, as long as the caller + * expects a failure to be possible (e.g. due to end-of-stream). + * + * However, this is highly discouraged because the caller should know the + * stream size and should be coded to expect any decoding failure to be data + * corruption due to hardware, accidental or even malicious causes, which should + * be handled gracefully in all cases. + * + * In very rare situations where there are strong reasons to believe the data + * can be trusted to be valid and non-tampered with, then the caller may assume + * a decoding failure to be a bug (e.g. due to mismatched data types) and may + * fail non-gracefully. + * + * 4) Non-zero padding bytes will cause the decoding operation to fail. + * + * 5) Zero bytes on string types will also cause the decoding operation to fail. + * + * 6) It is assumed that either the pointer to the stream buffer given by the + * caller is 32-bit aligned or the architecture supports non-32-bit-aligned int + * memory accesses. + * + * 7) The stream buffer and encoding/decoding buffers/ptrs should not overlap. + * + * 8) If a caller passes pointers to non-kernel memory (e.g., pointers to user + * space or MMIO space), the computer may explode. + */ + +static struct xdr_ops xdrmem_encode_ops; +static struct xdr_ops xdrmem_decode_ops; + +typedef int bool_t; + +void +xdrmem_create(XDR *xdrs, const caddr_t addr, const uint_t size, + const enum xdr_op op) +{ + switch (op) { + case XDR_ENCODE: + xdrs->x_ops = &xdrmem_encode_ops; + break; + case XDR_DECODE: + xdrs->x_ops = &xdrmem_decode_ops; + break; + default: + xdrs->x_ops = NULL; /* Let the caller know we failed */ + return; + } + + xdrs->x_op = op; + xdrs->x_addr = addr; + xdrs->x_addr_end = addr + size; + + if (xdrs->x_addr_end < xdrs->x_addr) { + xdrs->x_ops = NULL; + } +} +EXPORT_SYMBOL(xdrmem_create); + +static bool_t +xdrmem_control(XDR *xdrs, int req, void *info) +{ + struct xdr_bytesrec *rec = (struct xdr_bytesrec *)info; + + if (req != XDR_GET_BYTES_AVAIL) + return (FALSE); + + rec->xc_is_last_record = TRUE; /* always TRUE in xdrmem streams */ + rec->xc_num_avail = xdrs->x_addr_end - xdrs->x_addr; + + return (TRUE); +} + +static bool_t +xdrmem_enc_bytes(XDR *xdrs, caddr_t cp, const uint_t cnt) +{ + uint_t size = roundup(cnt, 4); + uint_t pad; + + if (size < cnt) + return (FALSE); /* Integer overflow */ + + if (xdrs->x_addr > xdrs->x_addr_end) + return (FALSE); + + if (xdrs->x_addr_end - xdrs->x_addr < size) + return (FALSE); + + memcpy(xdrs->x_addr, cp, cnt); + + xdrs->x_addr += cnt; + + pad = size - cnt; + if (pad > 0) { + memset(xdrs->x_addr, 0, pad); + xdrs->x_addr += pad; + } + + return (TRUE); +} + +static bool_t +xdrmem_dec_bytes(XDR *xdrs, caddr_t cp, const uint_t cnt) +{ + static uint32_t zero = 0; + uint_t size = roundup(cnt, 4); + uint_t pad; + + if (size < cnt) + return (FALSE); /* Integer overflow */ + + if (xdrs->x_addr > xdrs->x_addr_end) + return (FALSE); + + if (xdrs->x_addr_end - xdrs->x_addr < size) + return (FALSE); + + memcpy(cp, xdrs->x_addr, cnt); + xdrs->x_addr += cnt; + + pad = size - cnt; + if (pad > 0) { + /* An inverted memchr() would 
be useful here... */ + if (memcmp(&zero, xdrs->x_addr, pad) != 0) + return (FALSE); + + xdrs->x_addr += pad; + } + + return (TRUE); +} + +static bool_t +xdrmem_enc_uint32(XDR *xdrs, uint32_t val) +{ + if (xdrs->x_addr + sizeof (uint32_t) > xdrs->x_addr_end) + return (FALSE); + + *((uint32_t *)xdrs->x_addr) = cpu_to_be32(val); + + xdrs->x_addr += sizeof (uint32_t); + + return (TRUE); +} + +static bool_t +xdrmem_dec_uint32(XDR *xdrs, uint32_t *val) +{ + if (xdrs->x_addr + sizeof (uint32_t) > xdrs->x_addr_end) + return (FALSE); + + *val = be32_to_cpu(*((uint32_t *)xdrs->x_addr)); + + xdrs->x_addr += sizeof (uint32_t); + + return (TRUE); +} + +static bool_t +xdrmem_enc_char(XDR *xdrs, char *cp) +{ + uint32_t val; + + BUILD_BUG_ON(sizeof (char) != 1); + val = *((unsigned char *) cp); + + return (xdrmem_enc_uint32(xdrs, val)); +} + +static bool_t +xdrmem_dec_char(XDR *xdrs, char *cp) +{ + uint32_t val; + + BUILD_BUG_ON(sizeof (char) != 1); + + if (!xdrmem_dec_uint32(xdrs, &val)) + return (FALSE); + + /* + * If any of the 3 other bytes are non-zero then val will be greater + * than 0xff and we fail because according to the RFC, this block does + * not have a char encoded in it. + */ + if (val > 0xff) + return (FALSE); + + *((unsigned char *) cp) = val; + + return (TRUE); +} + +static bool_t +xdrmem_enc_ushort(XDR *xdrs, unsigned short *usp) +{ + BUILD_BUG_ON(sizeof (unsigned short) != 2); + + return (xdrmem_enc_uint32(xdrs, *usp)); +} + +static bool_t +xdrmem_dec_ushort(XDR *xdrs, unsigned short *usp) +{ + uint32_t val; + + BUILD_BUG_ON(sizeof (unsigned short) != 2); + + if (!xdrmem_dec_uint32(xdrs, &val)) + return (FALSE); + + /* + * Short ints are not in the RFC, but we assume similar logic as in + * xdrmem_dec_char(). + */ + if (val > 0xffff) + return (FALSE); + + *usp = val; + + return (TRUE); +} + +static bool_t +xdrmem_enc_uint(XDR *xdrs, unsigned *up) +{ + BUILD_BUG_ON(sizeof (unsigned) != 4); + + return (xdrmem_enc_uint32(xdrs, *up)); +} + +static bool_t +xdrmem_dec_uint(XDR *xdrs, unsigned *up) +{ + BUILD_BUG_ON(sizeof (unsigned) != 4); + + return (xdrmem_dec_uint32(xdrs, (uint32_t *)up)); +} + +static bool_t +xdrmem_enc_ulonglong(XDR *xdrs, u_longlong_t *ullp) +{ + BUILD_BUG_ON(sizeof (u_longlong_t) != 8); + + if (!xdrmem_enc_uint32(xdrs, *ullp >> 32)) + return (FALSE); + + return (xdrmem_enc_uint32(xdrs, *ullp & 0xffffffff)); +} + +static bool_t +xdrmem_dec_ulonglong(XDR *xdrs, u_longlong_t *ullp) +{ + uint32_t low, high; + + BUILD_BUG_ON(sizeof (u_longlong_t) != 8); + + if (!xdrmem_dec_uint32(xdrs, &high)) + return (FALSE); + if (!xdrmem_dec_uint32(xdrs, &low)) + return (FALSE); + + *ullp = ((u_longlong_t)high << 32) | low; + + return (TRUE); +} + +static bool_t +xdr_enc_array(XDR *xdrs, caddr_t *arrp, uint_t *sizep, const uint_t maxsize, + const uint_t elsize, const xdrproc_t elproc) +{ + uint_t i; + caddr_t addr = *arrp; + + if (*sizep > maxsize || *sizep > UINT_MAX / elsize) + return (FALSE); + + if (!xdrmem_enc_uint(xdrs, sizep)) + return (FALSE); + + for (i = 0; i < *sizep; i++) { + if (!elproc(xdrs, addr)) + return (FALSE); + addr += elsize; + } + + return (TRUE); +} + +static bool_t +xdr_dec_array(XDR *xdrs, caddr_t *arrp, uint_t *sizep, const uint_t maxsize, + const uint_t elsize, const xdrproc_t elproc) +{ + uint_t i, size; + bool_t alloc = FALSE; + caddr_t addr; + + if (!xdrmem_dec_uint(xdrs, sizep)) + return (FALSE); + + size = *sizep; + + if (size > maxsize || size > UINT_MAX / elsize) + return (FALSE); + + /* + * The Solaris man page says: "If *arrp is NULL when decoding, 
+ * xdr_array() allocates memory and *arrp points to it". + */ + if (*arrp == NULL) { + BUILD_BUG_ON(sizeof (uint_t) > sizeof (size_t)); + + *arrp = kmem_alloc(size * elsize, KM_NOSLEEP); + if (*arrp == NULL) + return (FALSE); + + alloc = TRUE; + } + + addr = *arrp; + + for (i = 0; i < size; i++) { + if (!elproc(xdrs, addr)) { + if (alloc) + kmem_free(*arrp, size * elsize); + return (FALSE); + } + addr += elsize; + } + + return (TRUE); +} + +static bool_t +xdr_enc_string(XDR *xdrs, char **sp, const uint_t maxsize) +{ + size_t slen = strlen(*sp); + uint_t len; + + if (slen > maxsize) + return (FALSE); + + len = slen; + + if (!xdrmem_enc_uint(xdrs, &len)) + return (FALSE); + + return (xdrmem_enc_bytes(xdrs, *sp, len)); +} + +static bool_t +xdr_dec_string(XDR *xdrs, char **sp, const uint_t maxsize) +{ + uint_t size; + bool_t alloc = FALSE; + + if (!xdrmem_dec_uint(xdrs, &size)) + return (FALSE); + + if (size > maxsize || size > UINT_MAX - 1) + return (FALSE); + + /* + * Solaris man page: "If *sp is NULL when decoding, xdr_string() + * allocates memory and *sp points to it". + */ + if (*sp == NULL) { + BUILD_BUG_ON(sizeof (uint_t) > sizeof (size_t)); + + *sp = kmem_alloc(size + 1, KM_NOSLEEP); + if (*sp == NULL) + return (FALSE); + + alloc = TRUE; + } + + if (!xdrmem_dec_bytes(xdrs, *sp, size)) + goto fail; + + if (memchr(*sp, 0, size) != NULL) + goto fail; + + (*sp)[size] = '\0'; + + return (TRUE); + +fail: + if (alloc) + kmem_free(*sp, size + 1); + + return (FALSE); +} + +static struct xdr_ops xdrmem_encode_ops = { + .xdr_control = xdrmem_control, + .xdr_char = xdrmem_enc_char, + .xdr_u_short = xdrmem_enc_ushort, + .xdr_u_int = xdrmem_enc_uint, + .xdr_u_longlong_t = xdrmem_enc_ulonglong, + .xdr_opaque = xdrmem_enc_bytes, + .xdr_string = xdr_enc_string, + .xdr_array = xdr_enc_array +}; + +static struct xdr_ops xdrmem_decode_ops = { + .xdr_control = xdrmem_control, + .xdr_char = xdrmem_dec_char, + .xdr_u_short = xdrmem_dec_ushort, + .xdr_u_int = xdrmem_dec_uint, + .xdr_u_longlong_t = xdrmem_dec_ulonglong, + .xdr_opaque = xdrmem_dec_bytes, + .xdr_string = xdr_dec_string, + .xdr_array = xdr_dec_array +}; diff --git a/module/spl/spl-zlib.c b/module/spl/spl-zlib.c new file mode 100644 index 000000000..229e6a44b --- /dev/null +++ b/module/spl/spl-zlib.c @@ -0,0 +1,217 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf <[email protected]>. + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see <http://zfsonlinux.org/>. + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see <http://www.gnu.org/licenses/>. + * + * + * z_compress_level/z_uncompress are nearly identical copies of the + * compress2/uncompress functions provided by the official zlib package + * available at http://zlib.net/. 
The only changes made were to slightly + * adapt the functions called to match the Linux kernel implementation + * of zlib. The full zlib license follows: + * + * zlib.h -- interface of the 'zlib' general purpose compression library + * version 1.2.5, April 19th, 2010 + * + * Copyright (C) 1995-2010 Jean-loup Gailly and Mark Adler + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + * + * Jean-loup Gailly + * Mark Adler + */ + + +#include <sys/kmem.h> +#include <sys/kmem_cache.h> +#include <sys/zmod.h> + +static spl_kmem_cache_t *zlib_workspace_cache; + +/* + * A kmem_cache is used for the zlib workspaces to avoid having to vmalloc + * and vfree for every call. Using a kmem_cache also has the advantage + * that it improves the odds that the memory used will be local to this cpu. + * To further improve things it might be wise to create a dedicated per-cpu + * workspace for use. This would take some additional care because we then + * must disable preemption around the critical section, and verify that + * zlib_deflate* and zlib_inflate* never internally call schedule(). + */ +static void * +zlib_workspace_alloc(int flags) +{ + return (kmem_cache_alloc(zlib_workspace_cache, flags & ~(__GFP_FS))); +} + +static void +zlib_workspace_free(void *workspace) +{ + kmem_cache_free(zlib_workspace_cache, workspace); +} + +/* + * Compresses the source buffer into the destination buffer. The level + * parameter has the same meaning as in deflateInit. sourceLen is the byte + * length of the source buffer. Upon entry, destLen is the total size of the + * destination buffer, which must be at least 0.1% larger than sourceLen plus + * 12 bytes. Upon exit, destLen is the actual size of the compressed buffer. + * + * compress2 returns Z_OK if success, Z_MEM_ERROR if there was not enough + * memory, Z_BUF_ERROR if there was not enough room in the output buffer, + * Z_STREAM_ERROR if the level parameter is invalid. + */ +int +z_compress_level(void *dest, size_t *destLen, const void *source, + size_t sourceLen, int level) +{ + z_stream stream; + int err; + + stream.next_in = (Byte *)source; + stream.avail_in = (uInt)sourceLen; + stream.next_out = dest; + stream.avail_out = (uInt)*destLen; + + if ((size_t)stream.avail_out != *destLen) + return (Z_BUF_ERROR); + + stream.workspace = zlib_workspace_alloc(KM_SLEEP); + if (!stream.workspace) + return (Z_MEM_ERROR); + + err = zlib_deflateInit(&stream, level); + if (err != Z_OK) { + zlib_workspace_free(stream.workspace); + return (err); + } + + err = zlib_deflate(&stream, Z_FINISH); + if (err != Z_STREAM_END) { + zlib_deflateEnd(&stream); + zlib_workspace_free(stream.workspace); + return (err == Z_OK ? 
Z_BUF_ERROR : err); + } + *destLen = stream.total_out; + + err = zlib_deflateEnd(&stream); + zlib_workspace_free(stream.workspace); + + return (err); +} +EXPORT_SYMBOL(z_compress_level); + +/* + * Decompresses the source buffer into the destination buffer. sourceLen is + * the byte length of the source buffer. Upon entry, destLen is the total + * size of the destination buffer, which must be large enough to hold the + * entire uncompressed data. (The size of the uncompressed data must have + * been saved previously by the compressor and transmitted to the decompressor + * by some mechanism outside the scope of this compression library.) + * Upon exit, destLen is the actual size of the uncompressed buffer. + * This function can be used to decompress a whole file at once if the + * input file is mmap'ed. + * + * uncompress returns Z_OK if success, Z_MEM_ERROR if there was not + * enough memory, Z_BUF_ERROR if there was not enough room in the output + * buffer, or Z_DATA_ERROR if the input data was corrupted. + */ +int +z_uncompress(void *dest, size_t *destLen, const void *source, size_t sourceLen) +{ + z_stream stream; + int err; + + stream.next_in = (Byte *)source; + stream.avail_in = (uInt)sourceLen; + stream.next_out = dest; + stream.avail_out = (uInt)*destLen; + + if ((size_t)stream.avail_out != *destLen) + return (Z_BUF_ERROR); + + stream.workspace = zlib_workspace_alloc(KM_SLEEP); + if (!stream.workspace) + return (Z_MEM_ERROR); + + err = zlib_inflateInit(&stream); + if (err != Z_OK) { + zlib_workspace_free(stream.workspace); + return (err); + } + + err = zlib_inflate(&stream, Z_FINISH); + if (err != Z_STREAM_END) { + zlib_inflateEnd(&stream); + zlib_workspace_free(stream.workspace); + + if (err == Z_NEED_DICT || + (err == Z_BUF_ERROR && stream.avail_in == 0)) + return (Z_DATA_ERROR); + + return (err); + } + *destLen = stream.total_out; + + err = zlib_inflateEnd(&stream); + zlib_workspace_free(stream.workspace); + + return (err); +} +EXPORT_SYMBOL(z_uncompress); + +int +spl_zlib_init(void) +{ + int size; + + size = MAX(spl_zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL), + zlib_inflate_workspacesize()); + + zlib_workspace_cache = kmem_cache_create( + "spl_zlib_workspace_cache", + size, 0, NULL, NULL, NULL, NULL, NULL, + KMC_VMEM | KMC_NOEMERGENCY); + if (!zlib_workspace_cache) + return (1); + + return (0); +} + +void +spl_zlib_fini(void) +{ + kmem_cache_destroy(zlib_workspace_cache); + zlib_workspace_cache = NULL; +}
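For illustration only, and not part of the patch above: a minimal round-trip sketch showing how the two exported functions, z_compress_level() and z_uncompress(), are intended to be used from other kernel code. The helper name example_zlib_roundtrip and the fixed compression level are assumptions made for this example; the sketch presumes spl_zlib_init() has already run from the module load path and keeps error handling to a minimum.

/*
 * Hypothetical round-trip sketch (not part of the patch): compress a
 * source buffer, inflate it back, and verify the result matches.
 * Assumes spl_zlib_init() has already been called.
 */
#include <linux/string.h>
#include <sys/kmem.h>
#include <sys/zmod.h>

static int
example_zlib_roundtrip(const void *src, size_t srclen)
{
	/* Worst-case output bound taken from the z_compress_level() comment. */
	size_t cbufsz = srclen + srclen / 1000 + 12;
	size_t clen = cbufsz;
	size_t dlen = srclen;
	void *cbuf, *dbuf;
	int rc = -1;

	cbuf = kmem_alloc(cbufsz, KM_SLEEP);
	dbuf = kmem_alloc(srclen, KM_SLEEP);

	/* Level 6 is an arbitrary mid-range zlib compression level. */
	if (z_compress_level(cbuf, &clen, src, srclen, 6) != Z_OK)
		goto out;

	/* clen now holds the size of the compressed data. */
	if (z_uncompress(dbuf, &dlen, cbuf, clen) != Z_OK)
		goto out;

	/* dlen holds the decompressed size; it must match the original. */
	if (dlen == srclen && memcmp(dbuf, src, srclen) == 0)
		rc = 0;
out:
	/* kmem_free() takes the originally allocated sizes, not clen/dlen. */
	kmem_free(dbuf, srclen);
	kmem_free(cbuf, cbufsz);
	return (rc);
}

Keeping cbufsz and clen separate matters here because z_compress_level() overwrites clen with the compressed size, while the SPL kmem_free() interface needs the size originally passed to kmem_alloc().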