Diffstat (limited to 'module/spl')
-rw-r--r-- | module/spl/Makefile.in | 20
-rw-r--r-- | module/spl/README.md | 16
-rw-r--r-- | module/spl/THIRDPARTYLICENSE.gplv2 | 339
-rw-r--r-- | module/spl/THIRDPARTYLICENSE.gplv2.descrip | 1
-rw-r--r-- | module/spl/spl-atomic.c | 36
-rw-r--r-- | module/spl/spl-condvar.c | 461
-rw-r--r-- | module/spl/spl-cred.c | 200
-rw-r--r-- | module/spl/spl-err.c | 124
-rw-r--r-- | module/spl/spl-generic.c | 757
-rw-r--r-- | module/spl/spl-kmem-cache.c | 1780
-rw-r--r-- | module/spl/spl-kmem.c | 556
-rw-r--r-- | module/spl/spl-kobj.c | 86
-rw-r--r-- | module/spl/spl-kstat.c | 770
-rw-r--r-- | module/spl/spl-proc.c | 782
-rw-r--r-- | module/spl/spl-procfs-list.c | 257
-rw-r--r-- | module/spl/spl-taskq.c | 1292
-rw-r--r-- | module/spl/spl-thread.c | 163
-rw-r--r-- | module/spl/spl-tsd.c | 720
-rw-r--r-- | module/spl/spl-vmem.c | 135
-rw-r--r-- | module/spl/spl-vnode.c | 719
-rw-r--r-- | module/spl/spl-xdr.c | 513
-rw-r--r-- | module/spl/spl-zlib.c | 217 |
22 files changed, 2 insertions(+), 9942 deletions(-)
diff --git a/module/spl/Makefile.in b/module/spl/Makefile.in index e16666aa9..8602f4edd 100644 --- a/module/spl/Makefile.in +++ b/module/spl/Makefile.in @@ -7,21 +7,5 @@ obj-$(CONFIG_ZFS) := $(MODULE).o ccflags-y := $(ZFS_MODULE_CFLAGS) $(ZFS_MODULE_CPPFLAGS) -$(MODULE)-objs += spl-atomic.o -$(MODULE)-objs += spl-condvar.o -$(MODULE)-objs += spl-cred.o -$(MODULE)-objs += spl-err.o -$(MODULE)-objs += spl-generic.o -$(MODULE)-objs += spl-kmem.o -$(MODULE)-objs += spl-kmem-cache.o -$(MODULE)-objs += spl-kobj.o -$(MODULE)-objs += spl-kstat.o -$(MODULE)-objs += spl-proc.o -$(MODULE)-objs += spl-procfs-list.o -$(MODULE)-objs += spl-taskq.o -$(MODULE)-objs += spl-thread.o -$(MODULE)-objs += spl-tsd.o -$(MODULE)-objs += spl-vmem.o -$(MODULE)-objs += spl-vnode.o -$(MODULE)-objs += spl-xdr.o -$(MODULE)-objs += spl-zlib.o + +-include @abs_top_builddir@/module/os/linux/spl/Makefile diff --git a/module/spl/README.md b/module/spl/README.md deleted file mode 100644 index 57f635aed..000000000 --- a/module/spl/README.md +++ /dev/null @@ -1,16 +0,0 @@ -The Solaris Porting Layer, SPL, is a Linux kernel module which provides a -compatibility layer used by the [ZFS on Linux](http://zfsonlinux.org) project. - -# Installation - -The latest version of the SPL is maintained as part of this repository. -Only when building ZFS version 0.7.x or earlier must an external SPL release -be used. These releases can be found at: - - * Version 0.7.x: https://github.com/zfsonlinux/spl/tree/spl-0.7-release - * Version 0.6.5.x: https://github.com/zfsonlinux/spl/tree/spl-0.6.5-release - -# Release - -The SPL is released under a GPLv2 license. -For more details see the NOTICE and THIRDPARTYLICENSE files; `UCRL-CODE-235197` diff --git a/module/spl/THIRDPARTYLICENSE.gplv2 b/module/spl/THIRDPARTYLICENSE.gplv2 deleted file mode 100644 index d159169d1..000000000 --- a/module/spl/THIRDPARTYLICENSE.gplv2 +++ /dev/null @@ -1,339 +0,0 @@ - GNU GENERAL PUBLIC LICENSE - Version 2, June 1991 - - Copyright (C) 1989, 1991 Free Software Foundation, Inc., - 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - Everyone is permitted to copy and distribute verbatim copies - of this license document, but changing it is not allowed. - - Preamble - - The licenses for most software are designed to take away your -freedom to share and change it. By contrast, the GNU General Public -License is intended to guarantee your freedom to share and change free -software--to make sure the software is free for all its users. This -General Public License applies to most of the Free Software -Foundation's software and to any other program whose authors commit to -using it. (Some other Free Software Foundation software is covered by -the GNU Lesser General Public License instead.) You can apply it to -your programs, too. - - When we speak of free software, we are referring to freedom, not -price. Our General Public Licenses are designed to make sure that you -have the freedom to distribute copies of free software (and charge for -this service if you wish), that you receive source code or can get it -if you want it, that you can change the software or use pieces of it -in new free programs; and that you know you can do these things. - - To protect your rights, we need to make restrictions that forbid -anyone to deny you these rights or to ask you to surrender the rights. -These restrictions translate to certain responsibilities for you if you -distribute copies of the software, or if you modify it. 
- - For example, if you distribute copies of such a program, whether -gratis or for a fee, you must give the recipients all the rights that -you have. You must make sure that they, too, receive or can get the -source code. And you must show them these terms so they know their -rights. - - We protect your rights with two steps: (1) copyright the software, and -(2) offer you this license which gives you legal permission to copy, -distribute and/or modify the software. - - Also, for each author's protection and ours, we want to make certain -that everyone understands that there is no warranty for this free -software. If the software is modified by someone else and passed on, we -want its recipients to know that what they have is not the original, so -that any problems introduced by others will not reflect on the original -authors' reputations. - - Finally, any free program is threatened constantly by software -patents. We wish to avoid the danger that redistributors of a free -program will individually obtain patent licenses, in effect making the -program proprietary. To prevent this, we have made it clear that any -patent must be licensed for everyone's free use or not licensed at all. - - The precise terms and conditions for copying, distribution and -modification follow. - - GNU GENERAL PUBLIC LICENSE - TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION - - 0. This License applies to any program or other work which contains -a notice placed by the copyright holder saying it may be distributed -under the terms of this General Public License. The "Program", below, -refers to any such program or work, and a "work based on the Program" -means either the Program or any derivative work under copyright law: -that is to say, a work containing the Program or a portion of it, -either verbatim or with modifications and/or translated into another -language. (Hereinafter, translation is included without limitation in -the term "modification".) Each licensee is addressed as "you". - -Activities other than copying, distribution and modification are not -covered by this License; they are outside its scope. The act of -running the Program is not restricted, and the output from the Program -is covered only if its contents constitute a work based on the -Program (independent of having been made by running the Program). -Whether that is true depends on what the Program does. - - 1. You may copy and distribute verbatim copies of the Program's -source code as you receive it, in any medium, provided that you -conspicuously and appropriately publish on each copy an appropriate -copyright notice and disclaimer of warranty; keep intact all the -notices that refer to this License and to the absence of any warranty; -and give any other recipients of the Program a copy of this License -along with the Program. - -You may charge a fee for the physical act of transferring a copy, and -you may at your option offer warranty protection in exchange for a fee. - - 2. You may modify your copy or copies of the Program or any portion -of it, thus forming a work based on the Program, and copy and -distribute such modifications or work under the terms of Section 1 -above, provided that you also meet all of these conditions: - - a) You must cause the modified files to carry prominent notices - stating that you changed the files and the date of any change. 
- - b) You must cause any work that you distribute or publish, that in - whole or in part contains or is derived from the Program or any - part thereof, to be licensed as a whole at no charge to all third - parties under the terms of this License. - - c) If the modified program normally reads commands interactively - when run, you must cause it, when started running for such - interactive use in the most ordinary way, to print or display an - announcement including an appropriate copyright notice and a - notice that there is no warranty (or else, saying that you provide - a warranty) and that users may redistribute the program under - these conditions, and telling the user how to view a copy of this - License. (Exception: if the Program itself is interactive but - does not normally print such an announcement, your work based on - the Program is not required to print an announcement.) - -These requirements apply to the modified work as a whole. If -identifiable sections of that work are not derived from the Program, -and can be reasonably considered independent and separate works in -themselves, then this License, and its terms, do not apply to those -sections when you distribute them as separate works. But when you -distribute the same sections as part of a whole which is a work based -on the Program, the distribution of the whole must be on the terms of -this License, whose permissions for other licensees extend to the -entire whole, and thus to each and every part regardless of who wrote it. - -Thus, it is not the intent of this section to claim rights or contest -your rights to work written entirely by you; rather, the intent is to -exercise the right to control the distribution of derivative or -collective works based on the Program. - -In addition, mere aggregation of another work not based on the Program -with the Program (or with a work based on the Program) on a volume of -a storage or distribution medium does not bring the other work under -the scope of this License. - - 3. You may copy and distribute the Program (or a work based on it, -under Section 2) in object code or executable form under the terms of -Sections 1 and 2 above provided that you also do one of the following: - - a) Accompany it with the complete corresponding machine-readable - source code, which must be distributed under the terms of Sections - 1 and 2 above on a medium customarily used for software interchange; or, - - b) Accompany it with a written offer, valid for at least three - years, to give any third party, for a charge no more than your - cost of physically performing source distribution, a complete - machine-readable copy of the corresponding source code, to be - distributed under the terms of Sections 1 and 2 above on a medium - customarily used for software interchange; or, - - c) Accompany it with the information you received as to the offer - to distribute corresponding source code. (This alternative is - allowed only for noncommercial distribution and only if you - received the program in object code or executable form with such - an offer, in accord with Subsection b above.) - -The source code for a work means the preferred form of the work for -making modifications to it. For an executable work, complete source -code means all the source code for all modules it contains, plus any -associated interface definition files, plus the scripts used to -control compilation and installation of the executable. 
However, as a -special exception, the source code distributed need not include -anything that is normally distributed (in either source or binary -form) with the major components (compiler, kernel, and so on) of the -operating system on which the executable runs, unless that component -itself accompanies the executable. - -If distribution of executable or object code is made by offering -access to copy from a designated place, then offering equivalent -access to copy the source code from the same place counts as -distribution of the source code, even though third parties are not -compelled to copy the source along with the object code. - - 4. You may not copy, modify, sublicense, or distribute the Program -except as expressly provided under this License. Any attempt -otherwise to copy, modify, sublicense or distribute the Program is -void, and will automatically terminate your rights under this License. -However, parties who have received copies, or rights, from you under -this License will not have their licenses terminated so long as such -parties remain in full compliance. - - 5. You are not required to accept this License, since you have not -signed it. However, nothing else grants you permission to modify or -distribute the Program or its derivative works. These actions are -prohibited by law if you do not accept this License. Therefore, by -modifying or distributing the Program (or any work based on the -Program), you indicate your acceptance of this License to do so, and -all its terms and conditions for copying, distributing or modifying -the Program or works based on it. - - 6. Each time you redistribute the Program (or any work based on the -Program), the recipient automatically receives a license from the -original licensor to copy, distribute or modify the Program subject to -these terms and conditions. You may not impose any further -restrictions on the recipients' exercise of the rights granted herein. -You are not responsible for enforcing compliance by third parties to -this License. - - 7. If, as a consequence of a court judgment or allegation of patent -infringement or for any other reason (not limited to patent issues), -conditions are imposed on you (whether by court order, agreement or -otherwise) that contradict the conditions of this License, they do not -excuse you from the conditions of this License. If you cannot -distribute so as to satisfy simultaneously your obligations under this -License and any other pertinent obligations, then as a consequence you -may not distribute the Program at all. For example, if a patent -license would not permit royalty-free redistribution of the Program by -all those who receive copies directly or indirectly through you, then -the only way you could satisfy both it and this License would be to -refrain entirely from distribution of the Program. - -If any portion of this section is held invalid or unenforceable under -any particular circumstance, the balance of the section is intended to -apply and the section as a whole is intended to apply in other -circumstances. - -It is not the purpose of this section to induce you to infringe any -patents or other property right claims or to contest validity of any -such claims; this section has the sole purpose of protecting the -integrity of the free software distribution system, which is -implemented by public license practices. 
Many people have made -generous contributions to the wide range of software distributed -through that system in reliance on consistent application of that -system; it is up to the author/donor to decide if he or she is willing -to distribute software through any other system and a licensee cannot -impose that choice. - -This section is intended to make thoroughly clear what is believed to -be a consequence of the rest of this License. - - 8. If the distribution and/or use of the Program is restricted in -certain countries either by patents or by copyrighted interfaces, the -original copyright holder who places the Program under this License -may add an explicit geographical distribution limitation excluding -those countries, so that distribution is permitted only in or among -countries not thus excluded. In such case, this License incorporates -the limitation as if written in the body of this License. - - 9. The Free Software Foundation may publish revised and/or new versions -of the General Public License from time to time. Such new versions will -be similar in spirit to the present version, but may differ in detail to -address new problems or concerns. - -Each version is given a distinguishing version number. If the Program -specifies a version number of this License which applies to it and "any -later version", you have the option of following the terms and conditions -either of that version or of any later version published by the Free -Software Foundation. If the Program does not specify a version number of -this License, you may choose any version ever published by the Free Software -Foundation. - - 10. If you wish to incorporate parts of the Program into other free -programs whose distribution conditions are different, write to the author -to ask for permission. For software which is copyrighted by the Free -Software Foundation, write to the Free Software Foundation; we sometimes -make exceptions for this. Our decision will be guided by the two goals -of preserving the free status of all derivatives of our free software and -of promoting the sharing and reuse of software generally. - - NO WARRANTY - - 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY -FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN -OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES -PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED -OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF -MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS -TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE -PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, -REPAIR OR CORRECTION. - - 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING -WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR -REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, -INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING -OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED -TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY -YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER -PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE -POSSIBILITY OF SUCH DAMAGES. 
- - END OF TERMS AND CONDITIONS - - How to Apply These Terms to Your New Programs - - If you develop a new program, and you want it to be of the greatest -possible use to the public, the best way to achieve this is to make it -free software which everyone can redistribute and change under these terms. - - To do so, attach the following notices to the program. It is safest -to attach them to the start of each source file to most effectively -convey the exclusion of warranty; and each file should have at least -the "copyright" line and a pointer to where the full notice is found. - - <one line to give the program's name and a brief idea of what it does.> - Copyright (C) <year> <name of author> - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License along - with this program; if not, write to the Free Software Foundation, Inc., - 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. - -Also add information on how to contact you by electronic and paper mail. - -If the program is interactive, make it output a short notice like this -when it starts in an interactive mode: - - Gnomovision version 69, Copyright (C) year name of author - Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. - This is free software, and you are welcome to redistribute it - under certain conditions; type `show c' for details. - -The hypothetical commands `show w' and `show c' should show the appropriate -parts of the General Public License. Of course, the commands you use may -be called something other than `show w' and `show c'; they could even be -mouse-clicks or menu items--whatever suits your program. - -You should also get your employer (if you work as a programmer) or your -school, if any, to sign a "copyright disclaimer" for the program, if -necessary. Here is a sample; alter the names: - - Yoyodyne, Inc., hereby disclaims all copyright interest in the program - `Gnomovision' (which makes passes at compilers) written by James Hacker. - - <signature of Ty Coon>, 1 April 1989 - Ty Coon, President of Vice - -This General Public License does not permit incorporating your program into -proprietary programs. If your program is a subroutine library, you may -consider it more useful to permit linking proprietary applications with the -library. If this is what you want to do, use the GNU Lesser General -Public License instead of this License. diff --git a/module/spl/THIRDPARTYLICENSE.gplv2.descrip b/module/spl/THIRDPARTYLICENSE.gplv2.descrip deleted file mode 100644 index 78535a8ee..000000000 --- a/module/spl/THIRDPARTYLICENSE.gplv2.descrip +++ /dev/null @@ -1 +0,0 @@ -COMPATIBILITY LAYER FOR OPENZFS ON LINUX diff --git a/module/spl/spl-atomic.c b/module/spl/spl-atomic.c deleted file mode 100644 index 47ed1886e..000000000 --- a/module/spl/spl-atomic.c +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. - * Copyright (C) 2007 The Regents of the University of California. - * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). 
- * Written by Brian Behlendorf <[email protected]>. - * UCRL-CODE-235197 - * - * This file is part of the SPL, Solaris Porting Layer. - * For details, see <http://zfsonlinux.org/>. - * - * The SPL is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the - * Free Software Foundation; either version 2 of the License, or (at your - * option) any later version. - * - * The SPL is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - * for more details. - * - * You should have received a copy of the GNU General Public License along - * with the SPL. If not, see <http://www.gnu.org/licenses/>. - * - * Solaris Porting Layer (SPL) Atomic Implementation. - */ - -#include <sys/atomic.h> - -#ifdef ATOMIC_SPINLOCK -/* Global atomic lock declarations */ -DEFINE_SPINLOCK(atomic32_lock); -DEFINE_SPINLOCK(atomic64_lock); - -EXPORT_SYMBOL(atomic32_lock); -EXPORT_SYMBOL(atomic64_lock); -#endif /* ATOMIC_SPINLOCK */ diff --git a/module/spl/spl-condvar.c b/module/spl/spl-condvar.c deleted file mode 100644 index 3cc33da62..000000000 --- a/module/spl/spl-condvar.c +++ /dev/null @@ -1,461 +0,0 @@ -/* - * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. - * Copyright (C) 2007 The Regents of the University of California. - * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). - * Written by Brian Behlendorf <[email protected]>. - * UCRL-CODE-235197 - * - * This file is part of the SPL, Solaris Porting Layer. - * For details, see <http://zfsonlinux.org/>. - * - * The SPL is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the - * Free Software Foundation; either version 2 of the License, or (at your - * option) any later version. - * - * The SPL is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - * for more details. - * - * You should have received a copy of the GNU General Public License along - * with the SPL. If not, see <http://www.gnu.org/licenses/>. - * - * Solaris Porting Layer (SPL) Credential Implementation. 
- */ - -#include <sys/condvar.h> -#include <sys/time.h> -#include <sys/sysmacros.h> -#include <linux/hrtimer.h> -#include <linux/compiler_compat.h> -#include <linux/mod_compat.h> - -#include <linux/sched.h> - -#ifdef HAVE_SCHED_SIGNAL_HEADER -#include <linux/sched/signal.h> -#endif - -#define MAX_HRTIMEOUT_SLACK_US 1000 -unsigned int spl_schedule_hrtimeout_slack_us = 0; - -static int -param_set_hrtimeout_slack(const char *buf, zfs_kernel_param_t *kp) -{ - unsigned long val; - int error; - - error = kstrtoul(buf, 0, &val); - if (error) - return (error); - - if (val > MAX_HRTIMEOUT_SLACK_US) - return (-EINVAL); - - error = param_set_uint(buf, kp); - if (error < 0) - return (error); - - return (0); -} - -module_param_call(spl_schedule_hrtimeout_slack_us, param_set_hrtimeout_slack, - param_get_uint, &spl_schedule_hrtimeout_slack_us, 0644); -MODULE_PARM_DESC(spl_schedule_hrtimeout_slack_us, - "schedule_hrtimeout_range() delta/slack value in us, default(0)"); - -void -__cv_init(kcondvar_t *cvp, char *name, kcv_type_t type, void *arg) -{ - ASSERT(cvp); - ASSERT(name == NULL); - ASSERT(type == CV_DEFAULT); - ASSERT(arg == NULL); - - cvp->cv_magic = CV_MAGIC; - init_waitqueue_head(&cvp->cv_event); - init_waitqueue_head(&cvp->cv_destroy); - atomic_set(&cvp->cv_waiters, 0); - atomic_set(&cvp->cv_refs, 1); - cvp->cv_mutex = NULL; -} -EXPORT_SYMBOL(__cv_init); - -static int -cv_destroy_wakeup(kcondvar_t *cvp) -{ - if (!atomic_read(&cvp->cv_waiters) && !atomic_read(&cvp->cv_refs)) { - ASSERT(cvp->cv_mutex == NULL); - ASSERT(!waitqueue_active(&cvp->cv_event)); - return (1); - } - - return (0); -} - -void -__cv_destroy(kcondvar_t *cvp) -{ - ASSERT(cvp); - ASSERT(cvp->cv_magic == CV_MAGIC); - - cvp->cv_magic = CV_DESTROY; - atomic_dec(&cvp->cv_refs); - - /* Block until all waiters are woken and references dropped. */ - while (cv_destroy_wakeup(cvp) == 0) - wait_event_timeout(cvp->cv_destroy, cv_destroy_wakeup(cvp), 1); - - ASSERT3P(cvp->cv_mutex, ==, NULL); - ASSERT3S(atomic_read(&cvp->cv_refs), ==, 0); - ASSERT3S(atomic_read(&cvp->cv_waiters), ==, 0); - ASSERT3S(waitqueue_active(&cvp->cv_event), ==, 0); -} -EXPORT_SYMBOL(__cv_destroy); - -static void -cv_wait_common(kcondvar_t *cvp, kmutex_t *mp, int state, int io) -{ - DEFINE_WAIT(wait); - kmutex_t *m; - - ASSERT(cvp); - ASSERT(mp); - ASSERT(cvp->cv_magic == CV_MAGIC); - ASSERT(mutex_owned(mp)); - atomic_inc(&cvp->cv_refs); - - m = READ_ONCE(cvp->cv_mutex); - if (!m) - m = xchg(&cvp->cv_mutex, mp); - /* Ensure the same mutex is used by all callers */ - ASSERT(m == NULL || m == mp); - - prepare_to_wait_exclusive(&cvp->cv_event, &wait, state); - atomic_inc(&cvp->cv_waiters); - - /* - * Mutex should be dropped after prepare_to_wait() this - * ensures we're linked in to the waiters list and avoids the - * race where 'cvp->cv_waiters > 0' but the list is empty. - */ - mutex_exit(mp); - if (io) - io_schedule(); - else - schedule(); - - /* No more waiters a different mutex could be used */ - if (atomic_dec_and_test(&cvp->cv_waiters)) { - /* - * This is set without any lock, so it's racy. But this is - * just for debug anyway, so make it best-effort - */ - cvp->cv_mutex = NULL; - wake_up(&cvp->cv_destroy); - } - - finish_wait(&cvp->cv_event, &wait); - atomic_dec(&cvp->cv_refs); - - /* - * Hold mutex after we release the cvp, otherwise we could dead lock - * with a thread holding the mutex and call cv_destroy. 
- */ - mutex_enter(mp); -} - -void -__cv_wait(kcondvar_t *cvp, kmutex_t *mp) -{ - cv_wait_common(cvp, mp, TASK_UNINTERRUPTIBLE, 0); -} -EXPORT_SYMBOL(__cv_wait); - -void -__cv_wait_io(kcondvar_t *cvp, kmutex_t *mp) -{ - cv_wait_common(cvp, mp, TASK_UNINTERRUPTIBLE, 1); -} -EXPORT_SYMBOL(__cv_wait_io); - -int -__cv_wait_io_sig(kcondvar_t *cvp, kmutex_t *mp) -{ - cv_wait_common(cvp, mp, TASK_INTERRUPTIBLE, 1); - - return (signal_pending(current) ? 0 : 1); -} -EXPORT_SYMBOL(__cv_wait_io_sig); - -int -__cv_wait_sig(kcondvar_t *cvp, kmutex_t *mp) -{ - cv_wait_common(cvp, mp, TASK_INTERRUPTIBLE, 0); - - return (signal_pending(current) ? 0 : 1); -} -EXPORT_SYMBOL(__cv_wait_sig); - -#if defined(HAVE_IO_SCHEDULE_TIMEOUT) -#define spl_io_schedule_timeout(t) io_schedule_timeout(t) -#else - -struct spl_task_timer { - struct timer_list timer; - struct task_struct *task; -}; - -static void -__cv_wakeup(spl_timer_list_t t) -{ - struct timer_list *tmr = (struct timer_list *)t; - struct spl_task_timer *task_timer = from_timer(task_timer, tmr, timer); - - wake_up_process(task_timer->task); -} - -static long -spl_io_schedule_timeout(long time_left) -{ - long expire_time = jiffies + time_left; - struct spl_task_timer task_timer; - struct timer_list *timer = &task_timer.timer; - - task_timer.task = current; - - timer_setup(timer, __cv_wakeup, 0); - - timer->expires = expire_time; - add_timer(timer); - - io_schedule(); - - del_timer_sync(timer); - - time_left = expire_time - jiffies; - - return (time_left < 0 ? 0 : time_left); -} -#endif - -/* - * 'expire_time' argument is an absolute wall clock time in jiffies. - * Return value is time left (expire_time - now) or -1 if timeout occurred. - */ -static clock_t -__cv_timedwait_common(kcondvar_t *cvp, kmutex_t *mp, clock_t expire_time, - int state, int io) -{ - DEFINE_WAIT(wait); - kmutex_t *m; - clock_t time_left; - - ASSERT(cvp); - ASSERT(mp); - ASSERT(cvp->cv_magic == CV_MAGIC); - ASSERT(mutex_owned(mp)); - - /* XXX - Does not handle jiffie wrap properly */ - time_left = expire_time - jiffies; - if (time_left <= 0) - return (-1); - - atomic_inc(&cvp->cv_refs); - m = READ_ONCE(cvp->cv_mutex); - if (!m) - m = xchg(&cvp->cv_mutex, mp); - /* Ensure the same mutex is used by all callers */ - ASSERT(m == NULL || m == mp); - - prepare_to_wait_exclusive(&cvp->cv_event, &wait, state); - atomic_inc(&cvp->cv_waiters); - - /* - * Mutex should be dropped after prepare_to_wait() this - * ensures we're linked in to the waiters list and avoids the - * race where 'cvp->cv_waiters > 0' but the list is empty. - */ - mutex_exit(mp); - if (io) - time_left = spl_io_schedule_timeout(time_left); - else - time_left = schedule_timeout(time_left); - - /* No more waiters a different mutex could be used */ - if (atomic_dec_and_test(&cvp->cv_waiters)) { - /* - * This is set without any lock, so it's racy. But this is - * just for debug anyway, so make it best-effort - */ - cvp->cv_mutex = NULL; - wake_up(&cvp->cv_destroy); - } - - finish_wait(&cvp->cv_event, &wait); - atomic_dec(&cvp->cv_refs); - - /* - * Hold mutex after we release the cvp, otherwise we could dead lock - * with a thread holding the mutex and call cv_destroy. - */ - mutex_enter(mp); - return (time_left > 0 ? 
time_left : -1); -} - -clock_t -__cv_timedwait(kcondvar_t *cvp, kmutex_t *mp, clock_t exp_time) -{ - return (__cv_timedwait_common(cvp, mp, exp_time, - TASK_UNINTERRUPTIBLE, 0)); -} -EXPORT_SYMBOL(__cv_timedwait); - -clock_t -__cv_timedwait_io(kcondvar_t *cvp, kmutex_t *mp, clock_t exp_time) -{ - return (__cv_timedwait_common(cvp, mp, exp_time, - TASK_UNINTERRUPTIBLE, 1)); -} -EXPORT_SYMBOL(__cv_timedwait_io); - -clock_t -__cv_timedwait_sig(kcondvar_t *cvp, kmutex_t *mp, clock_t exp_time) -{ - return (__cv_timedwait_common(cvp, mp, exp_time, - TASK_INTERRUPTIBLE, 0)); -} -EXPORT_SYMBOL(__cv_timedwait_sig); - -/* - * 'expire_time' argument is an absolute clock time in nanoseconds. - * Return value is time left (expire_time - now) or -1 if timeout occurred. - */ -static clock_t -__cv_timedwait_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t expire_time, - hrtime_t res, int state) -{ - DEFINE_WAIT(wait); - kmutex_t *m; - hrtime_t time_left; - ktime_t ktime_left; - u64 slack = 0; - - ASSERT(cvp); - ASSERT(mp); - ASSERT(cvp->cv_magic == CV_MAGIC); - ASSERT(mutex_owned(mp)); - - time_left = expire_time - gethrtime(); - if (time_left <= 0) - return (-1); - - atomic_inc(&cvp->cv_refs); - m = READ_ONCE(cvp->cv_mutex); - if (!m) - m = xchg(&cvp->cv_mutex, mp); - /* Ensure the same mutex is used by all callers */ - ASSERT(m == NULL || m == mp); - - prepare_to_wait_exclusive(&cvp->cv_event, &wait, state); - atomic_inc(&cvp->cv_waiters); - - /* - * Mutex should be dropped after prepare_to_wait() this - * ensures we're linked in to the waiters list and avoids the - * race where 'cvp->cv_waiters > 0' but the list is empty. - */ - mutex_exit(mp); - - ktime_left = ktime_set(0, time_left); - slack = MIN(MAX(res, spl_schedule_hrtimeout_slack_us * NSEC_PER_USEC), - MAX_HRTIMEOUT_SLACK_US * NSEC_PER_USEC); - schedule_hrtimeout_range(&ktime_left, slack, HRTIMER_MODE_REL); - - /* No more waiters a different mutex could be used */ - if (atomic_dec_and_test(&cvp->cv_waiters)) { - /* - * This is set without any lock, so it's racy. But this is - * just for debug anyway, so make it best-effort - */ - cvp->cv_mutex = NULL; - wake_up(&cvp->cv_destroy); - } - - finish_wait(&cvp->cv_event, &wait); - atomic_dec(&cvp->cv_refs); - - mutex_enter(mp); - time_left = expire_time - gethrtime(); - return (time_left > 0 ? NSEC_TO_TICK(time_left) : -1); -} - -/* - * Compatibility wrapper for the cv_timedwait_hires() Illumos interface. - */ -static clock_t -cv_timedwait_hires_common(kcondvar_t *cvp, kmutex_t *mp, hrtime_t tim, - hrtime_t res, int flag, int state) -{ - if (!(flag & CALLOUT_FLAG_ABSOLUTE)) - tim += gethrtime(); - - return (__cv_timedwait_hires(cvp, mp, tim, res, state)); -} - -clock_t -cv_timedwait_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t tim, hrtime_t res, - int flag) -{ - return (cv_timedwait_hires_common(cvp, mp, tim, res, flag, - TASK_UNINTERRUPTIBLE)); -} -EXPORT_SYMBOL(cv_timedwait_hires); - -clock_t -cv_timedwait_sig_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t tim, - hrtime_t res, int flag) -{ - return (cv_timedwait_hires_common(cvp, mp, tim, res, flag, - TASK_INTERRUPTIBLE)); -} -EXPORT_SYMBOL(cv_timedwait_sig_hires); - -void -__cv_signal(kcondvar_t *cvp) -{ - ASSERT(cvp); - ASSERT(cvp->cv_magic == CV_MAGIC); - atomic_inc(&cvp->cv_refs); - - /* - * All waiters are added with WQ_FLAG_EXCLUSIVE so only one - * waiter will be set runnable with each call to wake_up(). - * Additionally wake_up() holds a spin_lock associated with - * the wait queue to ensure we don't race waking up processes. 
- */ - if (atomic_read(&cvp->cv_waiters) > 0) - wake_up(&cvp->cv_event); - - atomic_dec(&cvp->cv_refs); -} -EXPORT_SYMBOL(__cv_signal); - -void -__cv_broadcast(kcondvar_t *cvp) -{ - ASSERT(cvp); - ASSERT(cvp->cv_magic == CV_MAGIC); - atomic_inc(&cvp->cv_refs); - - /* - * Wake_up_all() will wake up all waiters even those which - * have the WQ_FLAG_EXCLUSIVE flag set. - */ - if (atomic_read(&cvp->cv_waiters) > 0) - wake_up_all(&cvp->cv_event); - - atomic_dec(&cvp->cv_refs); -} -EXPORT_SYMBOL(__cv_broadcast); diff --git a/module/spl/spl-cred.c b/module/spl/spl-cred.c deleted file mode 100644 index ea3e903f9..000000000 --- a/module/spl/spl-cred.c +++ /dev/null @@ -1,200 +0,0 @@ -/* - * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. - * Copyright (C) 2007 The Regents of the University of California. - * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). - * Written by Brian Behlendorf <[email protected]>. - * UCRL-CODE-235197 - * - * This file is part of the SPL, Solaris Porting Layer. - * For details, see <http://zfsonlinux.org/>. - * - * The SPL is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the - * Free Software Foundation; either version 2 of the License, or (at your - * option) any later version. - * - * The SPL is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - * for more details. - * - * You should have received a copy of the GNU General Public License along - * with the SPL. If not, see <http://www.gnu.org/licenses/>. - * - * Solaris Porting Layer (SPL) Credential Implementation. - */ - -#include <sys/cred.h> - -static int -#ifdef HAVE_KUIDGID_T -cr_groups_search(const struct group_info *group_info, kgid_t grp) -#else -cr_groups_search(const struct group_info *group_info, gid_t grp) -#endif -{ - unsigned int left, right, mid; - int cmp; - - if (!group_info) - return (0); - - left = 0; - right = group_info->ngroups; - while (left < right) { - mid = (left + right) / 2; - cmp = KGID_TO_SGID(grp) - - KGID_TO_SGID(GROUP_AT(group_info, mid)); - - if (cmp > 0) - left = mid + 1; - else if (cmp < 0) - right = mid; - else - return (1); - } - return (0); -} - -/* Hold a reference on the credential */ -void -crhold(cred_t *cr) -{ - (void) get_cred((const cred_t *)cr); -} - -/* Free a reference on the credential */ -void -crfree(cred_t *cr) -{ - put_cred((const cred_t *)cr); -} - -/* Return the number of supplemental groups */ -int -crgetngroups(const cred_t *cr) -{ - struct group_info *gi; - int rc; - - gi = cr->group_info; - rc = gi->ngroups; -#ifndef HAVE_GROUP_INFO_GID - /* - * For Linux <= 4.8, - * crgetgroups will only returns gi->blocks[0], which contains only - * the first NGROUPS_PER_BLOCK groups. - */ - if (rc > NGROUPS_PER_BLOCK) { - WARN_ON_ONCE(1); - rc = NGROUPS_PER_BLOCK; - } -#endif - return (rc); -} - -/* - * Return an array of supplemental gids. The returned address is safe - * to use as long as the caller has taken a reference with crhold(). - * - * Linux 4.9 API change, group_info changed from 2d array via ->blocks to 1d - * array via ->gid. 
- */ -gid_t * -crgetgroups(const cred_t *cr) -{ - struct group_info *gi; - gid_t *gids = NULL; - - gi = cr->group_info; -#ifdef HAVE_GROUP_INFO_GID - gids = KGIDP_TO_SGIDP(gi->gid); -#else - if (gi->nblocks > 0) - gids = KGIDP_TO_SGIDP(gi->blocks[0]); -#endif - return (gids); -} - -/* Check if the passed gid is available in supplied credential. */ -int -groupmember(gid_t gid, const cred_t *cr) -{ - struct group_info *gi; - int rc; - - gi = cr->group_info; - rc = cr_groups_search(gi, SGID_TO_KGID(gid)); - - return (rc); -} - -/* Return the effective user id */ -uid_t -crgetuid(const cred_t *cr) -{ - return (KUID_TO_SUID(cr->euid)); -} - -/* Return the real user id */ -uid_t -crgetruid(const cred_t *cr) -{ - return (KUID_TO_SUID(cr->uid)); -} - -/* Return the saved user id */ -uid_t -crgetsuid(const cred_t *cr) -{ - return (KUID_TO_SUID(cr->suid)); -} - -/* Return the filesystem user id */ -uid_t -crgetfsuid(const cred_t *cr) -{ - return (KUID_TO_SUID(cr->fsuid)); -} - -/* Return the effective group id */ -gid_t -crgetgid(const cred_t *cr) -{ - return (KGID_TO_SGID(cr->egid)); -} - -/* Return the real group id */ -gid_t -crgetrgid(const cred_t *cr) -{ - return (KGID_TO_SGID(cr->gid)); -} - -/* Return the saved group id */ -gid_t -crgetsgid(const cred_t *cr) -{ - return (KGID_TO_SGID(cr->sgid)); -} - -/* Return the filesystem group id */ -gid_t -crgetfsgid(const cred_t *cr) -{ - return (KGID_TO_SGID(cr->fsgid)); -} - -EXPORT_SYMBOL(crhold); -EXPORT_SYMBOL(crfree); -EXPORT_SYMBOL(crgetuid); -EXPORT_SYMBOL(crgetruid); -EXPORT_SYMBOL(crgetsuid); -EXPORT_SYMBOL(crgetfsuid); -EXPORT_SYMBOL(crgetgid); -EXPORT_SYMBOL(crgetrgid); -EXPORT_SYMBOL(crgetsgid); -EXPORT_SYMBOL(crgetfsgid); -EXPORT_SYMBOL(crgetngroups); -EXPORT_SYMBOL(crgetgroups); -EXPORT_SYMBOL(groupmember); diff --git a/module/spl/spl-err.c b/module/spl/spl-err.c deleted file mode 100644 index 3c0bb71c0..000000000 --- a/module/spl/spl-err.c +++ /dev/null @@ -1,124 +0,0 @@ -/* - * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. - * Copyright (C) 2007 The Regents of the University of California. - * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). - * Written by Brian Behlendorf <[email protected]>. - * UCRL-CODE-235197 - * - * This file is part of the SPL, Solaris Porting Layer. - * For details, see <http://zfsonlinux.org/>. - * - * The SPL is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the - * Free Software Foundation; either version 2 of the License, or (at your - * option) any later version. - * - * The SPL is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - * for more details. - * - * You should have received a copy of the GNU General Public License along - * with the SPL. If not, see <http://www.gnu.org/licenses/>. - * - * Solaris Porting Layer (SPL) Error Implementation. - */ - -#include <sys/sysmacros.h> -#include <sys/cmn_err.h> - -/* - * It is often useful to actually have the panic crash the node so you - * can then get notified of the event, get the crashdump for later - * analysis and other such goodies. - * But we would still default to the current default of not to do that. 
- */ -/* BEGIN CSTYLED */ -unsigned int spl_panic_halt; -module_param(spl_panic_halt, uint, 0644); -MODULE_PARM_DESC(spl_panic_halt, "Cause kernel panic on assertion failures"); -/* END CSTYLED */ - -void -spl_dumpstack(void) -{ - printk("Showing stack for process %d\n", current->pid); - dump_stack(); -} -EXPORT_SYMBOL(spl_dumpstack); - -int -spl_panic(const char *file, const char *func, int line, const char *fmt, ...) -{ - const char *newfile; - char msg[MAXMSGLEN]; - va_list ap; - - newfile = strrchr(file, '/'); - if (newfile != NULL) - newfile = newfile + 1; - else - newfile = file; - - va_start(ap, fmt); - (void) vsnprintf(msg, sizeof (msg), fmt, ap); - va_end(ap); - - printk(KERN_EMERG "%s", msg); - printk(KERN_EMERG "PANIC at %s:%d:%s()\n", newfile, line, func); - if (spl_panic_halt) - panic("%s", msg); - - spl_dumpstack(); - - /* Halt the thread to facilitate further debugging */ - set_current_state(TASK_UNINTERRUPTIBLE); - while (1) - schedule(); - - /* Unreachable */ - return (1); -} -EXPORT_SYMBOL(spl_panic); - -void -vcmn_err(int ce, const char *fmt, va_list ap) -{ - char msg[MAXMSGLEN]; - - vsnprintf(msg, MAXMSGLEN, fmt, ap); - - switch (ce) { - case CE_IGNORE: - break; - case CE_CONT: - printk("%s", msg); - break; - case CE_NOTE: - printk(KERN_NOTICE "NOTICE: %s\n", msg); - break; - case CE_WARN: - printk(KERN_WARNING "WARNING: %s\n", msg); - break; - case CE_PANIC: - printk(KERN_EMERG "PANIC: %s\n", msg); - spl_dumpstack(); - - /* Halt the thread to facilitate further debugging */ - set_current_state(TASK_UNINTERRUPTIBLE); - while (1) - schedule(); - } -} /* vcmn_err() */ -EXPORT_SYMBOL(vcmn_err); - -void -cmn_err(int ce, const char *fmt, ...) -{ - va_list ap; - - va_start(ap, fmt); - vcmn_err(ce, fmt, ap); - va_end(ap); -} /* cmn_err() */ -EXPORT_SYMBOL(cmn_err); diff --git a/module/spl/spl-generic.c b/module/spl/spl-generic.c deleted file mode 100644 index 1deb2f444..000000000 --- a/module/spl/spl-generic.c +++ /dev/null @@ -1,757 +0,0 @@ -/* - * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. - * Copyright (C) 2007 The Regents of the University of California. - * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). - * Written by Brian Behlendorf <[email protected]>. - * UCRL-CODE-235197 - * - * This file is part of the SPL, Solaris Porting Layer. - * For details, see <http://zfsonlinux.org/>. - * - * The SPL is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the - * Free Software Foundation; either version 2 of the License, or (at your - * option) any later version. - * - * The SPL is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - * for more details. - * - * You should have received a copy of the GNU General Public License along - * with the SPL. If not, see <http://www.gnu.org/licenses/>. - * - * Solaris Porting Layer (SPL) Generic Implementation. 
- */ - -#include <sys/sysmacros.h> -#include <sys/systeminfo.h> -#include <sys/vmsystm.h> -#include <sys/kobj.h> -#include <sys/kmem.h> -#include <sys/kmem_cache.h> -#include <sys/vmem.h> -#include <sys/mutex.h> -#include <sys/rwlock.h> -#include <sys/taskq.h> -#include <sys/tsd.h> -#include <sys/zmod.h> -#include <sys/debug.h> -#include <sys/proc.h> -#include <sys/kstat.h> -#include <sys/file.h> -#include <linux/ctype.h> -#include <sys/disp.h> -#include <sys/random.h> -#include <sys/strings.h> -#include <linux/kmod.h> -#include "zfs_gitrev.h" - -char spl_gitrev[64] = ZFS_META_GITREV; - -/* BEGIN CSTYLED */ -unsigned long spl_hostid = 0; -EXPORT_SYMBOL(spl_hostid); -/* BEGIN CSTYLED */ -module_param(spl_hostid, ulong, 0644); -MODULE_PARM_DESC(spl_hostid, "The system hostid."); -/* END CSTYLED */ - -proc_t p0; -EXPORT_SYMBOL(p0); - -/* - * Xorshift Pseudo Random Number Generator based on work by Sebastiano Vigna - * - * "Further scramblings of Marsaglia's xorshift generators" - * http://vigna.di.unimi.it/ftp/papers/xorshiftplus.pdf - * - * random_get_pseudo_bytes() is an API function on Illumos whose sole purpose - * is to provide bytes containing random numbers. It is mapped to /dev/urandom - * on Illumos, which uses a "FIPS 186-2 algorithm". No user of the SPL's - * random_get_pseudo_bytes() needs bytes that are of cryptographic quality, so - * we can implement it using a fast PRNG that we seed using Linux' actual - * equivalent to random_get_pseudo_bytes(). We do this by providing each CPU - * with an independent seed so that all calls to random_get_pseudo_bytes() are - * free of atomic instructions. - * - * A consequence of using a fast PRNG is that using random_get_pseudo_bytes() - * to generate words larger than 128 bits will paradoxically be limited to - * `2^128 - 1` possibilities. This is because we have a sequence of `2^128 - 1` - * 128-bit words and selecting the first will implicitly select the second. If - * a caller finds this behavior undesirable, random_get_bytes() should be used - * instead. - * - * XXX: Linux interrupt handlers that trigger within the critical section - * formed by `s[1] = xp[1];` and `xp[0] = s[0];` and call this function will - * see the same numbers. Nothing in the code currently calls this in an - * interrupt handler, so this is considered to be okay. If that becomes a - * problem, we could create a set of per-cpu variables for interrupt handlers - * and use them when in_interrupt() from linux/preempt_mask.h evaluates to - * true. 
- */ -static DEFINE_PER_CPU(uint64_t[2], spl_pseudo_entropy); - -/* - * spl_rand_next()/spl_rand_jump() are copied from the following CC-0 licensed - * file: - * - * http://xorshift.di.unimi.it/xorshift128plus.c - */ - -static inline uint64_t -spl_rand_next(uint64_t *s) -{ - uint64_t s1 = s[0]; - const uint64_t s0 = s[1]; - s[0] = s0; - s1 ^= s1 << 23; // a - s[1] = s1 ^ s0 ^ (s1 >> 18) ^ (s0 >> 5); // b, c - return (s[1] + s0); -} - -static inline void -spl_rand_jump(uint64_t *s) -{ - static const uint64_t JUMP[] = - { 0x8a5cd789635d2dff, 0x121fd2155c472f96 }; - - uint64_t s0 = 0; - uint64_t s1 = 0; - int i, b; - for (i = 0; i < sizeof (JUMP) / sizeof (*JUMP); i++) - for (b = 0; b < 64; b++) { - if (JUMP[i] & 1ULL << b) { - s0 ^= s[0]; - s1 ^= s[1]; - } - (void) spl_rand_next(s); - } - - s[0] = s0; - s[1] = s1; -} - -int -random_get_pseudo_bytes(uint8_t *ptr, size_t len) -{ - uint64_t *xp, s[2]; - - ASSERT(ptr); - - xp = get_cpu_var(spl_pseudo_entropy); - - s[0] = xp[0]; - s[1] = xp[1]; - - while (len) { - union { - uint64_t ui64; - uint8_t byte[sizeof (uint64_t)]; - }entropy; - int i = MIN(len, sizeof (uint64_t)); - - len -= i; - entropy.ui64 = spl_rand_next(s); - - while (i--) - *ptr++ = entropy.byte[i]; - } - - xp[0] = s[0]; - xp[1] = s[1]; - - put_cpu_var(spl_pseudo_entropy); - - return (0); -} - - -EXPORT_SYMBOL(random_get_pseudo_bytes); - -#if BITS_PER_LONG == 32 -/* - * Support 64/64 => 64 division on a 32-bit platform. While the kernel - * provides a div64_u64() function for this we do not use it because the - * implementation is flawed. There are cases which return incorrect - * results as late as linux-2.6.35. Until this is fixed upstream the - * spl must provide its own implementation. - * - * This implementation is a slightly modified version of the algorithm - * proposed by the book 'Hacker's Delight'. The original source can be - * found here and is available for use without restriction. - * - * http://www.hackersdelight.org/HDcode/newCode/divDouble.c - */ - -/* - * Calculate number of leading of zeros for a 64-bit value. - */ -static int -nlz64(uint64_t x) -{ - register int n = 0; - - if (x == 0) - return (64); - - if (x <= 0x00000000FFFFFFFFULL) { n = n + 32; x = x << 32; } - if (x <= 0x0000FFFFFFFFFFFFULL) { n = n + 16; x = x << 16; } - if (x <= 0x00FFFFFFFFFFFFFFULL) { n = n + 8; x = x << 8; } - if (x <= 0x0FFFFFFFFFFFFFFFULL) { n = n + 4; x = x << 4; } - if (x <= 0x3FFFFFFFFFFFFFFFULL) { n = n + 2; x = x << 2; } - if (x <= 0x7FFFFFFFFFFFFFFFULL) { n = n + 1; } - - return (n); -} - -/* - * Newer kernels have a div_u64() function but we define our own - * to simplify portability between kernel versions. - */ -static inline uint64_t -__div_u64(uint64_t u, uint32_t v) -{ - (void) do_div(u, v); - return (u); -} - -/* - * Implementation of 64-bit unsigned division for 32-bit machines. - * - * First the procedure takes care of the case in which the divisor is a - * 32-bit quantity. There are two subcases: (1) If the left half of the - * dividend is less than the divisor, one execution of do_div() is all that - * is required (overflow is not possible). (2) Otherwise it does two - * divisions, using the grade school method. - */ -uint64_t -__udivdi3(uint64_t u, uint64_t v) -{ - uint64_t u0, u1, v1, q0, q1, k; - int n; - - if (v >> 32 == 0) { // If v < 2**32: - if (u >> 32 < v) { // If u/v cannot overflow, - return (__div_u64(u, v)); // just do one division. - } else { // If u/v would overflow: - u1 = u >> 32; // Break u into two halves. 
- u0 = u & 0xFFFFFFFF; - q1 = __div_u64(u1, v); // First quotient digit. - k = u1 - q1 * v; // First remainder, < v. - u0 += (k << 32); - q0 = __div_u64(u0, v); // Seconds quotient digit. - return ((q1 << 32) + q0); - } - } else { // If v >= 2**32: - n = nlz64(v); // 0 <= n <= 31. - v1 = (v << n) >> 32; // Normalize divisor, MSB is 1. - u1 = u >> 1; // To ensure no overflow. - q1 = __div_u64(u1, v1); // Get quotient from - q0 = (q1 << n) >> 31; // Undo normalization and - // division of u by 2. - if (q0 != 0) // Make q0 correct or - q0 = q0 - 1; // too small by 1. - if ((u - q0 * v) >= v) - q0 = q0 + 1; // Now q0 is correct. - - return (q0); - } -} -EXPORT_SYMBOL(__udivdi3); - -/* BEGIN CSTYLED */ -#ifndef abs64 -#define abs64(x) ({ uint64_t t = (x) >> 63; ((x) ^ t) - t; }) -#endif -/* END CSTYLED */ - -/* - * Implementation of 64-bit signed division for 32-bit machines. - */ -int64_t -__divdi3(int64_t u, int64_t v) -{ - int64_t q, t; - q = __udivdi3(abs64(u), abs64(v)); - t = (u ^ v) >> 63; // If u, v have different - return ((q ^ t) - t); // signs, negate q. -} -EXPORT_SYMBOL(__divdi3); - -/* - * Implementation of 64-bit unsigned modulo for 32-bit machines. - */ -uint64_t -__umoddi3(uint64_t dividend, uint64_t divisor) -{ - return (dividend - (divisor * __udivdi3(dividend, divisor))); -} -EXPORT_SYMBOL(__umoddi3); - -/* - * Implementation of 64-bit unsigned division/modulo for 32-bit machines. - */ -uint64_t -__udivmoddi4(uint64_t n, uint64_t d, uint64_t *r) -{ - uint64_t q = __udivdi3(n, d); - if (r) - *r = n - d * q; - return (q); -} -EXPORT_SYMBOL(__udivmoddi4); - -/* - * Implementation of 64-bit signed division/modulo for 32-bit machines. - */ -int64_t -__divmoddi4(int64_t n, int64_t d, int64_t *r) -{ - int64_t q, rr; - boolean_t nn = B_FALSE; - boolean_t nd = B_FALSE; - if (n < 0) { - nn = B_TRUE; - n = -n; - } - if (d < 0) { - nd = B_TRUE; - d = -d; - } - - q = __udivmoddi4(n, d, (uint64_t *)&rr); - - if (nn != nd) - q = -q; - if (nn) - rr = -rr; - if (r) - *r = rr; - return (q); -} -EXPORT_SYMBOL(__divmoddi4); - -#if defined(__arm) || defined(__arm__) -/* - * Implementation of 64-bit (un)signed division for 32-bit arm machines. - * - * Run-time ABI for the ARM Architecture (page 20). A pair of (unsigned) - * long longs is returned in {{r0, r1}, {r2,r3}}, the quotient in {r0, r1}, - * and the remainder in {r2, r3}. The return type is specifically left - * set to 'void' to ensure the compiler does not overwrite these registers - * during the return. 
All results are in registers as per ABI - */ -void -__aeabi_uldivmod(uint64_t u, uint64_t v) -{ - uint64_t res; - uint64_t mod; - - res = __udivdi3(u, v); - mod = __umoddi3(u, v); - { - register uint32_t r0 asm("r0") = (res & 0xFFFFFFFF); - register uint32_t r1 asm("r1") = (res >> 32); - register uint32_t r2 asm("r2") = (mod & 0xFFFFFFFF); - register uint32_t r3 asm("r3") = (mod >> 32); - - /* BEGIN CSTYLED */ - asm volatile("" - : "+r"(r0), "+r"(r1), "+r"(r2),"+r"(r3) /* output */ - : "r"(r0), "r"(r1), "r"(r2), "r"(r3)); /* input */ - /* END CSTYLED */ - - return; /* r0; */ - } -} -EXPORT_SYMBOL(__aeabi_uldivmod); - -void -__aeabi_ldivmod(int64_t u, int64_t v) -{ - int64_t res; - uint64_t mod; - - res = __divdi3(u, v); - mod = __umoddi3(u, v); - { - register uint32_t r0 asm("r0") = (res & 0xFFFFFFFF); - register uint32_t r1 asm("r1") = (res >> 32); - register uint32_t r2 asm("r2") = (mod & 0xFFFFFFFF); - register uint32_t r3 asm("r3") = (mod >> 32); - - /* BEGIN CSTYLED */ - asm volatile("" - : "+r"(r0), "+r"(r1), "+r"(r2),"+r"(r3) /* output */ - : "r"(r0), "r"(r1), "r"(r2), "r"(r3)); /* input */ - /* END CSTYLED */ - - return; /* r0; */ - } -} -EXPORT_SYMBOL(__aeabi_ldivmod); -#endif /* __arm || __arm__ */ -#endif /* BITS_PER_LONG */ - -/* - * NOTE: The strtoxx behavior is solely based on my reading of the Solaris - * ddi_strtol(9F) man page. I have not verified the behavior of these - * functions against their Solaris counterparts. It is possible that I - * may have misinterpreted the man page or the man page is incorrect. - */ -int ddi_strtoul(const char *, char **, int, unsigned long *); -int ddi_strtol(const char *, char **, int, long *); -int ddi_strtoull(const char *, char **, int, unsigned long long *); -int ddi_strtoll(const char *, char **, int, long long *); - -#define define_ddi_strtoux(type, valtype) \ -int ddi_strtou##type(const char *str, char **endptr, \ - int base, valtype *result) \ -{ \ - valtype last_value, value = 0; \ - char *ptr = (char *)str; \ - int flag = 1, digit; \ - \ - if (strlen(ptr) == 0) \ - return (EINVAL); \ - \ - /* Auto-detect base based on prefix */ \ - if (!base) { \ - if (str[0] == '0') { \ - if (tolower(str[1]) == 'x' && isxdigit(str[2])) { \ - base = 16; /* hex */ \ - ptr += 2; \ - } else if (str[1] >= '0' && str[1] < 8) { \ - base = 8; /* octal */ \ - ptr += 1; \ - } else { \ - return (EINVAL); \ - } \ - } else { \ - base = 10; /* decimal */ \ - } \ - } \ - \ - while (1) { \ - if (isdigit(*ptr)) \ - digit = *ptr - '0'; \ - else if (isalpha(*ptr)) \ - digit = tolower(*ptr) - 'a' + 10; \ - else \ - break; \ - \ - if (digit >= base) \ - break; \ - \ - last_value = value; \ - value = value * base + digit; \ - if (last_value > value) /* Overflow */ \ - return (ERANGE); \ - \ - flag = 1; \ - ptr++; \ - } \ - \ - if (flag) \ - *result = value; \ - \ - if (endptr) \ - *endptr = (char *)(flag ? 
ptr : str); \ - \ - return (0); \ -} \ - -#define define_ddi_strtox(type, valtype) \ -int ddi_strto##type(const char *str, char **endptr, \ - int base, valtype *result) \ -{ \ - int rc; \ - \ - if (*str == '-') { \ - rc = ddi_strtou##type(str + 1, endptr, base, result); \ - if (!rc) { \ - if (*endptr == str + 1) \ - *endptr = (char *)str; \ - else \ - *result = -*result; \ - } \ - } else { \ - rc = ddi_strtou##type(str, endptr, base, result); \ - } \ - \ - return (rc); \ -} - -define_ddi_strtoux(l, unsigned long) -define_ddi_strtox(l, long) -define_ddi_strtoux(ll, unsigned long long) -define_ddi_strtox(ll, long long) - -EXPORT_SYMBOL(ddi_strtoul); -EXPORT_SYMBOL(ddi_strtol); -EXPORT_SYMBOL(ddi_strtoll); -EXPORT_SYMBOL(ddi_strtoull); - -int -ddi_copyin(const void *from, void *to, size_t len, int flags) -{ - /* Fake ioctl() issued by kernel, 'from' is a kernel address */ - if (flags & FKIOCTL) { - memcpy(to, from, len); - return (0); - } - - return (copyin(from, to, len)); -} -EXPORT_SYMBOL(ddi_copyin); - -int -ddi_copyout(const void *from, void *to, size_t len, int flags) -{ - /* Fake ioctl() issued by kernel, 'from' is a kernel address */ - if (flags & FKIOCTL) { - memcpy(to, from, len); - return (0); - } - - return (copyout(from, to, len)); -} -EXPORT_SYMBOL(ddi_copyout); - -/* - * Read the unique system identifier from the /etc/hostid file. - * - * The behavior of /usr/bin/hostid on Linux systems with the - * regular eglibc and coreutils is: - * - * 1. Generate the value if the /etc/hostid file does not exist - * or if the /etc/hostid file is less than four bytes in size. - * - * 2. If the /etc/hostid file is at least 4 bytes, then return - * the first four bytes [0..3] in native endian order. - * - * 3. Always ignore bytes [4..] if they exist in the file. - * - * Only the first four bytes are significant, even on systems that - * have a 64-bit word size. - * - * See: - * - * eglibc: sysdeps/unix/sysv/linux/gethostid.c - * coreutils: src/hostid.c - * - * Notes: - * - * The /etc/hostid file on Solaris is a text file that often reads: - * - * # DO NOT EDIT - * "0123456789" - * - * Directly copying this file to Linux results in a constant - * hostid of 4f442023 because the default comment constitutes - * the first four bytes of the file. - * - */ - -char *spl_hostid_path = HW_HOSTID_PATH; -module_param(spl_hostid_path, charp, 0444); -MODULE_PARM_DESC(spl_hostid_path, "The system hostid file (/etc/hostid)"); - -static int -hostid_read(uint32_t *hostid) -{ - uint64_t size; - struct _buf *file; - uint32_t value = 0; - int error; - - file = kobj_open_file(spl_hostid_path); - if (file == (struct _buf *)-1) - return (ENOENT); - - error = kobj_get_filesize(file, &size); - if (error) { - kobj_close_file(file); - return (error); - } - - if (size < sizeof (HW_HOSTID_MASK)) { - kobj_close_file(file); - return (EINVAL); - } - - /* - * Read directly into the variable like eglibc does. - * Short reads are okay; native behavior is preserved. - */ - error = kobj_read_file(file, (char *)&value, sizeof (value), 0); - if (error < 0) { - kobj_close_file(file); - return (EIO); - } - - /* Mask down to 32 bits like coreutils does. */ - *hostid = (value & HW_HOSTID_MASK); - kobj_close_file(file); - - return (0); -} - -/* - * Return the system hostid. Preferentially use the spl_hostid module option - * when set, otherwise use the value in the /etc/hostid file. 
- */ -uint32_t -zone_get_hostid(void *zone) -{ - uint32_t hostid; - - ASSERT3P(zone, ==, NULL); - - if (spl_hostid != 0) - return ((uint32_t)(spl_hostid & HW_HOSTID_MASK)); - - if (hostid_read(&hostid) == 0) - return (hostid); - - return (0); -} -EXPORT_SYMBOL(zone_get_hostid); - -static int -spl_kvmem_init(void) -{ - int rc = 0; - - rc = spl_kmem_init(); - if (rc) - return (rc); - - rc = spl_vmem_init(); - if (rc) { - spl_kmem_fini(); - return (rc); - } - - return (rc); -} - -/* - * We initialize the random number generator with 128 bits of entropy from the - * system random number generator. In the improbable case that we have a zero - * seed, we fallback to the system jiffies, unless it is also zero, in which - * situation we use a preprogrammed seed. We step forward by 2^64 iterations to - * initialize each of the per-cpu seeds so that the sequences generated on each - * CPU are guaranteed to never overlap in practice. - */ -static void __init -spl_random_init(void) -{ - uint64_t s[2]; - int i; - - get_random_bytes(s, sizeof (s)); - - if (s[0] == 0 && s[1] == 0) { - if (jiffies != 0) { - s[0] = jiffies; - s[1] = ~0 - jiffies; - } else { - (void) memcpy(s, "improbable seed", sizeof (s)); - } - printk("SPL: get_random_bytes() returned 0 " - "when generating random seed. Setting initial seed to " - "0x%016llx%016llx.\n", cpu_to_be64(s[0]), - cpu_to_be64(s[1])); - } - - for_each_possible_cpu(i) { - uint64_t *wordp = per_cpu(spl_pseudo_entropy, i); - - spl_rand_jump(s); - - wordp[0] = s[0]; - wordp[1] = s[1]; - } -} - -static void -spl_kvmem_fini(void) -{ - spl_vmem_fini(); - spl_kmem_fini(); -} - -static int __init -spl_init(void) -{ - int rc = 0; - - bzero(&p0, sizeof (proc_t)); - spl_random_init(); - - if ((rc = spl_kvmem_init())) - goto out1; - - if ((rc = spl_tsd_init())) - goto out2; - - if ((rc = spl_taskq_init())) - goto out3; - - if ((rc = spl_kmem_cache_init())) - goto out4; - - if ((rc = spl_vn_init())) - goto out5; - - if ((rc = spl_proc_init())) - goto out6; - - if ((rc = spl_kstat_init())) - goto out7; - - if ((rc = spl_zlib_init())) - goto out8; - - return (rc); - -out8: - spl_kstat_fini(); -out7: - spl_proc_fini(); -out6: - spl_vn_fini(); -out5: - spl_kmem_cache_fini(); -out4: - spl_taskq_fini(); -out3: - spl_tsd_fini(); -out2: - spl_kvmem_fini(); -out1: - return (rc); -} - -static void __exit -spl_fini(void) -{ - spl_zlib_fini(); - spl_kstat_fini(); - spl_proc_fini(); - spl_vn_fini(); - spl_kmem_cache_fini(); - spl_taskq_fini(); - spl_tsd_fini(); - spl_kvmem_fini(); -} - -module_init(spl_init); -module_exit(spl_fini); - -MODULE_DESCRIPTION("Solaris Porting Layer"); -MODULE_AUTHOR(ZFS_META_AUTHOR); -MODULE_LICENSE("GPL"); -MODULE_VERSION(ZFS_META_VERSION "-" ZFS_META_RELEASE); diff --git a/module/spl/spl-kmem-cache.c b/module/spl/spl-kmem-cache.c deleted file mode 100644 index b39867b03..000000000 --- a/module/spl/spl-kmem-cache.c +++ /dev/null @@ -1,1780 +0,0 @@ -/* - * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. - * Copyright (C) 2007 The Regents of the University of California. - * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). - * Written by Brian Behlendorf <[email protected]>. - * UCRL-CODE-235197 - * - * This file is part of the SPL, Solaris Porting Layer. - * For details, see <http://zfsonlinux.org/>. 
- * - * The SPL is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the - * Free Software Foundation; either version 2 of the License, or (at your - * option) any later version. - * - * The SPL is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - * for more details. - * - * You should have received a copy of the GNU General Public License along - * with the SPL. If not, see <http://www.gnu.org/licenses/>. - */ - -#include <sys/kmem.h> -#include <sys/kmem_cache.h> -#include <sys/shrinker.h> -#include <sys/taskq.h> -#include <sys/timer.h> -#include <sys/vmem.h> -#include <sys/wait.h> -#include <linux/slab.h> -#include <linux/swap.h> -#include <linux/prefetch.h> - -/* - * Within the scope of spl-kmem.c file the kmem_cache_* definitions - * are removed to allow access to the real Linux slab allocator. - */ -#undef kmem_cache_destroy -#undef kmem_cache_create -#undef kmem_cache_alloc -#undef kmem_cache_free - - -/* - * Linux 3.16 replaced smp_mb__{before,after}_{atomic,clear}_{dec,inc,bit}() - * with smp_mb__{before,after}_atomic() because they were redundant. This is - * only used inside our SLAB allocator, so we implement an internal wrapper - * here to give us smp_mb__{before,after}_atomic() on older kernels. - */ -#ifndef smp_mb__before_atomic -#define smp_mb__before_atomic(x) smp_mb__before_clear_bit(x) -#endif - -#ifndef smp_mb__after_atomic -#define smp_mb__after_atomic(x) smp_mb__after_clear_bit(x) -#endif - -/* - * Cache expiration was implemented because it was part of the default Solaris - * kmem_cache behavior. The idea is that per-cpu objects which haven't been - * accessed in several seconds should be returned to the cache. On the other - * hand Linux slabs never move objects back to the slabs unless there is - * memory pressure on the system. By default the Linux method is enabled - * because it has been shown to improve responsiveness on low memory systems. - * This policy may be changed by setting KMC_EXPIRE_AGE or KMC_EXPIRE_MEM. - */ -/* BEGIN CSTYLED */ -unsigned int spl_kmem_cache_expire = KMC_EXPIRE_MEM; -EXPORT_SYMBOL(spl_kmem_cache_expire); -module_param(spl_kmem_cache_expire, uint, 0644); -MODULE_PARM_DESC(spl_kmem_cache_expire, "By age (0x1) or low memory (0x2)"); - -/* - * Cache magazines are an optimization designed to minimize the cost of - * allocating memory. They do this by keeping a per-cpu cache of recently - * freed objects, which can then be reallocated without taking a lock. This - * can improve performance on highly contended caches. However, because - * objects in magazines will prevent otherwise empty slabs from being - * immediately released this may not be ideal for low memory machines. - * - * For this reason spl_kmem_cache_magazine_size can be used to set a maximum - * magazine size. When this value is set to 0 the magazine size will be - * automatically determined based on the object size. Otherwise magazines - * will be limited to 2-256 objects per magazine (i.e per cpu). Magazines - * may never be entirely disabled in this implementation. 
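As a concrete illustration of the tunable described above, a non-zero spl_kmem_cache_magazine_size is clamped to the supported 2-256 object range, while zero selects the automatic per-object-size default. A minimal sketch of that clamping; the real logic lives in spl_magazine_size() later in this file:

static unsigned int
clamp_magazine_size(unsigned int requested)
{
        if (requested == 0)
                return (0);     /* auto-size based on the object size */
        if (requested < 2)
                return (2);     /* never fewer than two objects */
        if (requested > 256)
                return (256);   /* never more than 256 objects */
        return (requested);
}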
- */ -unsigned int spl_kmem_cache_magazine_size = 0; -module_param(spl_kmem_cache_magazine_size, uint, 0444); -MODULE_PARM_DESC(spl_kmem_cache_magazine_size, - "Default magazine size (2-256), set automatically (0)"); - -/* - * The default behavior is to report the number of objects remaining in the - * cache. This allows the Linux VM to repeatedly reclaim objects from the - * cache when memory is low satisfy other memory allocations. Alternately, - * setting this value to KMC_RECLAIM_ONCE limits how aggressively the cache - * is reclaimed. This may increase the likelihood of out of memory events. - */ -unsigned int spl_kmem_cache_reclaim = 0 /* KMC_RECLAIM_ONCE */; -module_param(spl_kmem_cache_reclaim, uint, 0644); -MODULE_PARM_DESC(spl_kmem_cache_reclaim, "Single reclaim pass (0x1)"); - -unsigned int spl_kmem_cache_obj_per_slab = SPL_KMEM_CACHE_OBJ_PER_SLAB; -module_param(spl_kmem_cache_obj_per_slab, uint, 0644); -MODULE_PARM_DESC(spl_kmem_cache_obj_per_slab, "Number of objects per slab"); - -unsigned int spl_kmem_cache_obj_per_slab_min = SPL_KMEM_CACHE_OBJ_PER_SLAB_MIN; -module_param(spl_kmem_cache_obj_per_slab_min, uint, 0644); -MODULE_PARM_DESC(spl_kmem_cache_obj_per_slab_min, - "Minimal number of objects per slab"); - -unsigned int spl_kmem_cache_max_size = SPL_KMEM_CACHE_MAX_SIZE; -module_param(spl_kmem_cache_max_size, uint, 0644); -MODULE_PARM_DESC(spl_kmem_cache_max_size, "Maximum size of slab in MB"); - -/* - * For small objects the Linux slab allocator should be used to make the most - * efficient use of the memory. However, large objects are not supported by - * the Linux slab and therefore the SPL implementation is preferred. A cutoff - * of 16K was determined to be optimal for architectures using 4K pages. - */ -#if PAGE_SIZE == 4096 -unsigned int spl_kmem_cache_slab_limit = 16384; -#else -unsigned int spl_kmem_cache_slab_limit = 0; -#endif -module_param(spl_kmem_cache_slab_limit, uint, 0644); -MODULE_PARM_DESC(spl_kmem_cache_slab_limit, - "Objects less than N bytes use the Linux slab"); - -/* - * This value defaults to a threshold designed to avoid allocations which - * have been deemed costly by the kernel. - */ -unsigned int spl_kmem_cache_kmem_limit = - ((1 << (PAGE_ALLOC_COSTLY_ORDER - 1)) * PAGE_SIZE) / - SPL_KMEM_CACHE_OBJ_PER_SLAB; -module_param(spl_kmem_cache_kmem_limit, uint, 0644); -MODULE_PARM_DESC(spl_kmem_cache_kmem_limit, - "Objects less than N bytes use the kmalloc"); - -/* - * The number of threads available to allocate new slabs for caches. This - * should not need to be tuned but it is available for performance analysis. - */ -unsigned int spl_kmem_cache_kmem_threads = 4; -module_param(spl_kmem_cache_kmem_threads, uint, 0444); -MODULE_PARM_DESC(spl_kmem_cache_kmem_threads, - "Number of spl_kmem_cache threads"); -/* END CSTYLED */ - -/* - * Slab allocation interfaces - * - * While the Linux slab implementation was inspired by the Solaris - * implementation I cannot use it to emulate the Solaris APIs. I - * require two features which are not provided by the Linux slab. - * - * 1) Constructors AND destructors. Recent versions of the Linux - * kernel have removed support for destructors. This is a deal - * breaker for the SPL which contains particularly expensive - * initializers for mutex's, condition variables, etc. We also - * require a minimal level of cleanup for these data types unlike - * many Linux data types which do need to be explicitly destroyed. - * - * 2) Virtual address space backed slab. 
Callers of the Solaris slab - * expect it to work well for both small are very large allocations. - * Because of memory fragmentation the Linux slab which is backed - * by kmalloc'ed memory performs very badly when confronted with - * large numbers of large allocations. Basing the slab on the - * virtual address space removes the need for contiguous pages - * and greatly improve performance for large allocations. - * - * For these reasons, the SPL has its own slab implementation with - * the needed features. It is not as highly optimized as either the - * Solaris or Linux slabs, but it should get me most of what is - * needed until it can be optimized or obsoleted by another approach. - * - * One serious concern I do have about this method is the relatively - * small virtual address space on 32bit arches. This will seriously - * constrain the size of the slab caches and their performance. - */ - -struct list_head spl_kmem_cache_list; /* List of caches */ -struct rw_semaphore spl_kmem_cache_sem; /* Cache list lock */ -taskq_t *spl_kmem_cache_taskq; /* Task queue for aging / reclaim */ - -static void spl_cache_shrink(spl_kmem_cache_t *skc, void *obj); - -SPL_SHRINKER_CALLBACK_FWD_DECLARE(spl_kmem_cache_generic_shrinker); -SPL_SHRINKER_DECLARE(spl_kmem_cache_shrinker, - spl_kmem_cache_generic_shrinker, KMC_DEFAULT_SEEKS); - -static void * -kv_alloc(spl_kmem_cache_t *skc, int size, int flags) -{ - gfp_t lflags = kmem_flags_convert(flags); - void *ptr; - - if (skc->skc_flags & KMC_KMEM) { - ASSERT(ISP2(size)); - ptr = (void *)__get_free_pages(lflags, get_order(size)); - } else { - ptr = __vmalloc(size, lflags | __GFP_HIGHMEM, PAGE_KERNEL); - } - - /* Resulting allocated memory will be page aligned */ - ASSERT(IS_P2ALIGNED(ptr, PAGE_SIZE)); - - return (ptr); -} - -static void -kv_free(spl_kmem_cache_t *skc, void *ptr, int size) -{ - ASSERT(IS_P2ALIGNED(ptr, PAGE_SIZE)); - - /* - * The Linux direct reclaim path uses this out of band value to - * determine if forward progress is being made. Normally this is - * incremented by kmem_freepages() which is part of the various - * Linux slab implementations. However, since we are using none - * of that infrastructure we are responsible for incrementing it. - */ - if (current->reclaim_state) - current->reclaim_state->reclaimed_slab += size >> PAGE_SHIFT; - - if (skc->skc_flags & KMC_KMEM) { - ASSERT(ISP2(size)); - free_pages((unsigned long)ptr, get_order(size)); - } else { - vfree(ptr); - } -} - -/* - * Required space for each aligned sks. - */ -static inline uint32_t -spl_sks_size(spl_kmem_cache_t *skc) -{ - return (P2ROUNDUP_TYPED(sizeof (spl_kmem_slab_t), - skc->skc_obj_align, uint32_t)); -} - -/* - * Required space for each aligned object. - */ -static inline uint32_t -spl_obj_size(spl_kmem_cache_t *skc) -{ - uint32_t align = skc->skc_obj_align; - - return (P2ROUNDUP_TYPED(skc->skc_obj_size, align, uint32_t) + - P2ROUNDUP_TYPED(sizeof (spl_kmem_obj_t), align, uint32_t)); -} - -/* - * Lookup the spl_kmem_object_t for an object given that object. - */ -static inline spl_kmem_obj_t * -spl_sko_from_obj(spl_kmem_cache_t *skc, void *obj) -{ - return (obj + P2ROUNDUP_TYPED(skc->skc_obj_size, - skc->skc_obj_align, uint32_t)); -} - -/* - * Required space for each offslab object taking in to account alignment - * restrictions and the power-of-two requirement of kv_alloc(). 
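The sizing helpers above boil down to simple round-up arithmetic: each object is padded to the cache alignment and followed by an aligned spl_kmem_obj_t, and off-slab objects are then rounded further, to a power of two, to satisfy kv_alloc() (spl_offslab_size() just below is slightly more generous than this). A small user-space sketch with hypothetical sizes; P2ROUNDUP is reimplemented here only for illustration:

#include <stdio.h>
#include <stdint.h>

/* Round x up to the next multiple of align (align must be a power of two). */
#define P2ROUNDUP(x, align)     (((x) + (align) - 1) & ~((align) - 1))

int
main(void)
{
        uint32_t obj_size = 520;        /* hypothetical object size */
        uint32_t align = 8;             /* SPL_KMEM_CACHE_ALIGN style value */
        uint32_t sko_size = 40;         /* hypothetical sizeof (spl_kmem_obj_t) */
        uint32_t footprint;
        uint32_t offslab;

        /* On-slab footprint: aligned object plus its aligned tracking header. */
        footprint = P2ROUNDUP(obj_size, align) + P2ROUNDUP(sko_size, align);

        /* Off-slab objects grow to a power-of-two size for kv_alloc(). */
        for (offslab = 1; offslab <= footprint; offslab <<= 1)
                ;

        printf("footprint %u bytes, off-slab allocation %u bytes\n",
            (unsigned)footprint, (unsigned)offslab);
        return (0);
}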
- */ -static inline uint32_t -spl_offslab_size(spl_kmem_cache_t *skc) -{ - return (1UL << (fls64(spl_obj_size(skc)) + 1)); -} - -/* - * It's important that we pack the spl_kmem_obj_t structure and the - * actual objects in to one large address space to minimize the number - * of calls to the allocator. It is far better to do a few large - * allocations and then subdivide it ourselves. Now which allocator - * we use requires balancing a few trade offs. - * - * For small objects we use kmem_alloc() because as long as you are - * only requesting a small number of pages (ideally just one) its cheap. - * However, when you start requesting multiple pages with kmem_alloc() - * it gets increasingly expensive since it requires contiguous pages. - * For this reason we shift to vmem_alloc() for slabs of large objects - * which removes the need for contiguous pages. We do not use - * vmem_alloc() in all cases because there is significant locking - * overhead in __get_vm_area_node(). This function takes a single - * global lock when acquiring an available virtual address range which - * serializes all vmem_alloc()'s for all slab caches. Using slightly - * different allocation functions for small and large objects should - * give us the best of both worlds. - * - * KMC_ONSLAB KMC_OFFSLAB - * - * +------------------------+ +-----------------+ - * | spl_kmem_slab_t --+-+ | | spl_kmem_slab_t |---+-+ - * | skc_obj_size <-+ | | +-----------------+ | | - * | spl_kmem_obj_t | | | | - * | skc_obj_size <---+ | +-----------------+ | | - * | spl_kmem_obj_t | | | skc_obj_size | <-+ | - * | ... v | | spl_kmem_obj_t | | - * +------------------------+ +-----------------+ v - */ -static spl_kmem_slab_t * -spl_slab_alloc(spl_kmem_cache_t *skc, int flags) -{ - spl_kmem_slab_t *sks; - spl_kmem_obj_t *sko, *n; - void *base, *obj; - uint32_t obj_size, offslab_size = 0; - int i, rc = 0; - - base = kv_alloc(skc, skc->skc_slab_size, flags); - if (base == NULL) - return (NULL); - - sks = (spl_kmem_slab_t *)base; - sks->sks_magic = SKS_MAGIC; - sks->sks_objs = skc->skc_slab_objs; - sks->sks_age = jiffies; - sks->sks_cache = skc; - INIT_LIST_HEAD(&sks->sks_list); - INIT_LIST_HEAD(&sks->sks_free_list); - sks->sks_ref = 0; - obj_size = spl_obj_size(skc); - - if (skc->skc_flags & KMC_OFFSLAB) - offslab_size = spl_offslab_size(skc); - - for (i = 0; i < sks->sks_objs; i++) { - if (skc->skc_flags & KMC_OFFSLAB) { - obj = kv_alloc(skc, offslab_size, flags); - if (!obj) { - rc = -ENOMEM; - goto out; - } - } else { - obj = base + spl_sks_size(skc) + (i * obj_size); - } - - ASSERT(IS_P2ALIGNED(obj, skc->skc_obj_align)); - sko = spl_sko_from_obj(skc, obj); - sko->sko_addr = obj; - sko->sko_magic = SKO_MAGIC; - sko->sko_slab = sks; - INIT_LIST_HEAD(&sko->sko_list); - list_add_tail(&sko->sko_list, &sks->sks_free_list); - } - -out: - if (rc) { - if (skc->skc_flags & KMC_OFFSLAB) - list_for_each_entry_safe(sko, - n, &sks->sks_free_list, sko_list) { - kv_free(skc, sko->sko_addr, offslab_size); - } - - kv_free(skc, base, skc->skc_slab_size); - sks = NULL; - } - - return (sks); -} - -/* - * Remove a slab from complete or partial list, it must be called with - * the 'skc->skc_lock' held but the actual free must be performed - * outside the lock to prevent deadlocking on vmem addresses. 
- */ -static void -spl_slab_free(spl_kmem_slab_t *sks, - struct list_head *sks_list, struct list_head *sko_list) -{ - spl_kmem_cache_t *skc; - - ASSERT(sks->sks_magic == SKS_MAGIC); - ASSERT(sks->sks_ref == 0); - - skc = sks->sks_cache; - ASSERT(skc->skc_magic == SKC_MAGIC); - - /* - * Update slab/objects counters in the cache, then remove the - * slab from the skc->skc_partial_list. Finally add the slab - * and all its objects in to the private work lists where the - * destructors will be called and the memory freed to the system. - */ - skc->skc_obj_total -= sks->sks_objs; - skc->skc_slab_total--; - list_del(&sks->sks_list); - list_add(&sks->sks_list, sks_list); - list_splice_init(&sks->sks_free_list, sko_list); -} - -/* - * Reclaim empty slabs at the end of the partial list. - */ -static void -spl_slab_reclaim(spl_kmem_cache_t *skc) -{ - spl_kmem_slab_t *sks, *m; - spl_kmem_obj_t *sko, *n; - LIST_HEAD(sks_list); - LIST_HEAD(sko_list); - uint32_t size = 0; - - /* - * Empty slabs and objects must be moved to a private list so they - * can be safely freed outside the spin lock. All empty slabs are - * at the end of skc->skc_partial_list, therefore once a non-empty - * slab is found we can stop scanning. - */ - spin_lock(&skc->skc_lock); - list_for_each_entry_safe_reverse(sks, m, - &skc->skc_partial_list, sks_list) { - - if (sks->sks_ref > 0) - break; - - spl_slab_free(sks, &sks_list, &sko_list); - } - spin_unlock(&skc->skc_lock); - - /* - * The following two loops ensure all the object destructors are - * run, any offslab objects are freed, and the slabs themselves - * are freed. This is all done outside the skc->skc_lock since - * this allows the destructor to sleep, and allows us to perform - * a conditional reschedule when a freeing a large number of - * objects and slabs back to the system. - */ - if (skc->skc_flags & KMC_OFFSLAB) - size = spl_offslab_size(skc); - - list_for_each_entry_safe(sko, n, &sko_list, sko_list) { - ASSERT(sko->sko_magic == SKO_MAGIC); - - if (skc->skc_flags & KMC_OFFSLAB) - kv_free(skc, sko->sko_addr, size); - } - - list_for_each_entry_safe(sks, m, &sks_list, sks_list) { - ASSERT(sks->sks_magic == SKS_MAGIC); - kv_free(skc, sks, skc->skc_slab_size); - } -} - -static spl_kmem_emergency_t * -spl_emergency_search(struct rb_root *root, void *obj) -{ - struct rb_node *node = root->rb_node; - spl_kmem_emergency_t *ske; - unsigned long address = (unsigned long)obj; - - while (node) { - ske = container_of(node, spl_kmem_emergency_t, ske_node); - - if (address < ske->ske_obj) - node = node->rb_left; - else if (address > ske->ske_obj) - node = node->rb_right; - else - return (ske); - } - - return (NULL); -} - -static int -spl_emergency_insert(struct rb_root *root, spl_kmem_emergency_t *ske) -{ - struct rb_node **new = &(root->rb_node), *parent = NULL; - spl_kmem_emergency_t *ske_tmp; - unsigned long address = ske->ske_obj; - - while (*new) { - ske_tmp = container_of(*new, spl_kmem_emergency_t, ske_node); - - parent = *new; - if (address < ske_tmp->ske_obj) - new = &((*new)->rb_left); - else if (address > ske_tmp->ske_obj) - new = &((*new)->rb_right); - else - return (0); - } - - rb_link_node(&ske->ske_node, parent, new); - rb_insert_color(&ske->ske_node, root); - - return (1); -} - -/* - * Allocate a single emergency object and track it in a red black tree. 
- */ -static int -spl_emergency_alloc(spl_kmem_cache_t *skc, int flags, void **obj) -{ - gfp_t lflags = kmem_flags_convert(flags); - spl_kmem_emergency_t *ske; - int order = get_order(skc->skc_obj_size); - int empty; - - /* Last chance use a partial slab if one now exists */ - spin_lock(&skc->skc_lock); - empty = list_empty(&skc->skc_partial_list); - spin_unlock(&skc->skc_lock); - if (!empty) - return (-EEXIST); - - ske = kmalloc(sizeof (*ske), lflags); - if (ske == NULL) - return (-ENOMEM); - - ske->ske_obj = __get_free_pages(lflags, order); - if (ske->ske_obj == 0) { - kfree(ske); - return (-ENOMEM); - } - - spin_lock(&skc->skc_lock); - empty = spl_emergency_insert(&skc->skc_emergency_tree, ske); - if (likely(empty)) { - skc->skc_obj_total++; - skc->skc_obj_emergency++; - if (skc->skc_obj_emergency > skc->skc_obj_emergency_max) - skc->skc_obj_emergency_max = skc->skc_obj_emergency; - } - spin_unlock(&skc->skc_lock); - - if (unlikely(!empty)) { - free_pages(ske->ske_obj, order); - kfree(ske); - return (-EINVAL); - } - - *obj = (void *)ske->ske_obj; - - return (0); -} - -/* - * Locate the passed object in the red black tree and free it. - */ -static int -spl_emergency_free(spl_kmem_cache_t *skc, void *obj) -{ - spl_kmem_emergency_t *ske; - int order = get_order(skc->skc_obj_size); - - spin_lock(&skc->skc_lock); - ske = spl_emergency_search(&skc->skc_emergency_tree, obj); - if (ske) { - rb_erase(&ske->ske_node, &skc->skc_emergency_tree); - skc->skc_obj_emergency--; - skc->skc_obj_total--; - } - spin_unlock(&skc->skc_lock); - - if (ske == NULL) - return (-ENOENT); - - free_pages(ske->ske_obj, order); - kfree(ske); - - return (0); -} - -/* - * Release objects from the per-cpu magazine back to their slab. The flush - * argument contains the max number of entries to remove from the magazine. - */ -static void -__spl_cache_flush(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flush) -{ - int i, count = MIN(flush, skm->skm_avail); - - ASSERT(skc->skc_magic == SKC_MAGIC); - ASSERT(skm->skm_magic == SKM_MAGIC); - - for (i = 0; i < count; i++) - spl_cache_shrink(skc, skm->skm_objs[i]); - - skm->skm_avail -= count; - memmove(skm->skm_objs, &(skm->skm_objs[count]), - sizeof (void *) * skm->skm_avail); -} - -static void -spl_cache_flush(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flush) -{ - spin_lock(&skc->skc_lock); - __spl_cache_flush(skc, skm, flush); - spin_unlock(&skc->skc_lock); -} - -static void -spl_magazine_age(void *data) -{ - spl_kmem_cache_t *skc = (spl_kmem_cache_t *)data; - spl_kmem_magazine_t *skm = skc->skc_mag[smp_processor_id()]; - - ASSERT(skm->skm_magic == SKM_MAGIC); - ASSERT(skm->skm_cpu == smp_processor_id()); - ASSERT(irqs_disabled()); - - /* There are no available objects or they are too young to age out */ - if ((skm->skm_avail == 0) || - time_before(jiffies, skm->skm_age + skc->skc_delay * HZ)) - return; - - /* - * Because we're executing in interrupt context we may have - * interrupted the holder of this lock. To avoid a potential - * deadlock return if the lock is contended. - */ - if (!spin_trylock(&skc->skc_lock)) - return; - - __spl_cache_flush(skc, skm, skm->skm_refill); - spin_unlock(&skc->skc_lock); -} - -/* - * Called regularly to keep a downward pressure on the cache. - * - * Objects older than skc->skc_delay seconds in the per-cpu magazines will - * be returned to the caches. This is done to prevent idle magazines from - * holding memory which could be better used elsewhere. The delay is - * present to prevent thrashing the magazine. 
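The aging check in spl_magazine_age() above is a standard jiffies comparison: a magazine is only flushed once skc_delay seconds have passed since it was last touched. A standalone sketch of that check, assuming kernel context; time_before() handles jiffies wrap-around:

#include <linux/jiffies.h>

/*
 * Returns non-zero once 'delay' seconds have elapsed since 'last_used'
 * (a jiffies timestamp such as skm->skm_age), zero while still too young.
 */
static int
magazine_is_stale(unsigned long last_used, unsigned int delay)
{
        return (!time_before(jiffies, last_used + delay * HZ));
}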
- * - * The newly released objects may result in empty partial slabs. Those - * slabs should be released to the system. Otherwise moving the objects - * out of the magazines is just wasted work. - */ -static void -spl_cache_age(void *data) -{ - spl_kmem_cache_t *skc = (spl_kmem_cache_t *)data; - taskqid_t id = 0; - - ASSERT(skc->skc_magic == SKC_MAGIC); - - /* Dynamically disabled at run time */ - if (!(spl_kmem_cache_expire & KMC_EXPIRE_AGE)) - return; - - atomic_inc(&skc->skc_ref); - - if (!(skc->skc_flags & KMC_NOMAGAZINE)) - on_each_cpu(spl_magazine_age, skc, 1); - - spl_slab_reclaim(skc); - - while (!test_bit(KMC_BIT_DESTROY, &skc->skc_flags) && !id) { - id = taskq_dispatch_delay( - spl_kmem_cache_taskq, spl_cache_age, skc, TQ_SLEEP, - ddi_get_lbolt() + skc->skc_delay / 3 * HZ); - - /* Destroy issued after dispatch immediately cancel it */ - if (test_bit(KMC_BIT_DESTROY, &skc->skc_flags) && id) - taskq_cancel_id(spl_kmem_cache_taskq, id); - } - - spin_lock(&skc->skc_lock); - skc->skc_taskqid = id; - spin_unlock(&skc->skc_lock); - - atomic_dec(&skc->skc_ref); -} - -/* - * Size a slab based on the size of each aligned object plus spl_kmem_obj_t. - * When on-slab we want to target spl_kmem_cache_obj_per_slab. However, - * for very small objects we may end up with more than this so as not - * to waste space in the minimal allocation of a single page. Also for - * very large objects we may use as few as spl_kmem_cache_obj_per_slab_min, - * lower than this and we will fail. - */ -static int -spl_slab_size(spl_kmem_cache_t *skc, uint32_t *objs, uint32_t *size) -{ - uint32_t sks_size, obj_size, max_size, tgt_size, tgt_objs; - - if (skc->skc_flags & KMC_OFFSLAB) { - tgt_objs = spl_kmem_cache_obj_per_slab; - tgt_size = P2ROUNDUP(sizeof (spl_kmem_slab_t), PAGE_SIZE); - - if ((skc->skc_flags & KMC_KMEM) && - (spl_obj_size(skc) > (SPL_MAX_ORDER_NR_PAGES * PAGE_SIZE))) - return (-ENOSPC); - } else { - sks_size = spl_sks_size(skc); - obj_size = spl_obj_size(skc); - max_size = (spl_kmem_cache_max_size * 1024 * 1024); - tgt_size = (spl_kmem_cache_obj_per_slab * obj_size + sks_size); - - /* - * KMC_KMEM slabs are allocated by __get_free_pages() which - * rounds up to the nearest order. Knowing this the size - * should be rounded up to the next power of two with a hard - * maximum defined by the maximum allowed allocation order. - */ - if (skc->skc_flags & KMC_KMEM) { - max_size = SPL_MAX_ORDER_NR_PAGES * PAGE_SIZE; - tgt_size = MIN(max_size, - PAGE_SIZE * (1 << MAX(get_order(tgt_size) - 1, 1))); - } - - if (tgt_size <= max_size) { - tgt_objs = (tgt_size - sks_size) / obj_size; - } else { - tgt_objs = (max_size - sks_size) / obj_size; - tgt_size = (tgt_objs * obj_size) + sks_size; - } - } - - if (tgt_objs == 0) - return (-ENOSPC); - - *objs = tgt_objs; - *size = tgt_size; - - return (0); -} - -/* - * Make a guess at reasonable per-cpu magazine size based on the size of - * each object and the cost of caching N of them in each magazine. Long - * term this should really adapt based on an observed usage heuristic. 
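The slab sizing in spl_slab_size() above can be made concrete with a small worked example: the cache targets spl_kmem_cache_obj_per_slab objects, but a KMC_KMEM slab must be a power-of-two number of pages because it comes from __get_free_pages(). A rough user-space approximation with hypothetical sizes; the in-kernel version additionally enforces the minimum object count, the maximum slab size, and the maximum allocation order:

#include <stdio.h>

#define PAGE_SIZE       4096u

/* Smallest page order whose size covers 'size' bytes (get_order() style). */
static unsigned int
order_for(unsigned int size)
{
        unsigned int order = 0;

        while ((PAGE_SIZE << order) < size)
                order++;

        return (order);
}

int
main(void)
{
        unsigned int obj_size = 8192;   /* hypothetical aligned object size */
        unsigned int sks_size = 128;    /* hypothetical slab header size */
        unsigned int tgt = 8 * obj_size + sks_size;     /* 8 objects targeted */
        unsigned int order = order_for(tgt);
        unsigned int slab;

        /* KMC_KMEM slabs step the order down by one (but never below 1) so
         * __get_free_pages() is not asked for an oversized allocation. */
        order = (order > 1) ? order - 1 : 1;

        slab = PAGE_SIZE << order;
        printf("%u byte slab holds %u objects\n",
            slab, (slab - sks_size) / obj_size);
        return (0);
}

Here the 65536 byte slab ends up holding seven objects rather than the eight targeted, which is exactly the shortfall that spl_kmem_cache_obj_per_slab_min guards against for very large objects.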
- */ -static int -spl_magazine_size(spl_kmem_cache_t *skc) -{ - uint32_t obj_size = spl_obj_size(skc); - int size; - - if (spl_kmem_cache_magazine_size > 0) - return (MAX(MIN(spl_kmem_cache_magazine_size, 256), 2)); - - /* Per-magazine sizes below assume a 4Kib page size */ - if (obj_size > (PAGE_SIZE * 256)) - size = 4; /* Minimum 4Mib per-magazine */ - else if (obj_size > (PAGE_SIZE * 32)) - size = 16; /* Minimum 2Mib per-magazine */ - else if (obj_size > (PAGE_SIZE)) - size = 64; /* Minimum 256Kib per-magazine */ - else if (obj_size > (PAGE_SIZE / 4)) - size = 128; /* Minimum 128Kib per-magazine */ - else - size = 256; - - return (size); -} - -/* - * Allocate a per-cpu magazine to associate with a specific core. - */ -static spl_kmem_magazine_t * -spl_magazine_alloc(spl_kmem_cache_t *skc, int cpu) -{ - spl_kmem_magazine_t *skm; - int size = sizeof (spl_kmem_magazine_t) + - sizeof (void *) * skc->skc_mag_size; - - skm = kmalloc_node(size, GFP_KERNEL, cpu_to_node(cpu)); - if (skm) { - skm->skm_magic = SKM_MAGIC; - skm->skm_avail = 0; - skm->skm_size = skc->skc_mag_size; - skm->skm_refill = skc->skc_mag_refill; - skm->skm_cache = skc; - skm->skm_age = jiffies; - skm->skm_cpu = cpu; - } - - return (skm); -} - -/* - * Free a per-cpu magazine associated with a specific core. - */ -static void -spl_magazine_free(spl_kmem_magazine_t *skm) -{ - ASSERT(skm->skm_magic == SKM_MAGIC); - ASSERT(skm->skm_avail == 0); - kfree(skm); -} - -/* - * Create all pre-cpu magazines of reasonable sizes. - */ -static int -spl_magazine_create(spl_kmem_cache_t *skc) -{ - int i; - - if (skc->skc_flags & KMC_NOMAGAZINE) - return (0); - - skc->skc_mag = kzalloc(sizeof (spl_kmem_magazine_t *) * - num_possible_cpus(), kmem_flags_convert(KM_SLEEP)); - skc->skc_mag_size = spl_magazine_size(skc); - skc->skc_mag_refill = (skc->skc_mag_size + 1) / 2; - - for_each_possible_cpu(i) { - skc->skc_mag[i] = spl_magazine_alloc(skc, i); - if (!skc->skc_mag[i]) { - for (i--; i >= 0; i--) - spl_magazine_free(skc->skc_mag[i]); - - kfree(skc->skc_mag); - return (-ENOMEM); - } - } - - return (0); -} - -/* - * Destroy all pre-cpu magazines. 
- */ -static void -spl_magazine_destroy(spl_kmem_cache_t *skc) -{ - spl_kmem_magazine_t *skm; - int i; - - if (skc->skc_flags & KMC_NOMAGAZINE) - return; - - for_each_possible_cpu(i) { - skm = skc->skc_mag[i]; - spl_cache_flush(skc, skm, skm->skm_avail); - spl_magazine_free(skm); - } - - kfree(skc->skc_mag); -} - -/* - * Create a object cache based on the following arguments: - * name cache name - * size cache object size - * align cache object alignment - * ctor cache object constructor - * dtor cache object destructor - * reclaim cache object reclaim - * priv cache private data for ctor/dtor/reclaim - * vmp unused must be NULL - * flags - * KMC_KMEM Force SPL kmem backed cache - * KMC_VMEM Force SPL vmem backed cache - * KMC_SLAB Force Linux slab backed cache - * KMC_OFFSLAB Locate objects off the slab - * KMC_NOTOUCH unsupported - * KMC_NODEBUG unsupported - * KMC_NOHASH unsupported - * KMC_QCACHE unsupported - * KMC_NOMAGAZINE unsupported - */ -spl_kmem_cache_t * -spl_kmem_cache_create(char *name, size_t size, size_t align, - spl_kmem_ctor_t ctor, spl_kmem_dtor_t dtor, spl_kmem_reclaim_t reclaim, - void *priv, void *vmp, int flags) -{ - gfp_t lflags = kmem_flags_convert(KM_SLEEP); - spl_kmem_cache_t *skc; - int rc; - - /* - * Unsupported flags - */ - ASSERT0(flags & KMC_NOMAGAZINE); - ASSERT0(flags & KMC_NOHASH); - ASSERT0(flags & KMC_QCACHE); - ASSERT(vmp == NULL); - - might_sleep(); - - skc = kzalloc(sizeof (*skc), lflags); - if (skc == NULL) - return (NULL); - - skc->skc_magic = SKC_MAGIC; - skc->skc_name_size = strlen(name) + 1; - skc->skc_name = (char *)kmalloc(skc->skc_name_size, lflags); - if (skc->skc_name == NULL) { - kfree(skc); - return (NULL); - } - strncpy(skc->skc_name, name, skc->skc_name_size); - - skc->skc_ctor = ctor; - skc->skc_dtor = dtor; - skc->skc_reclaim = reclaim; - skc->skc_private = priv; - skc->skc_vmp = vmp; - skc->skc_linux_cache = NULL; - skc->skc_flags = flags; - skc->skc_obj_size = size; - skc->skc_obj_align = SPL_KMEM_CACHE_ALIGN; - skc->skc_delay = SPL_KMEM_CACHE_DELAY; - skc->skc_reap = SPL_KMEM_CACHE_REAP; - atomic_set(&skc->skc_ref, 0); - - INIT_LIST_HEAD(&skc->skc_list); - INIT_LIST_HEAD(&skc->skc_complete_list); - INIT_LIST_HEAD(&skc->skc_partial_list); - skc->skc_emergency_tree = RB_ROOT; - spin_lock_init(&skc->skc_lock); - init_waitqueue_head(&skc->skc_waitq); - skc->skc_slab_fail = 0; - skc->skc_slab_create = 0; - skc->skc_slab_destroy = 0; - skc->skc_slab_total = 0; - skc->skc_slab_alloc = 0; - skc->skc_slab_max = 0; - skc->skc_obj_total = 0; - skc->skc_obj_alloc = 0; - skc->skc_obj_max = 0; - skc->skc_obj_deadlock = 0; - skc->skc_obj_emergency = 0; - skc->skc_obj_emergency_max = 0; - - /* - * Verify the requested alignment restriction is sane. - */ - if (align) { - VERIFY(ISP2(align)); - VERIFY3U(align, >=, SPL_KMEM_CACHE_ALIGN); - VERIFY3U(align, <=, PAGE_SIZE); - skc->skc_obj_align = align; - } - - /* - * When no specific type of slab is requested (kmem, vmem, or - * linuxslab) then select a cache type based on the object size - * and default tunables. - */ - if (!(skc->skc_flags & (KMC_KMEM | KMC_VMEM | KMC_SLAB))) { - - if (spl_kmem_cache_slab_limit && - size <= (size_t)spl_kmem_cache_slab_limit) { - /* - * Objects smaller than spl_kmem_cache_slab_limit can - * use the Linux slab for better space-efficiency. - */ - skc->skc_flags |= KMC_SLAB; - } else if (spl_obj_size(skc) <= spl_kmem_cache_kmem_limit) { - /* - * Small objects, less than spl_kmem_cache_kmem_limit - * per object should use kmem because their slabs are - * small. 
- */ - skc->skc_flags |= KMC_KMEM; - } else { - /* - * All other objects are considered large and are - * placed on vmem backed slabs. - */ - skc->skc_flags |= KMC_VMEM; - } - } - - /* - * Given the type of slab allocate the required resources. - */ - if (skc->skc_flags & (KMC_KMEM | KMC_VMEM)) { - rc = spl_slab_size(skc, - &skc->skc_slab_objs, &skc->skc_slab_size); - if (rc) - goto out; - - rc = spl_magazine_create(skc); - if (rc) - goto out; - } else { - unsigned long slabflags = 0; - - if (size > (SPL_MAX_KMEM_ORDER_NR_PAGES * PAGE_SIZE)) { - rc = EINVAL; - goto out; - } - -#if defined(SLAB_USERCOPY) - /* - * Required for PAX-enabled kernels if the slab is to be - * used for copying between user and kernel space. - */ - slabflags |= SLAB_USERCOPY; -#endif - -#if defined(HAVE_KMEM_CACHE_CREATE_USERCOPY) - /* - * Newer grsec patchset uses kmem_cache_create_usercopy() - * instead of SLAB_USERCOPY flag - */ - skc->skc_linux_cache = kmem_cache_create_usercopy( - skc->skc_name, size, align, slabflags, 0, size, NULL); -#else - skc->skc_linux_cache = kmem_cache_create( - skc->skc_name, size, align, slabflags, NULL); -#endif - if (skc->skc_linux_cache == NULL) { - rc = ENOMEM; - goto out; - } - -#if defined(HAVE_KMEM_CACHE_ALLOCFLAGS) - skc->skc_linux_cache->allocflags |= __GFP_COMP; -#elif defined(HAVE_KMEM_CACHE_GFPFLAGS) - skc->skc_linux_cache->gfpflags |= __GFP_COMP; -#endif - skc->skc_flags |= KMC_NOMAGAZINE; - } - - if (spl_kmem_cache_expire & KMC_EXPIRE_AGE) { - skc->skc_taskqid = taskq_dispatch_delay(spl_kmem_cache_taskq, - spl_cache_age, skc, TQ_SLEEP, - ddi_get_lbolt() + skc->skc_delay / 3 * HZ); - } - - down_write(&spl_kmem_cache_sem); - list_add_tail(&skc->skc_list, &spl_kmem_cache_list); - up_write(&spl_kmem_cache_sem); - - return (skc); -out: - kfree(skc->skc_name); - kfree(skc); - return (NULL); -} -EXPORT_SYMBOL(spl_kmem_cache_create); - -/* - * Register a move callback for cache defragmentation. - * XXX: Unimplemented but harmless to stub out for now. - */ -void -spl_kmem_cache_set_move(spl_kmem_cache_t *skc, - kmem_cbrc_t (move)(void *, void *, size_t, void *)) -{ - ASSERT(move != NULL); -} -EXPORT_SYMBOL(spl_kmem_cache_set_move); - -/* - * Destroy a cache and all objects associated with the cache. - */ -void -spl_kmem_cache_destroy(spl_kmem_cache_t *skc) -{ - DECLARE_WAIT_QUEUE_HEAD(wq); - taskqid_t id; - - ASSERT(skc->skc_magic == SKC_MAGIC); - ASSERT(skc->skc_flags & (KMC_KMEM | KMC_VMEM | KMC_SLAB)); - - down_write(&spl_kmem_cache_sem); - list_del_init(&skc->skc_list); - up_write(&spl_kmem_cache_sem); - - /* Cancel any and wait for any pending delayed tasks */ - VERIFY(!test_and_set_bit(KMC_BIT_DESTROY, &skc->skc_flags)); - - spin_lock(&skc->skc_lock); - id = skc->skc_taskqid; - spin_unlock(&skc->skc_lock); - - taskq_cancel_id(spl_kmem_cache_taskq, id); - - /* - * Wait until all current callers complete, this is mainly - * to catch the case where a low memory situation triggers a - * cache reaping action which races with this destroy. - */ - wait_event(wq, atomic_read(&skc->skc_ref) == 0); - - if (skc->skc_flags & (KMC_KMEM | KMC_VMEM)) { - spl_magazine_destroy(skc); - spl_slab_reclaim(skc); - } else { - ASSERT(skc->skc_flags & KMC_SLAB); - kmem_cache_destroy(skc->skc_linux_cache); - } - - spin_lock(&skc->skc_lock); - - /* - * Validate there are no objects in use and free all the - * spl_kmem_slab_t, spl_kmem_obj_t, and object buffers. 
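Taken together, spl_kmem_cache_create(), spl_kmem_cache_alloc(), spl_kmem_cache_free() and spl_kmem_cache_destroy() give consumers the Solaris-style object cache API, normally reached through the kmem_cache_* macros that this file #undefs so it can call the native Linux slab. A hedged usage sketch; the object type, the constructor return convention, and the mutex usage are assumptions about a typical consumer, not part of this file:

#include <sys/kmem.h>
#include <sys/kmem_cache.h>
#include <sys/mutex.h>

typedef struct my_node {
        kmutex_t        mn_lock;        /* the "expensive" state ctors exist for */
        int             mn_value;
} my_node_t;

static int
my_node_ctor(void *buf, void *arg, int kmflags)
{
        my_node_t *mn = buf;

        mutex_init(&mn->mn_lock, NULL, MUTEX_DEFAULT, NULL);
        mn->mn_value = 0;
        return (0);
}

static void
my_node_dtor(void *buf, void *arg)
{
        my_node_t *mn = buf;

        mutex_destroy(&mn->mn_lock);
}

static void
my_node_cache_demo(void)
{
        spl_kmem_cache_t *cache;
        my_node_t *mn;

        /* No reclaim callback, no private data, vmp NULL, flags 0. */
        cache = spl_kmem_cache_create("my_node_cache", sizeof (my_node_t),
            0, my_node_ctor, my_node_dtor, NULL, NULL, NULL, 0);

        mn = spl_kmem_cache_alloc(cache, KM_SLEEP);     /* never NULL for KM_SLEEP */
        mn->mn_value = 42;
        spl_kmem_cache_free(cache, mn);

        spl_kmem_cache_destroy(cache);
}

With flags of 0 the backing store (Linux slab, kmem, or vmem) is selected automatically from the object size and the tunables described earlier in this file.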
- */ - ASSERT3U(skc->skc_slab_alloc, ==, 0); - ASSERT3U(skc->skc_obj_alloc, ==, 0); - ASSERT3U(skc->skc_slab_total, ==, 0); - ASSERT3U(skc->skc_obj_total, ==, 0); - ASSERT3U(skc->skc_obj_emergency, ==, 0); - ASSERT(list_empty(&skc->skc_complete_list)); - - spin_unlock(&skc->skc_lock); - - kfree(skc->skc_name); - kfree(skc); -} -EXPORT_SYMBOL(spl_kmem_cache_destroy); - -/* - * Allocate an object from a slab attached to the cache. This is used to - * repopulate the per-cpu magazine caches in batches when they run low. - */ -static void * -spl_cache_obj(spl_kmem_cache_t *skc, spl_kmem_slab_t *sks) -{ - spl_kmem_obj_t *sko; - - ASSERT(skc->skc_magic == SKC_MAGIC); - ASSERT(sks->sks_magic == SKS_MAGIC); - - sko = list_entry(sks->sks_free_list.next, spl_kmem_obj_t, sko_list); - ASSERT(sko->sko_magic == SKO_MAGIC); - ASSERT(sko->sko_addr != NULL); - - /* Remove from sks_free_list */ - list_del_init(&sko->sko_list); - - sks->sks_age = jiffies; - sks->sks_ref++; - skc->skc_obj_alloc++; - - /* Track max obj usage statistics */ - if (skc->skc_obj_alloc > skc->skc_obj_max) - skc->skc_obj_max = skc->skc_obj_alloc; - - /* Track max slab usage statistics */ - if (sks->sks_ref == 1) { - skc->skc_slab_alloc++; - - if (skc->skc_slab_alloc > skc->skc_slab_max) - skc->skc_slab_max = skc->skc_slab_alloc; - } - - return (sko->sko_addr); -} - -/* - * Generic slab allocation function to run by the global work queues. - * It is responsible for allocating a new slab, linking it in to the list - * of partial slabs, and then waking any waiters. - */ -static int -__spl_cache_grow(spl_kmem_cache_t *skc, int flags) -{ - spl_kmem_slab_t *sks; - - fstrans_cookie_t cookie = spl_fstrans_mark(); - sks = spl_slab_alloc(skc, flags); - spl_fstrans_unmark(cookie); - - spin_lock(&skc->skc_lock); - if (sks) { - skc->skc_slab_total++; - skc->skc_obj_total += sks->sks_objs; - list_add_tail(&sks->sks_list, &skc->skc_partial_list); - - smp_mb__before_atomic(); - clear_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags); - smp_mb__after_atomic(); - wake_up_all(&skc->skc_waitq); - } - spin_unlock(&skc->skc_lock); - - return (sks == NULL ? -ENOMEM : 0); -} - -static void -spl_cache_grow_work(void *data) -{ - spl_kmem_alloc_t *ska = (spl_kmem_alloc_t *)data; - spl_kmem_cache_t *skc = ska->ska_cache; - - (void) __spl_cache_grow(skc, ska->ska_flags); - - atomic_dec(&skc->skc_ref); - smp_mb__before_atomic(); - clear_bit(KMC_BIT_GROWING, &skc->skc_flags); - smp_mb__after_atomic(); - - kfree(ska); -} - -/* - * Returns non-zero when a new slab should be available. - */ -static int -spl_cache_grow_wait(spl_kmem_cache_t *skc) -{ - return (!test_bit(KMC_BIT_GROWING, &skc->skc_flags)); -} - -/* - * No available objects on any slabs, create a new slab. Note that this - * functionality is disabled for KMC_SLAB caches which are backed by the - * Linux slab. - */ -static int -spl_cache_grow(spl_kmem_cache_t *skc, int flags, void **obj) -{ - int remaining, rc = 0; - - ASSERT0(flags & ~KM_PUBLIC_MASK); - ASSERT(skc->skc_magic == SKC_MAGIC); - ASSERT((skc->skc_flags & KMC_SLAB) == 0); - might_sleep(); - *obj = NULL; - - /* - * Before allocating a new slab wait for any reaping to complete and - * then return so the local magazine can be rechecked for new objects. - */ - if (test_bit(KMC_BIT_REAPING, &skc->skc_flags)) { - rc = spl_wait_on_bit(&skc->skc_flags, KMC_BIT_REAPING, - TASK_UNINTERRUPTIBLE); - return (rc ? 
rc : -EAGAIN); - } - - /* - * To reduce the overhead of context switch and improve NUMA locality, - * it tries to allocate a new slab in the current process context with - * KM_NOSLEEP flag. If it fails, it will launch a new taskq to do the - * allocation. - * - * However, this can't be applied to KVM_VMEM due to a bug that - * __vmalloc() doesn't honor gfp flags in page table allocation. - */ - if (!(skc->skc_flags & KMC_VMEM)) { - rc = __spl_cache_grow(skc, flags | KM_NOSLEEP); - if (rc == 0) - return (0); - } - - /* - * This is handled by dispatching a work request to the global work - * queue. This allows us to asynchronously allocate a new slab while - * retaining the ability to safely fall back to a smaller synchronous - * allocations to ensure forward progress is always maintained. - */ - if (test_and_set_bit(KMC_BIT_GROWING, &skc->skc_flags) == 0) { - spl_kmem_alloc_t *ska; - - ska = kmalloc(sizeof (*ska), kmem_flags_convert(flags)); - if (ska == NULL) { - clear_bit_unlock(KMC_BIT_GROWING, &skc->skc_flags); - smp_mb__after_atomic(); - wake_up_all(&skc->skc_waitq); - return (-ENOMEM); - } - - atomic_inc(&skc->skc_ref); - ska->ska_cache = skc; - ska->ska_flags = flags; - taskq_init_ent(&ska->ska_tqe); - taskq_dispatch_ent(spl_kmem_cache_taskq, - spl_cache_grow_work, ska, 0, &ska->ska_tqe); - } - - /* - * The goal here is to only detect the rare case where a virtual slab - * allocation has deadlocked. We must be careful to minimize the use - * of emergency objects which are more expensive to track. Therefore, - * we set a very long timeout for the asynchronous allocation and if - * the timeout is reached the cache is flagged as deadlocked. From - * this point only new emergency objects will be allocated until the - * asynchronous allocation completes and clears the deadlocked flag. - */ - if (test_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags)) { - rc = spl_emergency_alloc(skc, flags, obj); - } else { - remaining = wait_event_timeout(skc->skc_waitq, - spl_cache_grow_wait(skc), HZ / 10); - - if (!remaining) { - spin_lock(&skc->skc_lock); - if (test_bit(KMC_BIT_GROWING, &skc->skc_flags)) { - set_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags); - skc->skc_obj_deadlock++; - } - spin_unlock(&skc->skc_lock); - } - - rc = -ENOMEM; - } - - return (rc); -} - -/* - * Refill a per-cpu magazine with objects from the slabs for this cache. - * Ideally the magazine can be repopulated using existing objects which have - * been released, however if we are unable to locate enough free objects new - * slabs of objects will be created. On success NULL is returned, otherwise - * the address of a single emergency object is returned for use by the caller. 
- */ -static void * -spl_cache_refill(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flags) -{ - spl_kmem_slab_t *sks; - int count = 0, rc, refill; - void *obj = NULL; - - ASSERT(skc->skc_magic == SKC_MAGIC); - ASSERT(skm->skm_magic == SKM_MAGIC); - - refill = MIN(skm->skm_refill, skm->skm_size - skm->skm_avail); - spin_lock(&skc->skc_lock); - - while (refill > 0) { - /* No slabs available we may need to grow the cache */ - if (list_empty(&skc->skc_partial_list)) { - spin_unlock(&skc->skc_lock); - - local_irq_enable(); - rc = spl_cache_grow(skc, flags, &obj); - local_irq_disable(); - - /* Emergency object for immediate use by caller */ - if (rc == 0 && obj != NULL) - return (obj); - - if (rc) - goto out; - - /* Rescheduled to different CPU skm is not local */ - if (skm != skc->skc_mag[smp_processor_id()]) - goto out; - - /* - * Potentially rescheduled to the same CPU but - * allocations may have occurred from this CPU while - * we were sleeping so recalculate max refill. - */ - refill = MIN(refill, skm->skm_size - skm->skm_avail); - - spin_lock(&skc->skc_lock); - continue; - } - - /* Grab the next available slab */ - sks = list_entry((&skc->skc_partial_list)->next, - spl_kmem_slab_t, sks_list); - ASSERT(sks->sks_magic == SKS_MAGIC); - ASSERT(sks->sks_ref < sks->sks_objs); - ASSERT(!list_empty(&sks->sks_free_list)); - - /* - * Consume as many objects as needed to refill the requested - * cache. We must also be careful not to overfill it. - */ - while (sks->sks_ref < sks->sks_objs && refill-- > 0 && - ++count) { - ASSERT(skm->skm_avail < skm->skm_size); - ASSERT(count < skm->skm_size); - skm->skm_objs[skm->skm_avail++] = - spl_cache_obj(skc, sks); - } - - /* Move slab to skc_complete_list when full */ - if (sks->sks_ref == sks->sks_objs) { - list_del(&sks->sks_list); - list_add(&sks->sks_list, &skc->skc_complete_list); - } - } - - spin_unlock(&skc->skc_lock); -out: - return (NULL); -} - -/* - * Release an object back to the slab from which it came. - */ -static void -spl_cache_shrink(spl_kmem_cache_t *skc, void *obj) -{ - spl_kmem_slab_t *sks = NULL; - spl_kmem_obj_t *sko = NULL; - - ASSERT(skc->skc_magic == SKC_MAGIC); - - sko = spl_sko_from_obj(skc, obj); - ASSERT(sko->sko_magic == SKO_MAGIC); - sks = sko->sko_slab; - ASSERT(sks->sks_magic == SKS_MAGIC); - ASSERT(sks->sks_cache == skc); - list_add(&sko->sko_list, &sks->sks_free_list); - - sks->sks_age = jiffies; - sks->sks_ref--; - skc->skc_obj_alloc--; - - /* - * Move slab to skc_partial_list when no longer full. Slabs - * are added to the head to keep the partial list is quasi-full - * sorted order. Fuller at the head, emptier at the tail. - */ - if (sks->sks_ref == (sks->sks_objs - 1)) { - list_del(&sks->sks_list); - list_add(&sks->sks_list, &skc->skc_partial_list); - } - - /* - * Move empty slabs to the end of the partial list so - * they can be easily found and freed during reclamation. - */ - if (sks->sks_ref == 0) { - list_del(&sks->sks_list); - list_add_tail(&sks->sks_list, &skc->skc_partial_list); - skc->skc_slab_alloc--; - } -} - -/* - * Allocate an object from the per-cpu magazine, or if the magazine - * is empty directly allocate from a slab and repopulate the magazine. - */ -void * -spl_kmem_cache_alloc(spl_kmem_cache_t *skc, int flags) -{ - spl_kmem_magazine_t *skm; - void *obj = NULL; - - ASSERT0(flags & ~KM_PUBLIC_MASK); - ASSERT(skc->skc_magic == SKC_MAGIC); - ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags)); - - /* - * Allocate directly from a Linux slab. 
All optimizations are left - * to the underlying cache we only need to guarantee that KM_SLEEP - * callers will never fail. - */ - if (skc->skc_flags & KMC_SLAB) { - struct kmem_cache *slc = skc->skc_linux_cache; - do { - obj = kmem_cache_alloc(slc, kmem_flags_convert(flags)); - } while ((obj == NULL) && !(flags & KM_NOSLEEP)); - - goto ret; - } - - local_irq_disable(); - -restart: - /* - * Safe to update per-cpu structure without lock, but - * in the restart case we must be careful to reacquire - * the local magazine since this may have changed - * when we need to grow the cache. - */ - skm = skc->skc_mag[smp_processor_id()]; - ASSERT(skm->skm_magic == SKM_MAGIC); - - if (likely(skm->skm_avail)) { - /* Object available in CPU cache, use it */ - obj = skm->skm_objs[--skm->skm_avail]; - skm->skm_age = jiffies; - } else { - obj = spl_cache_refill(skc, skm, flags); - if ((obj == NULL) && !(flags & KM_NOSLEEP)) - goto restart; - - local_irq_enable(); - goto ret; - } - - local_irq_enable(); - ASSERT(obj); - ASSERT(IS_P2ALIGNED(obj, skc->skc_obj_align)); - -ret: - /* Pre-emptively migrate object to CPU L1 cache */ - if (obj) { - if (obj && skc->skc_ctor) - skc->skc_ctor(obj, skc->skc_private, flags); - else - prefetchw(obj); - } - - return (obj); -} -EXPORT_SYMBOL(spl_kmem_cache_alloc); - -/* - * Free an object back to the local per-cpu magazine, there is no - * guarantee that this is the same magazine the object was originally - * allocated from. We may need to flush entire from the magazine - * back to the slabs to make space. - */ -void -spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj) -{ - spl_kmem_magazine_t *skm; - unsigned long flags; - int do_reclaim = 0; - int do_emergency = 0; - - ASSERT(skc->skc_magic == SKC_MAGIC); - ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags)); - - /* - * Run the destructor - */ - if (skc->skc_dtor) - skc->skc_dtor(obj, skc->skc_private); - - /* - * Free the object from the Linux underlying Linux slab. - */ - if (skc->skc_flags & KMC_SLAB) { - kmem_cache_free(skc->skc_linux_cache, obj); - return; - } - - /* - * While a cache has outstanding emergency objects all freed objects - * must be checked. However, since emergency objects will never use - * a virtual address these objects can be safely excluded as an - * optimization. - */ - if (!is_vmalloc_addr(obj)) { - spin_lock(&skc->skc_lock); - do_emergency = (skc->skc_obj_emergency > 0); - spin_unlock(&skc->skc_lock); - - if (do_emergency && (spl_emergency_free(skc, obj) == 0)) - return; - } - - local_irq_save(flags); - - /* - * Safe to update per-cpu structure without lock, but - * no remote memory allocation tracking is being performed - * it is entirely possible to allocate an object from one - * CPU cache and return it to another. - */ - skm = skc->skc_mag[smp_processor_id()]; - ASSERT(skm->skm_magic == SKM_MAGIC); - - /* - * Per-CPU cache full, flush it to make space for this object, - * this may result in an empty slab which can be reclaimed once - * interrupts are re-enabled. - */ - if (unlikely(skm->skm_avail >= skm->skm_size)) { - spl_cache_flush(skc, skm, skm->skm_refill); - do_reclaim = 1; - } - - /* Available space in cache, use it */ - skm->skm_objs[skm->skm_avail++] = obj; - - local_irq_restore(flags); - - if (do_reclaim) - spl_slab_reclaim(skc); -} -EXPORT_SYMBOL(spl_kmem_cache_free); - -/* - * The generic shrinker function for all caches. Under Linux a shrinker - * may not be tightly coupled with a slab cache. 
In fact Linux always - * systematically tries calling all registered shrinker callbacks which - * report that they contain unused objects. Because of this we only - * register one shrinker function in the shim layer for all slab caches. - * We always attempt to shrink all caches when this generic shrinker - * is called. - * - * If sc->nr_to_scan is zero, the caller is requesting a query of the - * number of objects which can potentially be freed. If it is nonzero, - * the request is to free that many objects. - * - * Linux kernels >= 3.12 have the count_objects and scan_objects callbacks - * in struct shrinker and also require the shrinker to return the number - * of objects freed. - * - * Older kernels require the shrinker to return the number of freeable - * objects following the freeing of nr_to_free. - * - * Linux semantics differ from those under Solaris, which are to - * free all available objects which may (and probably will) be more - * objects than the requested nr_to_scan. - */ -static spl_shrinker_t -__spl_kmem_cache_generic_shrinker(struct shrinker *shrink, - struct shrink_control *sc) -{ - spl_kmem_cache_t *skc; - int alloc = 0; - - /* - * No shrinking in a transaction context. Can cause deadlocks. - */ - if (sc->nr_to_scan && spl_fstrans_check()) - return (SHRINK_STOP); - - down_read(&spl_kmem_cache_sem); - list_for_each_entry(skc, &spl_kmem_cache_list, skc_list) { - if (sc->nr_to_scan) { -#ifdef HAVE_SPLIT_SHRINKER_CALLBACK - uint64_t oldalloc = skc->skc_obj_alloc; - spl_kmem_cache_reap_now(skc, - MAX(sc->nr_to_scan>>fls64(skc->skc_slab_objs), 1)); - if (oldalloc > skc->skc_obj_alloc) - alloc += oldalloc - skc->skc_obj_alloc; -#else - spl_kmem_cache_reap_now(skc, - MAX(sc->nr_to_scan>>fls64(skc->skc_slab_objs), 1)); - alloc += skc->skc_obj_alloc; -#endif /* HAVE_SPLIT_SHRINKER_CALLBACK */ - } else { - /* Request to query number of freeable objects */ - alloc += skc->skc_obj_alloc; - } - } - up_read(&spl_kmem_cache_sem); - - /* - * When KMC_RECLAIM_ONCE is set allow only a single reclaim pass. - * This functionality only exists to work around a rare issue where - * shrink_slabs() is repeatedly invoked by many cores causing the - * system to thrash. - */ - if ((spl_kmem_cache_reclaim & KMC_RECLAIM_ONCE) && sc->nr_to_scan) - return (SHRINK_STOP); - - return (MAX(alloc, 0)); -} - -SPL_SHRINKER_CALLBACK_WRAPPER(spl_kmem_cache_generic_shrinker); - -/* - * Call the registered reclaim function for a cache. Depending on how - * many and which objects are released it may simply repopulate the - * local magazine which will then need to age-out. Objects which cannot - * fit in the magazine we will be released back to their slabs which will - * also need to age out before being release. This is all just best - * effort and we do not want to thrash creating and destroying slabs. - */ -void -spl_kmem_cache_reap_now(spl_kmem_cache_t *skc, int count) -{ - ASSERT(skc->skc_magic == SKC_MAGIC); - ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags)); - - atomic_inc(&skc->skc_ref); - - /* - * Execute the registered reclaim callback if it exists. - */ - if (skc->skc_flags & KMC_SLAB) { - if (skc->skc_reclaim) - skc->skc_reclaim(skc->skc_private); - goto out; - } - - /* - * Prevent concurrent cache reaping when contended. - */ - if (test_and_set_bit(KMC_BIT_REAPING, &skc->skc_flags)) - goto out; - - /* - * When a reclaim function is available it may be invoked repeatedly - * until at least a single slab can be freed. This ensures that we - * do free memory back to the system. 
This helps minimize the chance - * of an OOM event when the bulk of memory is used by the slab. - * - * When free slabs are already available the reclaim callback will be - * skipped. Additionally, if no forward progress is detected despite - * a reclaim function the cache will be skipped to avoid deadlock. - * - * Longer term this would be the correct place to add the code which - * repacks the slabs in order minimize fragmentation. - */ - if (skc->skc_reclaim) { - uint64_t objects = UINT64_MAX; - int do_reclaim; - - do { - spin_lock(&skc->skc_lock); - do_reclaim = - (skc->skc_slab_total > 0) && - ((skc->skc_slab_total-skc->skc_slab_alloc) == 0) && - (skc->skc_obj_alloc < objects); - - objects = skc->skc_obj_alloc; - spin_unlock(&skc->skc_lock); - - if (do_reclaim) - skc->skc_reclaim(skc->skc_private); - - } while (do_reclaim); - } - - /* Reclaim from the magazine and free all now empty slabs. */ - if (spl_kmem_cache_expire & KMC_EXPIRE_MEM) { - spl_kmem_magazine_t *skm; - unsigned long irq_flags; - - local_irq_save(irq_flags); - skm = skc->skc_mag[smp_processor_id()]; - spl_cache_flush(skc, skm, skm->skm_avail); - local_irq_restore(irq_flags); - } - - spl_slab_reclaim(skc); - clear_bit_unlock(KMC_BIT_REAPING, &skc->skc_flags); - smp_mb__after_atomic(); - wake_up_bit(&skc->skc_flags, KMC_BIT_REAPING); -out: - atomic_dec(&skc->skc_ref); -} -EXPORT_SYMBOL(spl_kmem_cache_reap_now); - -/* - * This is stubbed out for code consistency with other platforms. There - * is existing logic to prevent concurrent reaping so while this is ugly - * it should do no harm. - */ -int -spl_kmem_cache_reap_active() -{ - return (0); -} -EXPORT_SYMBOL(spl_kmem_cache_reap_active); - -/* - * Reap all free slabs from all registered caches. - */ -void -spl_kmem_reap(void) -{ - struct shrink_control sc; - - sc.nr_to_scan = KMC_REAP_CHUNK; - sc.gfp_mask = GFP_KERNEL; - - (void) __spl_kmem_cache_generic_shrinker(NULL, &sc); -} -EXPORT_SYMBOL(spl_kmem_reap); - -int -spl_kmem_cache_init(void) -{ - init_rwsem(&spl_kmem_cache_sem); - INIT_LIST_HEAD(&spl_kmem_cache_list); - spl_kmem_cache_taskq = taskq_create("spl_kmem_cache", - spl_kmem_cache_kmem_threads, maxclsyspri, - spl_kmem_cache_kmem_threads * 8, INT_MAX, - TASKQ_PREPOPULATE | TASKQ_DYNAMIC); - spl_register_shrinker(&spl_kmem_cache_shrinker); - - return (0); -} - -void -spl_kmem_cache_fini(void) -{ - spl_unregister_shrinker(&spl_kmem_cache_shrinker); - taskq_destroy(spl_kmem_cache_taskq); -} diff --git a/module/spl/spl-kmem.c b/module/spl/spl-kmem.c deleted file mode 100644 index 824b5e89f..000000000 --- a/module/spl/spl-kmem.c +++ /dev/null @@ -1,556 +0,0 @@ -/* - * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. - * Copyright (C) 2007 The Regents of the University of California. - * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). - * Written by Brian Behlendorf <[email protected]>. - * UCRL-CODE-235197 - * - * This file is part of the SPL, Solaris Porting Layer. - * For details, see <http://zfsonlinux.org/>. - * - * The SPL is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the - * Free Software Foundation; either version 2 of the License, or (at your - * option) any later version. - * - * The SPL is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - * for more details. 
- * - * You should have received a copy of the GNU General Public License along - * with the SPL. If not, see <http://www.gnu.org/licenses/>. - */ - -#include <sys/debug.h> -#include <sys/sysmacros.h> -#include <sys/kmem.h> -#include <sys/vmem.h> -#include <linux/mm.h> - -/* - * As a general rule kmem_alloc() allocations should be small, preferably - * just a few pages since they must by physically contiguous. Therefore, a - * rate limited warning will be printed to the console for any kmem_alloc() - * which exceeds a reasonable threshold. - * - * The default warning threshold is set to sixteen pages but capped at 64K to - * accommodate systems using large pages. This value was selected to be small - * enough to ensure the largest allocations are quickly noticed and fixed. - * But large enough to avoid logging any warnings when a allocation size is - * larger than optimal but not a serious concern. Since this value is tunable, - * developers are encouraged to set it lower when testing so any new largish - * allocations are quickly caught. These warnings may be disabled by setting - * the threshold to zero. - */ -/* BEGIN CSTYLED */ -unsigned int spl_kmem_alloc_warn = MIN(16 * PAGE_SIZE, 64 * 1024); -module_param(spl_kmem_alloc_warn, uint, 0644); -MODULE_PARM_DESC(spl_kmem_alloc_warn, - "Warning threshold in bytes for a kmem_alloc()"); -EXPORT_SYMBOL(spl_kmem_alloc_warn); - -/* - * Large kmem_alloc() allocations will fail if they exceed KMALLOC_MAX_SIZE. - * Allocations which are marginally smaller than this limit may succeed but - * should still be avoided due to the expense of locating a contiguous range - * of free pages. Therefore, a maximum kmem size with reasonable safely - * margin of 4x is set. Kmem_alloc() allocations larger than this maximum - * will quickly fail. Vmem_alloc() allocations less than or equal to this - * value will use kmalloc(), but shift to vmalloc() when exceeding this value. - */ -unsigned int spl_kmem_alloc_max = (KMALLOC_MAX_SIZE >> 2); -module_param(spl_kmem_alloc_max, uint, 0644); -MODULE_PARM_DESC(spl_kmem_alloc_max, - "Maximum size in bytes for a kmem_alloc()"); -EXPORT_SYMBOL(spl_kmem_alloc_max); -/* END CSTYLED */ - -int -kmem_debugging(void) -{ - return (0); -} -EXPORT_SYMBOL(kmem_debugging); - -char * -kmem_vasprintf(const char *fmt, va_list ap) -{ - va_list aq; - char *ptr; - - do { - va_copy(aq, ap); - ptr = kvasprintf(kmem_flags_convert(KM_SLEEP), fmt, aq); - va_end(aq); - } while (ptr == NULL); - - return (ptr); -} -EXPORT_SYMBOL(kmem_vasprintf); - -char * -kmem_asprintf(const char *fmt, ...) -{ - va_list ap; - char *ptr; - - do { - va_start(ap, fmt); - ptr = kvasprintf(kmem_flags_convert(KM_SLEEP), fmt, ap); - va_end(ap); - } while (ptr == NULL); - - return (ptr); -} -EXPORT_SYMBOL(kmem_asprintf); - -static char * -__strdup(const char *str, int flags) -{ - char *ptr; - int n; - - n = strlen(str); - ptr = kmalloc(n + 1, kmem_flags_convert(flags)); - if (ptr) - memcpy(ptr, str, n + 1); - - return (ptr); -} - -char * -strdup(const char *str) -{ - return (__strdup(str, KM_SLEEP)); -} -EXPORT_SYMBOL(strdup); - -void -strfree(char *str) -{ - kfree(str); -} -EXPORT_SYMBOL(strfree); - -/* - * General purpose unified implementation of kmem_alloc(). It is an - * amalgamation of Linux and Illumos allocator design. It should never be - * exported to ensure that code using kmem_alloc()/kmem_zalloc() remains - * relatively portable. Consumers may only access this function through - * wrappers that enforce the common flags to ensure portability. 
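The allocator described above is consumed through thin kmem_alloc()/kmem_zalloc()/kmem_free() wrappers, with vmem_* counterparts for large buffers. A short caller-side sketch of the semantics spelled out in the surrounding comments, using hypothetical sizes: KM_SLEEP requests retry until they succeed, KM_NOSLEEP requests may return NULL, and anything approaching spl_kmem_alloc_warn belongs with vmem_alloc() instead:

#include <sys/kmem.h>
#include <sys/vmem.h>

static void
allocation_demo(void)
{
        size_t small = 512;             /* hypothetical sizes */
        size_t large = 1024 * 1024;
        void *a, *b, *c;

        /* KM_SLEEP may block but is guaranteed to succeed eventually. */
        a = kmem_zalloc(small, KM_SLEEP);

        /* KM_NOSLEEP never blocks and therefore may fail; always check. */
        b = kmem_alloc(small, KM_NOSLEEP);

        /* Large, non-contiguous buffers should use the vmem_* interfaces. */
        c = vmem_zalloc(large, KM_SLEEP);

        if (b != NULL)
                kmem_free(b, small);
        kmem_free(a, small);
        vmem_free(c, large);
}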
- */ -inline void * -spl_kmem_alloc_impl(size_t size, int flags, int node) -{ - gfp_t lflags = kmem_flags_convert(flags); - int use_vmem = 0; - void *ptr; - - /* - * Log abnormally large allocations and rate limit the console output. - * Allocations larger than spl_kmem_alloc_warn should be performed - * through the vmem_alloc()/vmem_zalloc() interfaces. - */ - if ((spl_kmem_alloc_warn > 0) && (size > spl_kmem_alloc_warn) && - !(flags & KM_VMEM)) { - printk(KERN_WARNING - "Large kmem_alloc(%lu, 0x%x), please file an issue at:\n" - "https://github.com/zfsonlinux/zfs/issues/new\n", - (unsigned long)size, flags); - dump_stack(); - } - - /* - * Use a loop because kmalloc_node() can fail when GFP_KERNEL is used - * unlike kmem_alloc() with KM_SLEEP on Illumos. - */ - do { - /* - * Calling kmalloc_node() when the size >= spl_kmem_alloc_max - * is unsafe. This must fail for all for kmem_alloc() and - * kmem_zalloc() callers. - * - * For vmem_alloc() and vmem_zalloc() callers it is permissible - * to use __vmalloc(). However, in general use of __vmalloc() - * is strongly discouraged because a global lock must be - * acquired. Contention on this lock can significantly - * impact performance so frequently manipulating the virtual - * address space is strongly discouraged. - */ - if ((size > spl_kmem_alloc_max) || use_vmem) { - if (flags & KM_VMEM) { - ptr = __vmalloc(size, lflags | __GFP_HIGHMEM, - PAGE_KERNEL); - } else { - return (NULL); - } - } else { - ptr = kmalloc_node(size, lflags, node); - } - - if (likely(ptr) || (flags & KM_NOSLEEP)) - return (ptr); - - /* - * For vmem_alloc() and vmem_zalloc() callers retry immediately - * using __vmalloc() which is unlikely to fail. - */ - if ((flags & KM_VMEM) && (use_vmem == 0)) { - use_vmem = 1; - continue; - } - - /* - * Use cond_resched() instead of congestion_wait() to avoid - * deadlocking systems where there are no block devices. - */ - cond_resched(); - } while (1); - - return (NULL); -} - -inline void -spl_kmem_free_impl(const void *buf, size_t size) -{ - if (is_vmalloc_addr(buf)) - vfree(buf); - else - kfree(buf); -} - -/* - * Memory allocation and accounting for kmem_* * style allocations. When - * DEBUG_KMEM is enabled the total memory allocated will be tracked and - * any memory leaked will be reported during module unload. - * - * ./configure --enable-debug-kmem - */ -#ifdef DEBUG_KMEM - -/* Shim layer memory accounting */ -#ifdef HAVE_ATOMIC64_T -atomic64_t kmem_alloc_used = ATOMIC64_INIT(0); -unsigned long long kmem_alloc_max = 0; -#else /* HAVE_ATOMIC64_T */ -atomic_t kmem_alloc_used = ATOMIC_INIT(0); -unsigned long long kmem_alloc_max = 0; -#endif /* HAVE_ATOMIC64_T */ - -EXPORT_SYMBOL(kmem_alloc_used); -EXPORT_SYMBOL(kmem_alloc_max); - -inline void * -spl_kmem_alloc_debug(size_t size, int flags, int node) -{ - void *ptr; - - ptr = spl_kmem_alloc_impl(size, flags, node); - if (ptr) { - kmem_alloc_used_add(size); - if (unlikely(kmem_alloc_used_read() > kmem_alloc_max)) - kmem_alloc_max = kmem_alloc_used_read(); - } - - return (ptr); -} - -inline void -spl_kmem_free_debug(const void *ptr, size_t size) -{ - kmem_alloc_used_sub(size); - spl_kmem_free_impl(ptr, size); -} - -/* - * When DEBUG_KMEM_TRACKING is enabled not only will total bytes be tracked - * but also the location of every alloc and free. When the SPL module is - * unloaded a list of all leaked addresses and where they were allocated - * will be dumped to the console. 
Enabling this feature has a significant - * impact on performance but it makes finding memory leaks straight forward. - * - * Not surprisingly with debugging enabled the xmem_locks are very highly - * contended particularly on xfree(). If we want to run with this detailed - * debugging enabled for anything other than debugging we need to minimize - * the contention by moving to a lock per xmem_table entry model. - * - * ./configure --enable-debug-kmem-tracking - */ -#ifdef DEBUG_KMEM_TRACKING - -#include <linux/hash.h> -#include <linux/ctype.h> - -#define KMEM_HASH_BITS 10 -#define KMEM_TABLE_SIZE (1 << KMEM_HASH_BITS) - -typedef struct kmem_debug { - struct hlist_node kd_hlist; /* Hash node linkage */ - struct list_head kd_list; /* List of all allocations */ - void *kd_addr; /* Allocation pointer */ - size_t kd_size; /* Allocation size */ - const char *kd_func; /* Allocation function */ - int kd_line; /* Allocation line */ -} kmem_debug_t; - -static spinlock_t kmem_lock; -static struct hlist_head kmem_table[KMEM_TABLE_SIZE]; -static struct list_head kmem_list; - -static kmem_debug_t * -kmem_del_init(spinlock_t *lock, struct hlist_head *table, - int bits, const void *addr) -{ - struct hlist_head *head; - struct hlist_node *node; - struct kmem_debug *p; - unsigned long flags; - - spin_lock_irqsave(lock, flags); - - head = &table[hash_ptr((void *)addr, bits)]; - hlist_for_each(node, head) { - p = list_entry(node, struct kmem_debug, kd_hlist); - if (p->kd_addr == addr) { - hlist_del_init(&p->kd_hlist); - list_del_init(&p->kd_list); - spin_unlock_irqrestore(lock, flags); - return (p); - } - } - - spin_unlock_irqrestore(lock, flags); - - return (NULL); -} - -inline void * -spl_kmem_alloc_track(size_t size, int flags, - const char *func, int line, int node) -{ - void *ptr = NULL; - kmem_debug_t *dptr; - unsigned long irq_flags; - - dptr = kmalloc(sizeof (kmem_debug_t), kmem_flags_convert(flags)); - if (dptr == NULL) - return (NULL); - - dptr->kd_func = __strdup(func, flags); - if (dptr->kd_func == NULL) { - kfree(dptr); - return (NULL); - } - - ptr = spl_kmem_alloc_debug(size, flags, node); - if (ptr == NULL) { - kfree(dptr->kd_func); - kfree(dptr); - return (NULL); - } - - INIT_HLIST_NODE(&dptr->kd_hlist); - INIT_LIST_HEAD(&dptr->kd_list); - - dptr->kd_addr = ptr; - dptr->kd_size = size; - dptr->kd_line = line; - - spin_lock_irqsave(&kmem_lock, irq_flags); - hlist_add_head(&dptr->kd_hlist, - &kmem_table[hash_ptr(ptr, KMEM_HASH_BITS)]); - list_add_tail(&dptr->kd_list, &kmem_list); - spin_unlock_irqrestore(&kmem_lock, irq_flags); - - return (ptr); -} - -inline void -spl_kmem_free_track(const void *ptr, size_t size) -{ - kmem_debug_t *dptr; - - /* Ignore NULL pointer since we haven't tracked it at all */ - if (ptr == NULL) - return; - - /* Must exist in hash due to kmem_alloc() */ - dptr = kmem_del_init(&kmem_lock, kmem_table, KMEM_HASH_BITS, ptr); - ASSERT3P(dptr, !=, NULL); - ASSERT3S(dptr->kd_size, ==, size); - - kfree(dptr->kd_func); - kfree(dptr); - - spl_kmem_free_debug(ptr, size); -} -#endif /* DEBUG_KMEM_TRACKING */ -#endif /* DEBUG_KMEM */ - -/* - * Public kmem_alloc(), kmem_zalloc() and kmem_free() interfaces. 
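- *
- * The (func, line) arguments identify the allocation site for the optional
- * DEBUG_KMEM_TRACKING accounting above. Callers presumably reach these
- * functions through header macros of roughly the following form (an assumed
- * sketch; the header is not part of this file):
- *
- *	#define	kmem_alloc(sz, fl)	spl_kmem_alloc((sz), (fl), __func__, __LINE__)
- *	#define	kmem_zalloc(sz, fl)	spl_kmem_zalloc((sz), (fl), __func__, __LINE__)
- *	#define	kmem_free(ptr, sz)	spl_kmem_free((ptr), (sz))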
- */ -void * -spl_kmem_alloc(size_t size, int flags, const char *func, int line) -{ - ASSERT0(flags & ~KM_PUBLIC_MASK); - -#if !defined(DEBUG_KMEM) - return (spl_kmem_alloc_impl(size, flags, NUMA_NO_NODE)); -#elif !defined(DEBUG_KMEM_TRACKING) - return (spl_kmem_alloc_debug(size, flags, NUMA_NO_NODE)); -#else - return (spl_kmem_alloc_track(size, flags, func, line, NUMA_NO_NODE)); -#endif -} -EXPORT_SYMBOL(spl_kmem_alloc); - -void * -spl_kmem_zalloc(size_t size, int flags, const char *func, int line) -{ - ASSERT0(flags & ~KM_PUBLIC_MASK); - - flags |= KM_ZERO; - -#if !defined(DEBUG_KMEM) - return (spl_kmem_alloc_impl(size, flags, NUMA_NO_NODE)); -#elif !defined(DEBUG_KMEM_TRACKING) - return (spl_kmem_alloc_debug(size, flags, NUMA_NO_NODE)); -#else - return (spl_kmem_alloc_track(size, flags, func, line, NUMA_NO_NODE)); -#endif -} -EXPORT_SYMBOL(spl_kmem_zalloc); - -void -spl_kmem_free(const void *buf, size_t size) -{ -#if !defined(DEBUG_KMEM) - return (spl_kmem_free_impl(buf, size)); -#elif !defined(DEBUG_KMEM_TRACKING) - return (spl_kmem_free_debug(buf, size)); -#else - return (spl_kmem_free_track(buf, size)); -#endif -} -EXPORT_SYMBOL(spl_kmem_free); - -#if defined(DEBUG_KMEM) && defined(DEBUG_KMEM_TRACKING) -static char * -spl_sprintf_addr(kmem_debug_t *kd, char *str, int len, int min) -{ - int size = ((len - 1) < kd->kd_size) ? (len - 1) : kd->kd_size; - int i, flag = 1; - - ASSERT(str != NULL && len >= 17); - memset(str, 0, len); - - /* - * Check for a fully printable string, and while we are at - * it place the printable characters in the passed buffer. - */ - for (i = 0; i < size; i++) { - str[i] = ((char *)(kd->kd_addr))[i]; - if (isprint(str[i])) { - continue; - } else { - /* - * Minimum number of printable characters found - * to make it worthwhile to print this as ascii. - */ - if (i > min) - break; - - flag = 0; - break; - } - } - - if (!flag) { - sprintf(str, "%02x%02x%02x%02x%02x%02x%02x%02x", - *((uint8_t *)kd->kd_addr), - *((uint8_t *)kd->kd_addr + 2), - *((uint8_t *)kd->kd_addr + 4), - *((uint8_t *)kd->kd_addr + 6), - *((uint8_t *)kd->kd_addr + 8), - *((uint8_t *)kd->kd_addr + 10), - *((uint8_t *)kd->kd_addr + 12), - *((uint8_t *)kd->kd_addr + 14)); - } - - return (str); -} - -static int -spl_kmem_init_tracking(struct list_head *list, spinlock_t *lock, int size) -{ - int i; - - spin_lock_init(lock); - INIT_LIST_HEAD(list); - - for (i = 0; i < size; i++) - INIT_HLIST_HEAD(&kmem_table[i]); - - return (0); -} - -static void -spl_kmem_fini_tracking(struct list_head *list, spinlock_t *lock) -{ - unsigned long flags; - kmem_debug_t *kd; - char str[17]; - - spin_lock_irqsave(lock, flags); - if (!list_empty(list)) - printk(KERN_WARNING "%-16s %-5s %-16s %s:%s\n", "address", - "size", "data", "func", "line"); - - list_for_each_entry(kd, list, kd_list) { - printk(KERN_WARNING "%p %-5d %-16s %s:%d\n", kd->kd_addr, - (int)kd->kd_size, spl_sprintf_addr(kd, str, 17, 8), - kd->kd_func, kd->kd_line); - } - - spin_unlock_irqrestore(lock, flags); -} -#endif /* DEBUG_KMEM && DEBUG_KMEM_TRACKING */ - -int -spl_kmem_init(void) -{ - -#ifdef DEBUG_KMEM - kmem_alloc_used_set(0); - - - -#ifdef DEBUG_KMEM_TRACKING - spl_kmem_init_tracking(&kmem_list, &kmem_lock, KMEM_TABLE_SIZE); -#endif /* DEBUG_KMEM_TRACKING */ -#endif /* DEBUG_KMEM */ - - return (0); -} - -void -spl_kmem_fini(void) -{ -#ifdef DEBUG_KMEM - /* - * Display all unreclaimed memory addresses, including the - * allocation size and the first few bytes of what's located - * at that address to aid in debugging. 
Performance is not - * a serious concern here since it is module unload time. - */ - if (kmem_alloc_used_read() != 0) - printk(KERN_WARNING "kmem leaked %ld/%llu bytes\n", - (unsigned long)kmem_alloc_used_read(), kmem_alloc_max); - -#ifdef DEBUG_KMEM_TRACKING - spl_kmem_fini_tracking(&kmem_list, &kmem_lock); -#endif /* DEBUG_KMEM_TRACKING */ -#endif /* DEBUG_KMEM */ -} diff --git a/module/spl/spl-kobj.c b/module/spl/spl-kobj.c deleted file mode 100644 index 7019369bd..000000000 --- a/module/spl/spl-kobj.c +++ /dev/null @@ -1,86 +0,0 @@ -/* - * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. - * Copyright (C) 2007 The Regents of the University of California. - * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). - * Written by Brian Behlendorf <[email protected]>. - * UCRL-CODE-235197 - * - * This file is part of the SPL, Solaris Porting Layer. - * For details, see <http://zfsonlinux.org/>. - * - * The SPL is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the - * Free Software Foundation; either version 2 of the License, or (at your - * option) any later version. - * - * The SPL is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - * for more details. - * - * You should have received a copy of the GNU General Public License along - * with the SPL. If not, see <http://www.gnu.org/licenses/>. - * - * Solaris Porting Layer (SPL) Kobj Implementation. - */ - -#include <sys/kobj.h> - -struct _buf * -kobj_open_file(const char *name) -{ - struct _buf *file; - vnode_t *vp; - int rc; - - file = kmalloc(sizeof (_buf_t), kmem_flags_convert(KM_SLEEP)); - if (file == NULL) - return ((_buf_t *)-1UL); - - if ((rc = vn_open(name, UIO_SYSSPACE, FREAD, 0644, &vp, 0, 0))) { - kfree(file); - return ((_buf_t *)-1UL); - } - - file->vp = vp; - - return (file); -} /* kobj_open_file() */ -EXPORT_SYMBOL(kobj_open_file); - -void -kobj_close_file(struct _buf *file) -{ - VOP_CLOSE(file->vp, 0, 0, 0, 0, 0); - kfree(file); -} /* kobj_close_file() */ -EXPORT_SYMBOL(kobj_close_file); - -int -kobj_read_file(struct _buf *file, char *buf, unsigned size, unsigned off) -{ - ssize_t resid; - - if (vn_rdwr(UIO_READ, file->vp, buf, size, (offset_t)off, - UIO_SYSSPACE, 0, 0, 0, &resid) != 0) - return (-1); - - return (size - resid); -} /* kobj_read_file() */ -EXPORT_SYMBOL(kobj_read_file); - -int -kobj_get_filesize(struct _buf *file, uint64_t *size) -{ - vattr_t vap; - int rc; - - rc = VOP_GETATTR(file->vp, &vap, 0, 0, NULL); - if (rc) - return (rc); - - *size = vap.va_size; - - return (rc); -} /* kobj_get_filesize() */ -EXPORT_SYMBOL(kobj_get_filesize); diff --git a/module/spl/spl-kstat.c b/module/spl/spl-kstat.c deleted file mode 100644 index 1f67bf157..000000000 --- a/module/spl/spl-kstat.c +++ /dev/null @@ -1,770 +0,0 @@ -/* - * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. - * Copyright (C) 2007 The Regents of the University of California. - * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). - * Written by Brian Behlendorf <[email protected]>. - * UCRL-CODE-235197 - * - * This file is part of the SPL, Solaris Porting Layer. - * For details, see <http://zfsonlinux.org/>. 
- * - * The SPL is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the - * Free Software Foundation; either version 2 of the License, or (at your - * option) any later version. - * - * The SPL is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - * for more details. - * - * You should have received a copy of the GNU General Public License along - * with the SPL. If not, see <http://www.gnu.org/licenses/>. - * - * Solaris Porting Layer (SPL) Kstat Implementation. - */ - -#include <linux/seq_file.h> -#include <sys/kstat.h> -#include <sys/vmem.h> -#include <sys/cmn_err.h> -#include <sys/sysmacros.h> - -static kmutex_t kstat_module_lock; -static struct list_head kstat_module_list; -static kid_t kstat_id; - -static int -kstat_resize_raw(kstat_t *ksp) -{ - if (ksp->ks_raw_bufsize == KSTAT_RAW_MAX) - return (ENOMEM); - - vmem_free(ksp->ks_raw_buf, ksp->ks_raw_bufsize); - ksp->ks_raw_bufsize = MIN(ksp->ks_raw_bufsize * 2, KSTAT_RAW_MAX); - ksp->ks_raw_buf = vmem_alloc(ksp->ks_raw_bufsize, KM_SLEEP); - - return (0); -} - -void -kstat_waitq_enter(kstat_io_t *kiop) -{ - hrtime_t new, delta; - ulong_t wcnt; - - new = gethrtime(); - delta = new - kiop->wlastupdate; - kiop->wlastupdate = new; - wcnt = kiop->wcnt++; - if (wcnt != 0) { - kiop->wlentime += delta * wcnt; - kiop->wtime += delta; - } -} -EXPORT_SYMBOL(kstat_waitq_enter); - -void -kstat_waitq_exit(kstat_io_t *kiop) -{ - hrtime_t new, delta; - ulong_t wcnt; - - new = gethrtime(); - delta = new - kiop->wlastupdate; - kiop->wlastupdate = new; - wcnt = kiop->wcnt--; - ASSERT((int)wcnt > 0); - kiop->wlentime += delta * wcnt; - kiop->wtime += delta; -} -EXPORT_SYMBOL(kstat_waitq_exit); - -void -kstat_runq_enter(kstat_io_t *kiop) -{ - hrtime_t new, delta; - ulong_t rcnt; - - new = gethrtime(); - delta = new - kiop->rlastupdate; - kiop->rlastupdate = new; - rcnt = kiop->rcnt++; - if (rcnt != 0) { - kiop->rlentime += delta * rcnt; - kiop->rtime += delta; - } -} -EXPORT_SYMBOL(kstat_runq_enter); - -void -kstat_runq_exit(kstat_io_t *kiop) -{ - hrtime_t new, delta; - ulong_t rcnt; - - new = gethrtime(); - delta = new - kiop->rlastupdate; - kiop->rlastupdate = new; - rcnt = kiop->rcnt--; - ASSERT((int)rcnt > 0); - kiop->rlentime += delta * rcnt; - kiop->rtime += delta; -} -EXPORT_SYMBOL(kstat_runq_exit); - -static int -kstat_seq_show_headers(struct seq_file *f) -{ - kstat_t *ksp = (kstat_t *)f->private; - int rc = 0; - - ASSERT(ksp->ks_magic == KS_MAGIC); - - seq_printf(f, "%d %d 0x%02x %d %d %lld %lld\n", - ksp->ks_kid, ksp->ks_type, ksp->ks_flags, - ksp->ks_ndata, (int)ksp->ks_data_size, - ksp->ks_crtime, ksp->ks_snaptime); - - switch (ksp->ks_type) { - case KSTAT_TYPE_RAW: -restart: - if (ksp->ks_raw_ops.headers) { - rc = ksp->ks_raw_ops.headers( - ksp->ks_raw_buf, ksp->ks_raw_bufsize); - if (rc == ENOMEM && !kstat_resize_raw(ksp)) - goto restart; - if (!rc) - seq_puts(f, ksp->ks_raw_buf); - } else { - seq_printf(f, "raw data\n"); - } - break; - case KSTAT_TYPE_NAMED: - seq_printf(f, "%-31s %-4s %s\n", - "name", "type", "data"); - break; - case KSTAT_TYPE_INTR: - seq_printf(f, "%-8s %-8s %-8s %-8s %-8s\n", - "hard", "soft", "watchdog", - "spurious", "multsvc"); - break; - case KSTAT_TYPE_IO: - seq_printf(f, - "%-8s %-8s %-8s %-8s %-8s %-8s " - "%-8s %-8s %-8s %-8s %-8s %-8s\n", - "nread", "nwritten", "reads", "writes", - 
"wtime", "wlentime", "wupdate", - "rtime", "rlentime", "rupdate", - "wcnt", "rcnt"); - break; - case KSTAT_TYPE_TIMER: - seq_printf(f, - "%-31s %-8s " - "%-8s %-8s %-8s %-8s %-8s\n", - "name", "events", "elapsed", - "min", "max", "start", "stop"); - break; - default: - PANIC("Undefined kstat type %d\n", ksp->ks_type); - } - - return (-rc); -} - -static int -kstat_seq_show_raw(struct seq_file *f, unsigned char *p, int l) -{ - int i, j; - - for (i = 0; ; i++) { - seq_printf(f, "%03x:", i); - - for (j = 0; j < 16; j++) { - if (i * 16 + j >= l) { - seq_printf(f, "\n"); - goto out; - } - - seq_printf(f, " %02x", (unsigned char)p[i * 16 + j]); - } - seq_printf(f, "\n"); - } -out: - return (0); -} - -static int -kstat_seq_show_named(struct seq_file *f, kstat_named_t *knp) -{ - seq_printf(f, "%-31s %-4d ", knp->name, knp->data_type); - - switch (knp->data_type) { - case KSTAT_DATA_CHAR: - knp->value.c[15] = '\0'; /* NULL terminate */ - seq_printf(f, "%-16s", knp->value.c); - break; - /* - * NOTE - We need to be more careful able what tokens are - * used for each arch, for now this is correct for x86_64. - */ - case KSTAT_DATA_INT32: - seq_printf(f, "%d", knp->value.i32); - break; - case KSTAT_DATA_UINT32: - seq_printf(f, "%u", knp->value.ui32); - break; - case KSTAT_DATA_INT64: - seq_printf(f, "%lld", (signed long long)knp->value.i64); - break; - case KSTAT_DATA_UINT64: - seq_printf(f, "%llu", - (unsigned long long)knp->value.ui64); - break; - case KSTAT_DATA_LONG: - seq_printf(f, "%ld", knp->value.l); - break; - case KSTAT_DATA_ULONG: - seq_printf(f, "%lu", knp->value.ul); - break; - case KSTAT_DATA_STRING: - KSTAT_NAMED_STR_PTR(knp) - [KSTAT_NAMED_STR_BUFLEN(knp)-1] = '\0'; - seq_printf(f, "%s", KSTAT_NAMED_STR_PTR(knp)); - break; - default: - PANIC("Undefined kstat data type %d\n", knp->data_type); - } - - seq_printf(f, "\n"); - - return (0); -} - -static int -kstat_seq_show_intr(struct seq_file *f, kstat_intr_t *kip) -{ - seq_printf(f, "%-8u %-8u %-8u %-8u %-8u\n", - kip->intrs[KSTAT_INTR_HARD], - kip->intrs[KSTAT_INTR_SOFT], - kip->intrs[KSTAT_INTR_WATCHDOG], - kip->intrs[KSTAT_INTR_SPURIOUS], - kip->intrs[KSTAT_INTR_MULTSVC]); - - return (0); -} - -static int -kstat_seq_show_io(struct seq_file *f, kstat_io_t *kip) -{ - /* though wlentime & friends are signed, they will never be negative */ - seq_printf(f, - "%-8llu %-8llu %-8u %-8u %-8llu %-8llu " - "%-8llu %-8llu %-8llu %-8llu %-8u %-8u\n", - kip->nread, kip->nwritten, - kip->reads, kip->writes, - kip->wtime, kip->wlentime, kip->wlastupdate, - kip->rtime, kip->rlentime, kip->rlastupdate, - kip->wcnt, kip->rcnt); - - return (0); -} - -static int -kstat_seq_show_timer(struct seq_file *f, kstat_timer_t *ktp) -{ - seq_printf(f, - "%-31s %-8llu %-8llu %-8llu %-8llu %-8llu %-8llu\n", - ktp->name, ktp->num_events, ktp->elapsed_time, - ktp->min_time, ktp->max_time, - ktp->start_time, ktp->stop_time); - - return (0); -} - -static int -kstat_seq_show(struct seq_file *f, void *p) -{ - kstat_t *ksp = (kstat_t *)f->private; - int rc = 0; - - ASSERT(ksp->ks_magic == KS_MAGIC); - - switch (ksp->ks_type) { - case KSTAT_TYPE_RAW: -restart: - if (ksp->ks_raw_ops.data) { - rc = ksp->ks_raw_ops.data( - ksp->ks_raw_buf, ksp->ks_raw_bufsize, p); - if (rc == ENOMEM && !kstat_resize_raw(ksp)) - goto restart; - if (!rc) - seq_puts(f, ksp->ks_raw_buf); - } else { - ASSERT(ksp->ks_ndata == 1); - rc = kstat_seq_show_raw(f, ksp->ks_data, - ksp->ks_data_size); - } - break; - case KSTAT_TYPE_NAMED: - rc = kstat_seq_show_named(f, (kstat_named_t *)p); - break; - case 
KSTAT_TYPE_INTR: - rc = kstat_seq_show_intr(f, (kstat_intr_t *)p); - break; - case KSTAT_TYPE_IO: - rc = kstat_seq_show_io(f, (kstat_io_t *)p); - break; - case KSTAT_TYPE_TIMER: - rc = kstat_seq_show_timer(f, (kstat_timer_t *)p); - break; - default: - PANIC("Undefined kstat type %d\n", ksp->ks_type); - } - - return (-rc); -} - -static int -kstat_default_update(kstat_t *ksp, int rw) -{ - ASSERT(ksp != NULL); - - if (rw == KSTAT_WRITE) - return (EACCES); - - return (0); -} - -static void * -kstat_seq_data_addr(kstat_t *ksp, loff_t n) -{ - void *rc = NULL; - - switch (ksp->ks_type) { - case KSTAT_TYPE_RAW: - if (ksp->ks_raw_ops.addr) - rc = ksp->ks_raw_ops.addr(ksp, n); - else - rc = ksp->ks_data; - break; - case KSTAT_TYPE_NAMED: - rc = ksp->ks_data + n * sizeof (kstat_named_t); - break; - case KSTAT_TYPE_INTR: - rc = ksp->ks_data + n * sizeof (kstat_intr_t); - break; - case KSTAT_TYPE_IO: - rc = ksp->ks_data + n * sizeof (kstat_io_t); - break; - case KSTAT_TYPE_TIMER: - rc = ksp->ks_data + n * sizeof (kstat_timer_t); - break; - default: - PANIC("Undefined kstat type %d\n", ksp->ks_type); - } - - return (rc); -} - -static void * -kstat_seq_start(struct seq_file *f, loff_t *pos) -{ - loff_t n = *pos; - kstat_t *ksp = (kstat_t *)f->private; - ASSERT(ksp->ks_magic == KS_MAGIC); - - mutex_enter(ksp->ks_lock); - - if (ksp->ks_type == KSTAT_TYPE_RAW) { - ksp->ks_raw_bufsize = PAGE_SIZE; - ksp->ks_raw_buf = vmem_alloc(ksp->ks_raw_bufsize, KM_SLEEP); - } - - /* Dynamically update kstat, on error existing kstats are used */ - (void) ksp->ks_update(ksp, KSTAT_READ); - - ksp->ks_snaptime = gethrtime(); - - if (!(ksp->ks_flags & KSTAT_FLAG_NO_HEADERS) && !n && - kstat_seq_show_headers(f)) - return (NULL); - - if (n >= ksp->ks_ndata) - return (NULL); - - return (kstat_seq_data_addr(ksp, n)); -} - -static void * -kstat_seq_next(struct seq_file *f, void *p, loff_t *pos) -{ - kstat_t *ksp = (kstat_t *)f->private; - ASSERT(ksp->ks_magic == KS_MAGIC); - - ++*pos; - if (*pos >= ksp->ks_ndata) - return (NULL); - - return (kstat_seq_data_addr(ksp, *pos)); -} - -static void -kstat_seq_stop(struct seq_file *f, void *v) -{ - kstat_t *ksp = (kstat_t *)f->private; - ASSERT(ksp->ks_magic == KS_MAGIC); - - if (ksp->ks_type == KSTAT_TYPE_RAW) - vmem_free(ksp->ks_raw_buf, ksp->ks_raw_bufsize); - - mutex_exit(ksp->ks_lock); -} - -static struct seq_operations kstat_seq_ops = { - .show = kstat_seq_show, - .start = kstat_seq_start, - .next = kstat_seq_next, - .stop = kstat_seq_stop, -}; - -static kstat_module_t * -kstat_find_module(char *name) -{ - kstat_module_t *module; - - list_for_each_entry(module, &kstat_module_list, ksm_module_list) { - if (strncmp(name, module->ksm_name, KSTAT_STRLEN) == 0) - return (module); - } - - return (NULL); -} - -static kstat_module_t * -kstat_create_module(char *name) -{ - kstat_module_t *module; - struct proc_dir_entry *pde; - - pde = proc_mkdir(name, proc_spl_kstat); - if (pde == NULL) - return (NULL); - - module = kmem_alloc(sizeof (kstat_module_t), KM_SLEEP); - module->ksm_proc = pde; - strlcpy(module->ksm_name, name, KSTAT_STRLEN+1); - INIT_LIST_HEAD(&module->ksm_kstat_list); - list_add_tail(&module->ksm_module_list, &kstat_module_list); - - return (module); - -} - -static void -kstat_delete_module(kstat_module_t *module) -{ - ASSERT(list_empty(&module->ksm_kstat_list)); - remove_proc_entry(module->ksm_name, proc_spl_kstat); - list_del(&module->ksm_module_list); - kmem_free(module, sizeof (kstat_module_t)); -} - -static int -proc_kstat_open(struct inode *inode, struct file *filp) -{ - 
struct seq_file *f; - int rc; - - rc = seq_open(filp, &kstat_seq_ops); - if (rc) - return (rc); - - f = filp->private_data; - f->private = PDE_DATA(inode); - - return (rc); -} - -static ssize_t -proc_kstat_write(struct file *filp, const char __user *buf, size_t len, - loff_t *ppos) -{ - struct seq_file *f = filp->private_data; - kstat_t *ksp = f->private; - int rc; - - ASSERT(ksp->ks_magic == KS_MAGIC); - - mutex_enter(ksp->ks_lock); - rc = ksp->ks_update(ksp, KSTAT_WRITE); - mutex_exit(ksp->ks_lock); - - if (rc) - return (-rc); - - *ppos += len; - return (len); -} - -static struct file_operations proc_kstat_operations = { - .open = proc_kstat_open, - .write = proc_kstat_write, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - -void -__kstat_set_raw_ops(kstat_t *ksp, - int (*headers)(char *buf, size_t size), - int (*data)(char *buf, size_t size, void *data), - void *(*addr)(kstat_t *ksp, loff_t index)) -{ - ksp->ks_raw_ops.headers = headers; - ksp->ks_raw_ops.data = data; - ksp->ks_raw_ops.addr = addr; -} -EXPORT_SYMBOL(__kstat_set_raw_ops); - -void -kstat_proc_entry_init(kstat_proc_entry_t *kpep, const char *module, - const char *name) -{ - kpep->kpe_owner = NULL; - kpep->kpe_proc = NULL; - INIT_LIST_HEAD(&kpep->kpe_list); - strncpy(kpep->kpe_module, module, KSTAT_STRLEN); - strncpy(kpep->kpe_name, name, KSTAT_STRLEN); -} -EXPORT_SYMBOL(kstat_proc_entry_init); - -kstat_t * -__kstat_create(const char *ks_module, int ks_instance, const char *ks_name, - const char *ks_class, uchar_t ks_type, uint_t ks_ndata, - uchar_t ks_flags) -{ - kstat_t *ksp; - - ASSERT(ks_module); - ASSERT(ks_instance == 0); - ASSERT(ks_name); - - if ((ks_type == KSTAT_TYPE_INTR) || (ks_type == KSTAT_TYPE_IO)) - ASSERT(ks_ndata == 1); - - ksp = kmem_zalloc(sizeof (*ksp), KM_SLEEP); - if (ksp == NULL) - return (ksp); - - mutex_enter(&kstat_module_lock); - ksp->ks_kid = kstat_id; - kstat_id++; - mutex_exit(&kstat_module_lock); - - ksp->ks_magic = KS_MAGIC; - mutex_init(&ksp->ks_private_lock, NULL, MUTEX_DEFAULT, NULL); - ksp->ks_lock = &ksp->ks_private_lock; - - ksp->ks_crtime = gethrtime(); - ksp->ks_snaptime = ksp->ks_crtime; - ksp->ks_instance = ks_instance; - strncpy(ksp->ks_class, ks_class, KSTAT_STRLEN); - ksp->ks_type = ks_type; - ksp->ks_flags = ks_flags; - ksp->ks_update = kstat_default_update; - ksp->ks_private = NULL; - ksp->ks_raw_ops.headers = NULL; - ksp->ks_raw_ops.data = NULL; - ksp->ks_raw_ops.addr = NULL; - ksp->ks_raw_buf = NULL; - ksp->ks_raw_bufsize = 0; - kstat_proc_entry_init(&ksp->ks_proc, ks_module, ks_name); - - switch (ksp->ks_type) { - case KSTAT_TYPE_RAW: - ksp->ks_ndata = 1; - ksp->ks_data_size = ks_ndata; - break; - case KSTAT_TYPE_NAMED: - ksp->ks_ndata = ks_ndata; - ksp->ks_data_size = ks_ndata * sizeof (kstat_named_t); - break; - case KSTAT_TYPE_INTR: - ksp->ks_ndata = ks_ndata; - ksp->ks_data_size = ks_ndata * sizeof (kstat_intr_t); - break; - case KSTAT_TYPE_IO: - ksp->ks_ndata = ks_ndata; - ksp->ks_data_size = ks_ndata * sizeof (kstat_io_t); - break; - case KSTAT_TYPE_TIMER: - ksp->ks_ndata = ks_ndata; - ksp->ks_data_size = ks_ndata * sizeof (kstat_timer_t); - break; - default: - PANIC("Undefined kstat type %d\n", ksp->ks_type); - } - - if (ksp->ks_flags & KSTAT_FLAG_VIRTUAL) { - ksp->ks_data = NULL; - } else { - ksp->ks_data = kmem_zalloc(ksp->ks_data_size, KM_SLEEP); - if (ksp->ks_data == NULL) { - kmem_free(ksp, sizeof (*ksp)); - ksp = NULL; - } - } - - return (ksp); -} -EXPORT_SYMBOL(__kstat_create); - -static int -kstat_detect_collision(kstat_proc_entry_t 
*kpep) -{ - kstat_module_t *module; - kstat_proc_entry_t *tmp; - char *parent; - char *cp; - - parent = kmem_asprintf("%s", kpep->kpe_module); - - if ((cp = strrchr(parent, '/')) == NULL) { - strfree(parent); - return (0); - } - - cp[0] = '\0'; - if ((module = kstat_find_module(parent)) != NULL) { - list_for_each_entry(tmp, &module->ksm_kstat_list, kpe_list) { - if (strncmp(tmp->kpe_name, cp+1, KSTAT_STRLEN) == 0) { - strfree(parent); - return (EEXIST); - } - } - } - - strfree(parent); - return (0); -} - -/* - * Add a file to the proc filesystem under the kstat namespace (i.e. - * /proc/spl/kstat/). The file need not necessarily be implemented as a - * kstat. - */ -void -kstat_proc_entry_install(kstat_proc_entry_t *kpep, mode_t mode, - const struct file_operations *file_ops, void *data) -{ - kstat_module_t *module; - kstat_proc_entry_t *tmp; - - ASSERT(kpep); - - mutex_enter(&kstat_module_lock); - - module = kstat_find_module(kpep->kpe_module); - if (module == NULL) { - if (kstat_detect_collision(kpep) != 0) { - cmn_err(CE_WARN, "kstat_create('%s', '%s'): namespace" \ - " collision", kpep->kpe_module, kpep->kpe_name); - goto out; - } - module = kstat_create_module(kpep->kpe_module); - if (module == NULL) - goto out; - } - - /* - * Only one entry by this name per-module, on failure the module - * shouldn't be deleted because we know it has at least one entry. - */ - list_for_each_entry(tmp, &module->ksm_kstat_list, kpe_list) { - if (strncmp(tmp->kpe_name, kpep->kpe_name, KSTAT_STRLEN) == 0) - goto out; - } - - list_add_tail(&kpep->kpe_list, &module->ksm_kstat_list); - - kpep->kpe_owner = module; - kpep->kpe_proc = proc_create_data(kpep->kpe_name, mode, - module->ksm_proc, file_ops, data); - if (kpep->kpe_proc == NULL) { - list_del_init(&kpep->kpe_list); - if (list_empty(&module->ksm_kstat_list)) - kstat_delete_module(module); - } -out: - mutex_exit(&kstat_module_lock); - -} -EXPORT_SYMBOL(kstat_proc_entry_install); - -void -__kstat_install(kstat_t *ksp) -{ - ASSERT(ksp); - mode_t mode; - /* Specify permission modes for different kstats */ - if (strncmp(ksp->ks_proc.kpe_name, "dbufs", KSTAT_STRLEN) == 0) { - mode = 0600; - } else { - mode = 0644; - } - kstat_proc_entry_install( - &ksp->ks_proc, mode, &proc_kstat_operations, ksp); -} -EXPORT_SYMBOL(__kstat_install); - -void -kstat_proc_entry_delete(kstat_proc_entry_t *kpep) -{ - kstat_module_t *module = kpep->kpe_owner; - if (kpep->kpe_proc) - remove_proc_entry(kpep->kpe_name, module->ksm_proc); - - mutex_enter(&kstat_module_lock); - list_del_init(&kpep->kpe_list); - - /* - * Remove top level module directory if it wasn't empty before, but now - * is. 
- */ - if (kpep->kpe_proc && list_empty(&module->ksm_kstat_list)) - kstat_delete_module(module); - mutex_exit(&kstat_module_lock); - -} -EXPORT_SYMBOL(kstat_proc_entry_delete); - -void -__kstat_delete(kstat_t *ksp) -{ - kstat_proc_entry_delete(&ksp->ks_proc); - - if (!(ksp->ks_flags & KSTAT_FLAG_VIRTUAL)) - kmem_free(ksp->ks_data, ksp->ks_data_size); - - ksp->ks_lock = NULL; - mutex_destroy(&ksp->ks_private_lock); - kmem_free(ksp, sizeof (*ksp)); -} -EXPORT_SYMBOL(__kstat_delete); - -int -spl_kstat_init(void) -{ - mutex_init(&kstat_module_lock, NULL, MUTEX_DEFAULT, NULL); - INIT_LIST_HEAD(&kstat_module_list); - kstat_id = 0; - return (0); -} - -void -spl_kstat_fini(void) -{ - ASSERT(list_empty(&kstat_module_list)); - mutex_destroy(&kstat_module_lock); -} diff --git a/module/spl/spl-proc.c b/module/spl/spl-proc.c deleted file mode 100644 index a75bcc214..000000000 --- a/module/spl/spl-proc.c +++ /dev/null @@ -1,782 +0,0 @@ -/* - * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. - * Copyright (C) 2007 The Regents of the University of California. - * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). - * Written by Brian Behlendorf <[email protected]>. - * UCRL-CODE-235197 - * - * This file is part of the SPL, Solaris Porting Layer. - * For details, see <http://zfsonlinux.org/>. - * - * The SPL is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the - * Free Software Foundation; either version 2 of the License, or (at your - * option) any later version. - * - * The SPL is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - * for more details. - * - * You should have received a copy of the GNU General Public License along - * with the SPL. If not, see <http://www.gnu.org/licenses/>. - * - * Solaris Porting Layer (SPL) Proc Implementation. 
- */ - -#include <sys/systeminfo.h> -#include <sys/kstat.h> -#include <sys/kmem.h> -#include <sys/kmem_cache.h> -#include <sys/vmem.h> -#include <sys/taskq.h> -#include <sys/proc.h> -#include <linux/ctype.h> -#include <linux/kmod.h> -#include <linux/seq_file.h> -#include <linux/uaccess.h> -#include <linux/version.h> - -#if defined(CONSTIFY_PLUGIN) && LINUX_VERSION_CODE >= KERNEL_VERSION(3, 8, 0) -typedef struct ctl_table __no_const spl_ctl_table; -#else -typedef struct ctl_table spl_ctl_table; -#endif - -static unsigned long table_min = 0; -static unsigned long table_max = ~0; - -static struct ctl_table_header *spl_header = NULL; -static struct proc_dir_entry *proc_spl = NULL; -static struct proc_dir_entry *proc_spl_kmem = NULL; -static struct proc_dir_entry *proc_spl_kmem_slab = NULL; -static struct proc_dir_entry *proc_spl_taskq_all = NULL; -static struct proc_dir_entry *proc_spl_taskq = NULL; -struct proc_dir_entry *proc_spl_kstat = NULL; - -static int -proc_copyin_string(char *kbuffer, int kbuffer_size, const char *ubuffer, - int ubuffer_size) -{ - int size; - - if (ubuffer_size > kbuffer_size) - return (-EOVERFLOW); - - if (copy_from_user((void *)kbuffer, (void *)ubuffer, ubuffer_size)) - return (-EFAULT); - - /* strip trailing whitespace */ - size = strnlen(kbuffer, ubuffer_size); - while (size-- >= 0) - if (!isspace(kbuffer[size])) - break; - - /* empty string */ - if (size < 0) - return (-EINVAL); - - /* no space to terminate */ - if (size == kbuffer_size) - return (-EOVERFLOW); - - kbuffer[size + 1] = 0; - return (0); -} - -static int -proc_copyout_string(char *ubuffer, int ubuffer_size, const char *kbuffer, - char *append) -{ - /* - * NB if 'append' != NULL, it's a single character to append to the - * copied out string - usually "\n", for /proc entries and - * (i.e. 
a terminating zero byte) for sysctl entries - */ - int size = MIN(strlen(kbuffer), ubuffer_size); - - if (copy_to_user(ubuffer, kbuffer, size)) - return (-EFAULT); - - if (append != NULL && size < ubuffer_size) { - if (copy_to_user(ubuffer + size, append, 1)) - return (-EFAULT); - - size++; - } - - return (size); -} - -#ifdef DEBUG_KMEM -static int -proc_domemused(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - int rc = 0; - unsigned long min = 0, max = ~0, val; - spl_ctl_table dummy = *table; - - dummy.data = &val; - dummy.proc_handler = &proc_dointvec; - dummy.extra1 = &min; - dummy.extra2 = &max; - - if (write) { - *ppos += *lenp; - } else { -#ifdef HAVE_ATOMIC64_T - val = atomic64_read((atomic64_t *)table->data); -#else - val = atomic_read((atomic_t *)table->data); -#endif /* HAVE_ATOMIC64_T */ - rc = proc_doulongvec_minmax(&dummy, write, buffer, lenp, ppos); - } - - return (rc); -} -#endif /* DEBUG_KMEM */ - -static int -proc_doslab(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - int rc = 0; - unsigned long min = 0, max = ~0, val = 0, mask; - spl_ctl_table dummy = *table; - spl_kmem_cache_t *skc; - - dummy.data = &val; - dummy.proc_handler = &proc_dointvec; - dummy.extra1 = &min; - dummy.extra2 = &max; - - if (write) { - *ppos += *lenp; - } else { - down_read(&spl_kmem_cache_sem); - mask = (unsigned long)table->data; - - list_for_each_entry(skc, &spl_kmem_cache_list, skc_list) { - - /* Only use slabs of the correct kmem/vmem type */ - if (!(skc->skc_flags & mask)) - continue; - - /* Sum the specified field for selected slabs */ - switch (mask & (KMC_TOTAL | KMC_ALLOC | KMC_MAX)) { - case KMC_TOTAL: - val += skc->skc_slab_size * skc->skc_slab_total; - break; - case KMC_ALLOC: - val += skc->skc_obj_size * skc->skc_obj_alloc; - break; - case KMC_MAX: - val += skc->skc_obj_size * skc->skc_obj_max; - break; - } - } - - up_read(&spl_kmem_cache_sem); - rc = proc_doulongvec_minmax(&dummy, write, buffer, lenp, ppos); - } - - return (rc); -} - -static int -proc_dohostid(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - int len, rc = 0; - char *end, str[32]; - - if (write) { - /* - * We can't use proc_doulongvec_minmax() in the write - * case here because hostid while a hex value has no - * leading 0x which confuses the helper function. 
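- *
- * For example, a write of "3f001234" (an arbitrary hostid) is parsed by
- * the simple_strtoul() call below with an explicit base of 16, whereas the
- * generic helper would only treat it as hex with a leading "0x".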
- */ - rc = proc_copyin_string(str, sizeof (str), buffer, *lenp); - if (rc < 0) - return (rc); - - spl_hostid = simple_strtoul(str, &end, 16); - if (str == end) - return (-EINVAL); - - } else { - len = snprintf(str, sizeof (str), "%lx", - (unsigned long) zone_get_hostid(NULL)); - if (*ppos >= len) - rc = 0; - else - rc = proc_copyout_string(buffer, - *lenp, str + *ppos, "\n"); - - if (rc >= 0) { - *lenp = rc; - *ppos += rc; - } - } - - return (rc); -} - -static void -taskq_seq_show_headers(struct seq_file *f) -{ - seq_printf(f, "%-25s %5s %5s %5s %5s %5s %5s %12s %5s %10s\n", - "taskq", "act", "nthr", "spwn", "maxt", "pri", - "mina", "maxa", "cura", "flags"); -} - -/* indices into the lheads array below */ -#define LHEAD_PEND 0 -#define LHEAD_PRIO 1 -#define LHEAD_DELAY 2 -#define LHEAD_WAIT 3 -#define LHEAD_ACTIVE 4 -#define LHEAD_SIZE 5 - -/* BEGIN CSTYLED */ -static unsigned int spl_max_show_tasks = 512; -module_param(spl_max_show_tasks, uint, 0644); -MODULE_PARM_DESC(spl_max_show_tasks, "Max number of tasks shown in taskq proc"); -/* END CSTYLED */ - -static int -taskq_seq_show_impl(struct seq_file *f, void *p, boolean_t allflag) -{ - taskq_t *tq = p; - taskq_thread_t *tqt; - spl_wait_queue_entry_t *wq; - struct task_struct *tsk; - taskq_ent_t *tqe; - char name[100]; - struct list_head *lheads[LHEAD_SIZE], *lh; - static char *list_names[LHEAD_SIZE] = - {"pend", "prio", "delay", "wait", "active" }; - int i, j, have_lheads = 0; - unsigned long wflags, flags; - - spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class); - spin_lock_irqsave(&tq->tq_wait_waitq.lock, wflags); - - /* get the various lists and check whether they're empty */ - lheads[LHEAD_PEND] = &tq->tq_pend_list; - lheads[LHEAD_PRIO] = &tq->tq_prio_list; - lheads[LHEAD_DELAY] = &tq->tq_delay_list; -#ifdef HAVE_WAIT_QUEUE_HEAD_ENTRY - lheads[LHEAD_WAIT] = &tq->tq_wait_waitq.head; -#else - lheads[LHEAD_WAIT] = &tq->tq_wait_waitq.task_list; -#endif - lheads[LHEAD_ACTIVE] = &tq->tq_active_list; - - for (i = 0; i < LHEAD_SIZE; ++i) { - if (list_empty(lheads[i])) - lheads[i] = NULL; - else - ++have_lheads; - } - - /* early return in non-"all" mode if lists are all empty */ - if (!allflag && !have_lheads) { - spin_unlock_irqrestore(&tq->tq_wait_waitq.lock, wflags); - spin_unlock_irqrestore(&tq->tq_lock, flags); - return (0); - } - - /* unlock the waitq quickly */ - if (!lheads[LHEAD_WAIT]) - spin_unlock_irqrestore(&tq->tq_wait_waitq.lock, wflags); - - /* show the base taskq contents */ - snprintf(name, sizeof (name), "%s/%d", tq->tq_name, tq->tq_instance); - seq_printf(f, "%-25s ", name); - seq_printf(f, "%5d %5d %5d %5d %5d %5d %12d %5d %10x\n", - tq->tq_nactive, tq->tq_nthreads, tq->tq_nspawn, - tq->tq_maxthreads, tq->tq_pri, tq->tq_minalloc, tq->tq_maxalloc, - tq->tq_nalloc, tq->tq_flags); - - /* show the active list */ - if (lheads[LHEAD_ACTIVE]) { - j = 0; - list_for_each_entry(tqt, &tq->tq_active_list, tqt_active_list) { - if (j == 0) - seq_printf(f, "\t%s:", - list_names[LHEAD_ACTIVE]); - else if (j == 2) { - seq_printf(f, "\n\t "); - j = 0; - } - seq_printf(f, " [%d]%pf(%ps)", - tqt->tqt_thread->pid, - tqt->tqt_task->tqent_func, - tqt->tqt_task->tqent_arg); - ++j; - } - seq_printf(f, "\n"); - } - - for (i = LHEAD_PEND; i <= LHEAD_WAIT; ++i) - if (lheads[i]) { - j = 0; - list_for_each(lh, lheads[i]) { - if (spl_max_show_tasks != 0 && - j >= spl_max_show_tasks) { - seq_printf(f, "\n\t(truncated)"); - break; - } - /* show the wait waitq list */ - if (i == LHEAD_WAIT) { -#ifdef HAVE_WAIT_QUEUE_HEAD_ENTRY - wq = 
list_entry(lh, - spl_wait_queue_entry_t, entry); -#else - wq = list_entry(lh, - spl_wait_queue_entry_t, task_list); -#endif - if (j == 0) - seq_printf(f, "\t%s:", - list_names[i]); - else if (j % 8 == 0) - seq_printf(f, "\n\t "); - - tsk = wq->private; - seq_printf(f, " %d", tsk->pid); - /* pend, prio and delay lists */ - } else { - tqe = list_entry(lh, taskq_ent_t, - tqent_list); - if (j == 0) - seq_printf(f, "\t%s:", - list_names[i]); - else if (j % 2 == 0) - seq_printf(f, "\n\t "); - - seq_printf(f, " %pf(%ps)", - tqe->tqent_func, - tqe->tqent_arg); - } - ++j; - } - seq_printf(f, "\n"); - } - if (lheads[LHEAD_WAIT]) - spin_unlock_irqrestore(&tq->tq_wait_waitq.lock, wflags); - spin_unlock_irqrestore(&tq->tq_lock, flags); - - return (0); -} - -static int -taskq_all_seq_show(struct seq_file *f, void *p) -{ - return (taskq_seq_show_impl(f, p, B_TRUE)); -} - -static int -taskq_seq_show(struct seq_file *f, void *p) -{ - return (taskq_seq_show_impl(f, p, B_FALSE)); -} - -static void * -taskq_seq_start(struct seq_file *f, loff_t *pos) -{ - struct list_head *p; - loff_t n = *pos; - - down_read(&tq_list_sem); - if (!n) - taskq_seq_show_headers(f); - - p = tq_list.next; - while (n--) { - p = p->next; - if (p == &tq_list) - return (NULL); - } - - return (list_entry(p, taskq_t, tq_taskqs)); -} - -static void * -taskq_seq_next(struct seq_file *f, void *p, loff_t *pos) -{ - taskq_t *tq = p; - - ++*pos; - return ((tq->tq_taskqs.next == &tq_list) ? - NULL : list_entry(tq->tq_taskqs.next, taskq_t, tq_taskqs)); -} - -static void -slab_seq_show_headers(struct seq_file *f) -{ - seq_printf(f, - "--------------------- cache ----------" - "--------------------------------------------- " - "----- slab ------ " - "---- object ----- " - "--- emergency ---\n"); - seq_printf(f, - "name " - " flags size alloc slabsize objsize " - "total alloc max " - "total alloc max " - "dlock alloc max\n"); -} - -static int -slab_seq_show(struct seq_file *f, void *p) -{ - spl_kmem_cache_t *skc = p; - - ASSERT(skc->skc_magic == SKC_MAGIC); - - /* - * Backed by Linux slab see /proc/slabinfo. - */ - if (skc->skc_flags & KMC_SLAB) - return (0); - - spin_lock(&skc->skc_lock); - seq_printf(f, "%-36s ", skc->skc_name); - seq_printf(f, "0x%05lx %9lu %9lu %8u %8u " - "%5lu %5lu %5lu %5lu %5lu %5lu %5lu %5lu %5lu\n", - (long unsigned)skc->skc_flags, - (long unsigned)(skc->skc_slab_size * skc->skc_slab_total), - (long unsigned)(skc->skc_obj_size * skc->skc_obj_alloc), - (unsigned)skc->skc_slab_size, - (unsigned)skc->skc_obj_size, - (long unsigned)skc->skc_slab_total, - (long unsigned)skc->skc_slab_alloc, - (long unsigned)skc->skc_slab_max, - (long unsigned)skc->skc_obj_total, - (long unsigned)skc->skc_obj_alloc, - (long unsigned)skc->skc_obj_max, - (long unsigned)skc->skc_obj_deadlock, - (long unsigned)skc->skc_obj_emergency, - (long unsigned)skc->skc_obj_emergency_max); - - spin_unlock(&skc->skc_lock); - - return (0); -} - -static void * -slab_seq_start(struct seq_file *f, loff_t *pos) -{ - struct list_head *p; - loff_t n = *pos; - - down_read(&spl_kmem_cache_sem); - if (!n) - slab_seq_show_headers(f); - - p = spl_kmem_cache_list.next; - while (n--) { - p = p->next; - if (p == &spl_kmem_cache_list) - return (NULL); - } - - return (list_entry(p, spl_kmem_cache_t, skc_list)); -} - -static void * -slab_seq_next(struct seq_file *f, void *p, loff_t *pos) -{ - spl_kmem_cache_t *skc = p; - - ++*pos; - return ((skc->skc_list.next == &spl_kmem_cache_list) ? 
- NULL : list_entry(skc->skc_list.next, spl_kmem_cache_t, skc_list)); -} - -static void -slab_seq_stop(struct seq_file *f, void *v) -{ - up_read(&spl_kmem_cache_sem); -} - -static struct seq_operations slab_seq_ops = { - .show = slab_seq_show, - .start = slab_seq_start, - .next = slab_seq_next, - .stop = slab_seq_stop, -}; - -static int -proc_slab_open(struct inode *inode, struct file *filp) -{ - return (seq_open(filp, &slab_seq_ops)); -} - -static struct file_operations proc_slab_operations = { - .open = proc_slab_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - -static void -taskq_seq_stop(struct seq_file *f, void *v) -{ - up_read(&tq_list_sem); -} - -static struct seq_operations taskq_all_seq_ops = { - .show = taskq_all_seq_show, - .start = taskq_seq_start, - .next = taskq_seq_next, - .stop = taskq_seq_stop, -}; - -static struct seq_operations taskq_seq_ops = { - .show = taskq_seq_show, - .start = taskq_seq_start, - .next = taskq_seq_next, - .stop = taskq_seq_stop, -}; - -static int -proc_taskq_all_open(struct inode *inode, struct file *filp) -{ - return (seq_open(filp, &taskq_all_seq_ops)); -} - -static int -proc_taskq_open(struct inode *inode, struct file *filp) -{ - return (seq_open(filp, &taskq_seq_ops)); -} - -static struct file_operations proc_taskq_all_operations = { - .open = proc_taskq_all_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - -static struct file_operations proc_taskq_operations = { - .open = proc_taskq_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - -static struct ctl_table spl_kmem_table[] = { -#ifdef DEBUG_KMEM - { - .procname = "kmem_used", - .data = &kmem_alloc_used, -#ifdef HAVE_ATOMIC64_T - .maxlen = sizeof (atomic64_t), -#else - .maxlen = sizeof (atomic_t), -#endif /* HAVE_ATOMIC64_T */ - .mode = 0444, - .proc_handler = &proc_domemused, - }, - { - .procname = "kmem_max", - .data = &kmem_alloc_max, - .maxlen = sizeof (unsigned long), - .extra1 = &table_min, - .extra2 = &table_max, - .mode = 0444, - .proc_handler = &proc_doulongvec_minmax, - }, -#endif /* DEBUG_KMEM */ - { - .procname = "slab_kmem_total", - .data = (void *)(KMC_KMEM | KMC_TOTAL), - .maxlen = sizeof (unsigned long), - .extra1 = &table_min, - .extra2 = &table_max, - .mode = 0444, - .proc_handler = &proc_doslab, - }, - { - .procname = "slab_kmem_alloc", - .data = (void *)(KMC_KMEM | KMC_ALLOC), - .maxlen = sizeof (unsigned long), - .extra1 = &table_min, - .extra2 = &table_max, - .mode = 0444, - .proc_handler = &proc_doslab, - }, - { - .procname = "slab_kmem_max", - .data = (void *)(KMC_KMEM | KMC_MAX), - .maxlen = sizeof (unsigned long), - .extra1 = &table_min, - .extra2 = &table_max, - .mode = 0444, - .proc_handler = &proc_doslab, - }, - { - .procname = "slab_vmem_total", - .data = (void *)(KMC_VMEM | KMC_TOTAL), - .maxlen = sizeof (unsigned long), - .extra1 = &table_min, - .extra2 = &table_max, - .mode = 0444, - .proc_handler = &proc_doslab, - }, - { - .procname = "slab_vmem_alloc", - .data = (void *)(KMC_VMEM | KMC_ALLOC), - .maxlen = sizeof (unsigned long), - .extra1 = &table_min, - .extra2 = &table_max, - .mode = 0444, - .proc_handler = &proc_doslab, - }, - { - .procname = "slab_vmem_max", - .data = (void *)(KMC_VMEM | KMC_MAX), - .maxlen = sizeof (unsigned long), - .extra1 = &table_min, - .extra2 = &table_max, - .mode = 0444, - .proc_handler = &proc_doslab, - }, - {}, -}; - -static struct ctl_table spl_kstat_table[] = { - {}, -}; - -static struct ctl_table spl_table[] = { - /* - * NB No .strategy 
entries have been provided since - * sysctl(8) prefers to go via /proc for portability. - */ - { - .procname = "gitrev", - .data = spl_gitrev, - .maxlen = sizeof (spl_gitrev), - .mode = 0444, - .proc_handler = &proc_dostring, - }, - { - .procname = "hostid", - .data = &spl_hostid, - .maxlen = sizeof (unsigned long), - .mode = 0644, - .proc_handler = &proc_dohostid, - }, - { - .procname = "kmem", - .mode = 0555, - .child = spl_kmem_table, - }, - { - .procname = "kstat", - .mode = 0555, - .child = spl_kstat_table, - }, - {}, -}; - -static struct ctl_table spl_dir[] = { - { - .procname = "spl", - .mode = 0555, - .child = spl_table, - }, - {} -}; - -static struct ctl_table spl_root[] = { - { -#ifdef HAVE_CTL_NAME - .ctl_name = CTL_KERN, -#endif - .procname = "kernel", - .mode = 0555, - .child = spl_dir, - }, - {} -}; - -int -spl_proc_init(void) -{ - int rc = 0; - - spl_header = register_sysctl_table(spl_root); - if (spl_header == NULL) - return (-EUNATCH); - - proc_spl = proc_mkdir("spl", NULL); - if (proc_spl == NULL) { - rc = -EUNATCH; - goto out; - } - - proc_spl_taskq_all = proc_create_data("taskq-all", 0444, proc_spl, - &proc_taskq_all_operations, NULL); - if (proc_spl_taskq_all == NULL) { - rc = -EUNATCH; - goto out; - } - - proc_spl_taskq = proc_create_data("taskq", 0444, proc_spl, - &proc_taskq_operations, NULL); - if (proc_spl_taskq == NULL) { - rc = -EUNATCH; - goto out; - } - - proc_spl_kmem = proc_mkdir("kmem", proc_spl); - if (proc_spl_kmem == NULL) { - rc = -EUNATCH; - goto out; - } - - proc_spl_kmem_slab = proc_create_data("slab", 0444, proc_spl_kmem, - &proc_slab_operations, NULL); - if (proc_spl_kmem_slab == NULL) { - rc = -EUNATCH; - goto out; - } - - proc_spl_kstat = proc_mkdir("kstat", proc_spl); - if (proc_spl_kstat == NULL) { - rc = -EUNATCH; - goto out; - } -out: - if (rc) { - remove_proc_entry("kstat", proc_spl); - remove_proc_entry("slab", proc_spl_kmem); - remove_proc_entry("kmem", proc_spl); - remove_proc_entry("taskq-all", proc_spl); - remove_proc_entry("taskq", proc_spl); - remove_proc_entry("spl", NULL); - unregister_sysctl_table(spl_header); - } - - return (rc); -} - -void -spl_proc_fini(void) -{ - remove_proc_entry("kstat", proc_spl); - remove_proc_entry("slab", proc_spl_kmem); - remove_proc_entry("kmem", proc_spl); - remove_proc_entry("taskq-all", proc_spl); - remove_proc_entry("taskq", proc_spl); - remove_proc_entry("spl", NULL); - - ASSERT(spl_header != NULL); - unregister_sysctl_table(spl_header); -} diff --git a/module/spl/spl-procfs-list.c b/module/spl/spl-procfs-list.c deleted file mode 100644 index f6a00da5c..000000000 --- a/module/spl/spl-procfs-list.c +++ /dev/null @@ -1,257 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2018 by Delphix. 
All rights reserved. - */ - -#include <sys/list.h> -#include <sys/mutex.h> -#include <sys/procfs_list.h> -#include <linux/proc_fs.h> - -/* - * A procfs_list is a wrapper around a linked list which implements the seq_file - * interface, allowing the contents of the list to be exposed through procfs. - * The kernel already has some utilities to help implement the seq_file - * interface for linked lists (seq_list_*), but they aren't appropriate for use - * with lists that have many entries, because seq_list_start walks the list at - * the start of each read syscall to find where it left off, so reading a file - * ends up being quadratic in the number of entries in the list. - * - * This implementation avoids this penalty by maintaining a separate cursor into - * the list per instance of the file that is open. It also maintains some extra - * information in each node of the list to prevent reads of entries that have - * been dropped from the list. - * - * Callers should only add elements to the list using procfs_list_add, which - * adds an element to the tail of the list. Other operations can be performed - * directly on the wrapped list using the normal list manipulation functions, - * but elements should only be removed from the head of the list. - */ - -#define NODE_ID(procfs_list, obj) \ - (((procfs_list_node_t *)(((char *)obj) + \ - (procfs_list)->pl_node_offset))->pln_id) - -typedef struct procfs_list_cursor { - procfs_list_t *procfs_list; /* List into which this cursor points */ - void *cached_node; /* Most recently accessed node */ - loff_t cached_pos; /* Position of cached_node */ -} procfs_list_cursor_t; - -static int -procfs_list_seq_show(struct seq_file *f, void *p) -{ - procfs_list_cursor_t *cursor = f->private; - procfs_list_t *procfs_list = cursor->procfs_list; - - ASSERT(MUTEX_HELD(&procfs_list->pl_lock)); - if (p == SEQ_START_TOKEN) { - if (procfs_list->pl_show_header != NULL) - return (procfs_list->pl_show_header(f)); - else - return (0); - } - return (procfs_list->pl_show(f, p)); -} - -static void * -procfs_list_next_node(procfs_list_cursor_t *cursor, loff_t *pos) -{ - void *next_node; - procfs_list_t *procfs_list = cursor->procfs_list; - - if (cursor->cached_node == SEQ_START_TOKEN) - next_node = list_head(&procfs_list->pl_list); - else - next_node = list_next(&procfs_list->pl_list, - cursor->cached_node); - - if (next_node != NULL) { - cursor->cached_node = next_node; - cursor->cached_pos = NODE_ID(procfs_list, cursor->cached_node); - *pos = cursor->cached_pos; - } - return (next_node); -} - -static void * -procfs_list_seq_start(struct seq_file *f, loff_t *pos) -{ - procfs_list_cursor_t *cursor = f->private; - procfs_list_t *procfs_list = cursor->procfs_list; - - mutex_enter(&procfs_list->pl_lock); - - if (*pos == 0) { - cursor->cached_node = SEQ_START_TOKEN; - cursor->cached_pos = 0; - return (SEQ_START_TOKEN); - } - - /* - * Check if our cached pointer has become stale, which happens if the - * the message where we left off has been dropped from the list since - * the last read syscall completed. - */ - void *oldest_node = list_head(&procfs_list->pl_list); - if (cursor->cached_node != SEQ_START_TOKEN && (oldest_node == NULL || - NODE_ID(procfs_list, oldest_node) > cursor->cached_pos)) - return (ERR_PTR(-EIO)); - - /* - * If it isn't starting from the beginning of the file, the seq_file - * code will either pick up at the same position it visited last or the - * following one. 
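- *
- * For example (arbitrary ids): if the previous read stopped after showing
- * the node with id 41, this function is re-entered with *pos equal to
- * either 41 (re-show the cached node) or 42 (advance to the following
- * node); the ASSERT3U() below relies on exactly that behaviour.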
- */ - if (*pos == cursor->cached_pos) { - return (cursor->cached_node); - } else { - ASSERT3U(*pos, ==, cursor->cached_pos + 1); - return (procfs_list_next_node(cursor, pos)); - } -} - -static void * -procfs_list_seq_next(struct seq_file *f, void *p, loff_t *pos) -{ - procfs_list_cursor_t *cursor = f->private; - ASSERT(MUTEX_HELD(&cursor->procfs_list->pl_lock)); - return (procfs_list_next_node(cursor, pos)); -} - -static void -procfs_list_seq_stop(struct seq_file *f, void *p) -{ - procfs_list_cursor_t *cursor = f->private; - procfs_list_t *procfs_list = cursor->procfs_list; - mutex_exit(&procfs_list->pl_lock); -} - -static struct seq_operations procfs_list_seq_ops = { - .show = procfs_list_seq_show, - .start = procfs_list_seq_start, - .next = procfs_list_seq_next, - .stop = procfs_list_seq_stop, -}; - -static int -procfs_list_open(struct inode *inode, struct file *filp) -{ - int rc = seq_open_private(filp, &procfs_list_seq_ops, - sizeof (procfs_list_cursor_t)); - if (rc != 0) - return (rc); - - struct seq_file *f = filp->private_data; - procfs_list_cursor_t *cursor = f->private; - cursor->procfs_list = PDE_DATA(inode); - cursor->cached_node = NULL; - cursor->cached_pos = 0; - - return (0); -} - -static ssize_t -procfs_list_write(struct file *filp, const char __user *buf, size_t len, - loff_t *ppos) -{ - struct seq_file *f = filp->private_data; - procfs_list_cursor_t *cursor = f->private; - procfs_list_t *procfs_list = cursor->procfs_list; - int rc; - - if (procfs_list->pl_clear != NULL && - (rc = procfs_list->pl_clear(procfs_list)) != 0) - return (-rc); - return (len); -} - -static struct file_operations procfs_list_operations = { - .owner = THIS_MODULE, - .open = procfs_list_open, - .write = procfs_list_write, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release_private, -}; - -/* - * Initialize a procfs_list and create a file for it in the proc filesystem - * under the kstat namespace. - */ -void -procfs_list_install(const char *module, - const char *name, - mode_t mode, - procfs_list_t *procfs_list, - int (*show)(struct seq_file *f, void *p), - int (*show_header)(struct seq_file *f), - int (*clear)(procfs_list_t *procfs_list), - size_t procfs_list_node_off) -{ - mutex_init(&procfs_list->pl_lock, NULL, MUTEX_DEFAULT, NULL); - list_create(&procfs_list->pl_list, - procfs_list_node_off + sizeof (procfs_list_node_t), - procfs_list_node_off + offsetof(procfs_list_node_t, pln_link)); - procfs_list->pl_next_id = 1; /* Save id 0 for SEQ_START_TOKEN */ - procfs_list->pl_show = show; - procfs_list->pl_show_header = show_header; - procfs_list->pl_clear = clear; - procfs_list->pl_node_offset = procfs_list_node_off; - - kstat_proc_entry_init(&procfs_list->pl_kstat_entry, module, name); - kstat_proc_entry_install(&procfs_list->pl_kstat_entry, mode, - &procfs_list_operations, procfs_list); -} -EXPORT_SYMBOL(procfs_list_install); - -/* Remove the proc filesystem file corresponding to the given list */ -void -procfs_list_uninstall(procfs_list_t *procfs_list) -{ - kstat_proc_entry_delete(&procfs_list->pl_kstat_entry); -} -EXPORT_SYMBOL(procfs_list_uninstall); - -void -procfs_list_destroy(procfs_list_t *procfs_list) -{ - ASSERT(list_is_empty(&procfs_list->pl_list)); - list_destroy(&procfs_list->pl_list); - mutex_destroy(&procfs_list->pl_lock); -} -EXPORT_SYMBOL(procfs_list_destroy); - -/* - * Add a new node to the tail of the list. 
While the standard list manipulation - * functions can be use for all other operation, adding elements to the list - * should only be done using this helper so that the id of the new node is set - * correctly. - */ -void -procfs_list_add(procfs_list_t *procfs_list, void *p) -{ - ASSERT(MUTEX_HELD(&procfs_list->pl_lock)); - NODE_ID(procfs_list, p) = procfs_list->pl_next_id++; - list_insert_tail(&procfs_list->pl_list, p); -} -EXPORT_SYMBOL(procfs_list_add); diff --git a/module/spl/spl-taskq.c b/module/spl/spl-taskq.c deleted file mode 100644 index 90e1d0a4d..000000000 --- a/module/spl/spl-taskq.c +++ /dev/null @@ -1,1292 +0,0 @@ -/* - * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. - * Copyright (C) 2007 The Regents of the University of California. - * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). - * Written by Brian Behlendorf <[email protected]>. - * UCRL-CODE-235197 - * - * This file is part of the SPL, Solaris Porting Layer. - * For details, see <http://zfsonlinux.org/>. - * - * The SPL is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the - * Free Software Foundation; either version 2 of the License, or (at your - * option) any later version. - * - * The SPL is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - * for more details. - * - * You should have received a copy of the GNU General Public License along - * with the SPL. If not, see <http://www.gnu.org/licenses/>. - * - * Solaris Porting Layer (SPL) Task Queue Implementation. - */ - -#include <sys/timer.h> -#include <sys/taskq.h> -#include <sys/kmem.h> -#include <sys/tsd.h> -#include <sys/simd.h> - -int spl_taskq_thread_bind = 0; -module_param(spl_taskq_thread_bind, int, 0644); -MODULE_PARM_DESC(spl_taskq_thread_bind, "Bind taskq thread to CPU by default"); - - -int spl_taskq_thread_dynamic = 1; -module_param(spl_taskq_thread_dynamic, int, 0644); -MODULE_PARM_DESC(spl_taskq_thread_dynamic, "Allow dynamic taskq threads"); - -int spl_taskq_thread_priority = 1; -module_param(spl_taskq_thread_priority, int, 0644); -MODULE_PARM_DESC(spl_taskq_thread_priority, - "Allow non-default priority for taskq threads"); - -int spl_taskq_thread_sequential = 4; -module_param(spl_taskq_thread_sequential, int, 0644); -MODULE_PARM_DESC(spl_taskq_thread_sequential, - "Create new taskq threads after N sequential tasks"); - -/* Global system-wide dynamic task queue available for all consumers */ -taskq_t *system_taskq; -EXPORT_SYMBOL(system_taskq); -/* Global dynamic task queue for long delay */ -taskq_t *system_delay_taskq; -EXPORT_SYMBOL(system_delay_taskq); - -/* Private dedicated taskq for creating new taskq threads on demand. */ -static taskq_t *dynamic_taskq; -static taskq_thread_t *taskq_thread_create(taskq_t *); - -/* List of all taskqs */ -LIST_HEAD(tq_list); -struct rw_semaphore tq_list_sem; -static uint_t taskq_tsd; - -static int -task_km_flags(uint_t flags) -{ - if (flags & TQ_NOSLEEP) - return (KM_NOSLEEP); - - if (flags & TQ_PUSHPAGE) - return (KM_PUSHPAGE); - - return (KM_SLEEP); -} - -/* - * taskq_find_by_name - Find the largest instance number of a named taskq. 
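- * For example, if three taskqs named "my_taskq" already exist with
- * instances 0, 1, and 2, this returns 2; -1 is returned when no taskq
- * with the given name exists.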
- */ -static int -taskq_find_by_name(const char *name) -{ - struct list_head *tql; - taskq_t *tq; - - list_for_each_prev(tql, &tq_list) { - tq = list_entry(tql, taskq_t, tq_taskqs); - if (strcmp(name, tq->tq_name) == 0) - return (tq->tq_instance); - } - return (-1); -} - -/* - * NOTE: Must be called with tq->tq_lock held, returns a list_t which - * is not attached to the free, work, or pending taskq lists. - */ -static taskq_ent_t * -task_alloc(taskq_t *tq, uint_t flags, unsigned long *irqflags) -{ - taskq_ent_t *t; - int count = 0; - - ASSERT(tq); -retry: - /* Acquire taskq_ent_t's from free list if available */ - if (!list_empty(&tq->tq_free_list) && !(flags & TQ_NEW)) { - t = list_entry(tq->tq_free_list.next, taskq_ent_t, tqent_list); - - ASSERT(!(t->tqent_flags & TQENT_FLAG_PREALLOC)); - ASSERT(!(t->tqent_flags & TQENT_FLAG_CANCEL)); - ASSERT(!timer_pending(&t->tqent_timer)); - - list_del_init(&t->tqent_list); - return (t); - } - - /* Free list is empty and memory allocations are prohibited */ - if (flags & TQ_NOALLOC) - return (NULL); - - /* Hit maximum taskq_ent_t pool size */ - if (tq->tq_nalloc >= tq->tq_maxalloc) { - if (flags & TQ_NOSLEEP) - return (NULL); - - /* - * Sleep periodically polling the free list for an available - * taskq_ent_t. Dispatching with TQ_SLEEP should always succeed - * but we cannot block forever waiting for an taskq_ent_t to - * show up in the free list, otherwise a deadlock can happen. - * - * Therefore, we need to allocate a new task even if the number - * of allocated tasks is above tq->tq_maxalloc, but we still - * end up delaying the task allocation by one second, thereby - * throttling the task dispatch rate. - */ - spin_unlock_irqrestore(&tq->tq_lock, *irqflags); - schedule_timeout(HZ / 100); - spin_lock_irqsave_nested(&tq->tq_lock, *irqflags, - tq->tq_lock_class); - if (count < 100) { - count++; - goto retry; - } - } - - spin_unlock_irqrestore(&tq->tq_lock, *irqflags); - t = kmem_alloc(sizeof (taskq_ent_t), task_km_flags(flags)); - spin_lock_irqsave_nested(&tq->tq_lock, *irqflags, tq->tq_lock_class); - - if (t) { - taskq_init_ent(t); - tq->tq_nalloc++; - } - - return (t); -} - -/* - * NOTE: Must be called with tq->tq_lock held, expects the taskq_ent_t - * to already be removed from the free, work, or pending taskq lists. - */ -static void -task_free(taskq_t *tq, taskq_ent_t *t) -{ - ASSERT(tq); - ASSERT(t); - ASSERT(list_empty(&t->tqent_list)); - ASSERT(!timer_pending(&t->tqent_timer)); - - kmem_free(t, sizeof (taskq_ent_t)); - tq->tq_nalloc--; -} - -/* - * NOTE: Must be called with tq->tq_lock held, either destroys the - * taskq_ent_t if too many exist or moves it to the free list for later use. - */ -static void -task_done(taskq_t *tq, taskq_ent_t *t) -{ - ASSERT(tq); - ASSERT(t); - - /* Wake tasks blocked in taskq_wait_id() */ - wake_up_all(&t->tqent_waitq); - - list_del_init(&t->tqent_list); - - if (tq->tq_nalloc <= tq->tq_minalloc) { - t->tqent_id = TASKQID_INVALID; - t->tqent_func = NULL; - t->tqent_arg = NULL; - t->tqent_flags = 0; - - list_add_tail(&t->tqent_list, &tq->tq_free_list); - } else { - task_free(tq, t); - } -} - -/* - * When a delayed task timer expires remove it from the delay list and - * add it to the priority list in order for immediate processing. 
- */ -static void -task_expire_impl(taskq_ent_t *t) -{ - taskq_ent_t *w; - taskq_t *tq = t->tqent_taskq; - struct list_head *l; - unsigned long flags; - - spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class); - - if (t->tqent_flags & TQENT_FLAG_CANCEL) { - ASSERT(list_empty(&t->tqent_list)); - spin_unlock_irqrestore(&tq->tq_lock, flags); - return; - } - - t->tqent_birth = jiffies; - /* - * The priority list must be maintained in strict task id order - * from lowest to highest for lowest_id to be easily calculable. - */ - list_del(&t->tqent_list); - list_for_each_prev(l, &tq->tq_prio_list) { - w = list_entry(l, taskq_ent_t, tqent_list); - if (w->tqent_id < t->tqent_id) { - list_add(&t->tqent_list, l); - break; - } - } - if (l == &tq->tq_prio_list) - list_add(&t->tqent_list, &tq->tq_prio_list); - - spin_unlock_irqrestore(&tq->tq_lock, flags); - - wake_up(&tq->tq_work_waitq); -} - -static void -task_expire(spl_timer_list_t tl) -{ - struct timer_list *tmr = (struct timer_list *)tl; - taskq_ent_t *t = from_timer(t, tmr, tqent_timer); - task_expire_impl(t); -} - -/* - * Returns the lowest incomplete taskqid_t. The taskqid_t may - * be queued on the pending list, on the priority list, on the - * delay list, or on the work list currently being handled, but - * it is not 100% complete yet. - */ -static taskqid_t -taskq_lowest_id(taskq_t *tq) -{ - taskqid_t lowest_id = tq->tq_next_id; - taskq_ent_t *t; - taskq_thread_t *tqt; - - ASSERT(tq); - - if (!list_empty(&tq->tq_pend_list)) { - t = list_entry(tq->tq_pend_list.next, taskq_ent_t, tqent_list); - lowest_id = MIN(lowest_id, t->tqent_id); - } - - if (!list_empty(&tq->tq_prio_list)) { - t = list_entry(tq->tq_prio_list.next, taskq_ent_t, tqent_list); - lowest_id = MIN(lowest_id, t->tqent_id); - } - - if (!list_empty(&tq->tq_delay_list)) { - t = list_entry(tq->tq_delay_list.next, taskq_ent_t, tqent_list); - lowest_id = MIN(lowest_id, t->tqent_id); - } - - if (!list_empty(&tq->tq_active_list)) { - tqt = list_entry(tq->tq_active_list.next, taskq_thread_t, - tqt_active_list); - ASSERT(tqt->tqt_id != TASKQID_INVALID); - lowest_id = MIN(lowest_id, tqt->tqt_id); - } - - return (lowest_id); -} - -/* - * Insert a task into a list keeping the list sorted by increasing taskqid. - */ -static void -taskq_insert_in_order(taskq_t *tq, taskq_thread_t *tqt) -{ - taskq_thread_t *w; - struct list_head *l; - - ASSERT(tq); - ASSERT(tqt); - - list_for_each_prev(l, &tq->tq_active_list) { - w = list_entry(l, taskq_thread_t, tqt_active_list); - if (w->tqt_id < tqt->tqt_id) { - list_add(&tqt->tqt_active_list, l); - break; - } - } - if (l == &tq->tq_active_list) - list_add(&tqt->tqt_active_list, &tq->tq_active_list); -} - -/* - * Find and return a task from the given list if it exists. The list - * must be in lowest to highest task id order. - */ -static taskq_ent_t * -taskq_find_list(taskq_t *tq, struct list_head *lh, taskqid_t id) -{ - struct list_head *l; - taskq_ent_t *t; - - list_for_each(l, lh) { - t = list_entry(l, taskq_ent_t, tqent_list); - - if (t->tqent_id == id) - return (t); - - if (t->tqent_id > id) - break; - } - - return (NULL); -} - -/* - * Find an already dispatched task given the task id regardless of what - * state it is in. If a task is still pending it will be returned. - * If a task is executing, then -EBUSY will be returned instead. - * If the task has already been run then NULL is returned. 
- */ -static taskq_ent_t * -taskq_find(taskq_t *tq, taskqid_t id) -{ - taskq_thread_t *tqt; - struct list_head *l; - taskq_ent_t *t; - - t = taskq_find_list(tq, &tq->tq_delay_list, id); - if (t) - return (t); - - t = taskq_find_list(tq, &tq->tq_prio_list, id); - if (t) - return (t); - - t = taskq_find_list(tq, &tq->tq_pend_list, id); - if (t) - return (t); - - list_for_each(l, &tq->tq_active_list) { - tqt = list_entry(l, taskq_thread_t, tqt_active_list); - if (tqt->tqt_id == id) { - /* - * Instead of returning tqt_task, we just return a non - * NULL value to prevent misuse, since tqt_task only - * has two valid fields. - */ - return (ERR_PTR(-EBUSY)); - } - } - - return (NULL); -} - -/* - * Theory for the taskq_wait_id(), taskq_wait_outstanding(), and - * taskq_wait() functions below. - * - * Taskq waiting is accomplished by tracking the lowest outstanding task - * id and the next available task id. As tasks are dispatched they are - * added to the tail of the pending, priority, or delay lists. As worker - * threads become available the tasks are removed from the heads of these - * lists and linked to the worker threads. This ensures the lists are - * kept sorted by lowest to highest task id. - * - * Therefore the lowest outstanding task id can be quickly determined by - * checking the head item from all of these lists. This value is stored - * with the taskq as the lowest id. It only needs to be recalculated when - * either the task with the current lowest id completes or is canceled. - * - * By blocking until the lowest task id exceeds the passed task id the - * taskq_wait_outstanding() function can be easily implemented. Similarly, - * by blocking until the lowest task id matches the next task id taskq_wait() - * can be implemented. - * - * Callers should be aware that when there are multiple worked threads it - * is possible for larger task ids to complete before smaller ones. Also - * when the taskq contains delay tasks with small task ids callers may - * block for a considerable length of time waiting for them to expire and - * execute. - */ -static int -taskq_wait_id_check(taskq_t *tq, taskqid_t id) -{ - int rc; - unsigned long flags; - - spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class); - rc = (taskq_find(tq, id) == NULL); - spin_unlock_irqrestore(&tq->tq_lock, flags); - - return (rc); -} - -/* - * The taskq_wait_id() function blocks until the passed task id completes. - * This does not guarantee that all lower task ids have completed. - */ -void -taskq_wait_id(taskq_t *tq, taskqid_t id) -{ - wait_event(tq->tq_wait_waitq, taskq_wait_id_check(tq, id)); -} -EXPORT_SYMBOL(taskq_wait_id); - -static int -taskq_wait_outstanding_check(taskq_t *tq, taskqid_t id) -{ - int rc; - unsigned long flags; - - spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class); - rc = (id < tq->tq_lowest_id); - spin_unlock_irqrestore(&tq->tq_lock, flags); - - return (rc); -} - -/* - * The taskq_wait_outstanding() function will block until all tasks with a - * lower taskqid than the passed 'id' have been completed. Note that all - * task id's are assigned monotonically at dispatch time. Zero may be - * passed for the id to indicate all tasks dispatch up to this point, - * but not after, should be waited for. - */ -void -taskq_wait_outstanding(taskq_t *tq, taskqid_t id) -{ - id = id ? 
id : tq->tq_next_id - 1; - wait_event(tq->tq_wait_waitq, taskq_wait_outstanding_check(tq, id)); -} -EXPORT_SYMBOL(taskq_wait_outstanding); - -static int -taskq_wait_check(taskq_t *tq) -{ - int rc; - unsigned long flags; - - spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class); - rc = (tq->tq_lowest_id == tq->tq_next_id); - spin_unlock_irqrestore(&tq->tq_lock, flags); - - return (rc); -} - -/* - * The taskq_wait() function will block until the taskq is empty. - * This means that if a taskq re-dispatches work to itself taskq_wait() - * callers will block indefinitely. - */ -void -taskq_wait(taskq_t *tq) -{ - wait_event(tq->tq_wait_waitq, taskq_wait_check(tq)); -} -EXPORT_SYMBOL(taskq_wait); - -int -taskq_member(taskq_t *tq, kthread_t *t) -{ - return (tq == (taskq_t *)tsd_get_by_thread(taskq_tsd, t)); -} -EXPORT_SYMBOL(taskq_member); - -/* - * Cancel an already dispatched task given the task id. Still pending tasks - * will be immediately canceled, and if the task is active the function will - * block until it completes. Preallocated tasks which are canceled must be - * freed by the caller. - */ -int -taskq_cancel_id(taskq_t *tq, taskqid_t id) -{ - taskq_ent_t *t; - int rc = ENOENT; - unsigned long flags; - - ASSERT(tq); - - spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class); - t = taskq_find(tq, id); - if (t && t != ERR_PTR(-EBUSY)) { - list_del_init(&t->tqent_list); - t->tqent_flags |= TQENT_FLAG_CANCEL; - - /* - * When canceling the lowest outstanding task id we - * must recalculate the new lowest outstanding id. - */ - if (tq->tq_lowest_id == t->tqent_id) { - tq->tq_lowest_id = taskq_lowest_id(tq); - ASSERT3S(tq->tq_lowest_id, >, t->tqent_id); - } - - /* - * The task_expire() function takes the tq->tq_lock so drop - * drop the lock before synchronously cancelling the timer. 
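- * Otherwise del_timer_sync() could block forever waiting for a timer
- * handler which is itself spinning on tq->tq_lock.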
- */ - if (timer_pending(&t->tqent_timer)) { - spin_unlock_irqrestore(&tq->tq_lock, flags); - del_timer_sync(&t->tqent_timer); - spin_lock_irqsave_nested(&tq->tq_lock, flags, - tq->tq_lock_class); - } - - if (!(t->tqent_flags & TQENT_FLAG_PREALLOC)) - task_done(tq, t); - - rc = 0; - } - spin_unlock_irqrestore(&tq->tq_lock, flags); - - if (t == ERR_PTR(-EBUSY)) { - taskq_wait_id(tq, id); - rc = EBUSY; - } - - return (rc); -} -EXPORT_SYMBOL(taskq_cancel_id); - -static int taskq_thread_spawn(taskq_t *tq); - -taskqid_t -taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t flags) -{ - taskq_ent_t *t; - taskqid_t rc = TASKQID_INVALID; - unsigned long irqflags; - - ASSERT(tq); - ASSERT(func); - - spin_lock_irqsave_nested(&tq->tq_lock, irqflags, tq->tq_lock_class); - - /* Taskq being destroyed and all tasks drained */ - if (!(tq->tq_flags & TASKQ_ACTIVE)) - goto out; - - /* Do not queue the task unless there is idle thread for it */ - ASSERT(tq->tq_nactive <= tq->tq_nthreads); - if ((flags & TQ_NOQUEUE) && (tq->tq_nactive == tq->tq_nthreads)) { - /* Dynamic taskq may be able to spawn another thread */ - if (!(tq->tq_flags & TASKQ_DYNAMIC) || - taskq_thread_spawn(tq) == 0) - goto out; - } - - if ((t = task_alloc(tq, flags, &irqflags)) == NULL) - goto out; - - spin_lock(&t->tqent_lock); - - /* Queue to the front of the list to enforce TQ_NOQUEUE semantics */ - if (flags & TQ_NOQUEUE) - list_add(&t->tqent_list, &tq->tq_prio_list); - /* Queue to the priority list instead of the pending list */ - else if (flags & TQ_FRONT) - list_add_tail(&t->tqent_list, &tq->tq_prio_list); - else - list_add_tail(&t->tqent_list, &tq->tq_pend_list); - - t->tqent_id = rc = tq->tq_next_id; - tq->tq_next_id++; - t->tqent_func = func; - t->tqent_arg = arg; - t->tqent_taskq = tq; - t->tqent_timer.function = NULL; - t->tqent_timer.expires = 0; - t->tqent_birth = jiffies; - - ASSERT(!(t->tqent_flags & TQENT_FLAG_PREALLOC)); - - spin_unlock(&t->tqent_lock); - - wake_up(&tq->tq_work_waitq); -out: - /* Spawn additional taskq threads if required. */ - if (!(flags & TQ_NOQUEUE) && tq->tq_nactive == tq->tq_nthreads) - (void) taskq_thread_spawn(tq); - - spin_unlock_irqrestore(&tq->tq_lock, irqflags); - return (rc); -} -EXPORT_SYMBOL(taskq_dispatch); - -taskqid_t -taskq_dispatch_delay(taskq_t *tq, task_func_t func, void *arg, - uint_t flags, clock_t expire_time) -{ - taskqid_t rc = TASKQID_INVALID; - taskq_ent_t *t; - unsigned long irqflags; - - ASSERT(tq); - ASSERT(func); - - spin_lock_irqsave_nested(&tq->tq_lock, irqflags, tq->tq_lock_class); - - /* Taskq being destroyed and all tasks drained */ - if (!(tq->tq_flags & TASKQ_ACTIVE)) - goto out; - - if ((t = task_alloc(tq, flags, &irqflags)) == NULL) - goto out; - - spin_lock(&t->tqent_lock); - - /* Queue to the delay list for subsequent execution */ - list_add_tail(&t->tqent_list, &tq->tq_delay_list); - - t->tqent_id = rc = tq->tq_next_id; - tq->tq_next_id++; - t->tqent_func = func; - t->tqent_arg = arg; - t->tqent_taskq = tq; - t->tqent_timer.function = task_expire; - t->tqent_timer.expires = (unsigned long)expire_time; - add_timer(&t->tqent_timer); - - ASSERT(!(t->tqent_flags & TQENT_FLAG_PREALLOC)); - - spin_unlock(&t->tqent_lock); -out: - /* Spawn additional taskq threads if required. 
*/ - if (tq->tq_nactive == tq->tq_nthreads) - (void) taskq_thread_spawn(tq); - spin_unlock_irqrestore(&tq->tq_lock, irqflags); - return (rc); -} -EXPORT_SYMBOL(taskq_dispatch_delay); - -void -taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint_t flags, - taskq_ent_t *t) -{ - unsigned long irqflags; - ASSERT(tq); - ASSERT(func); - - spin_lock_irqsave_nested(&tq->tq_lock, irqflags, - tq->tq_lock_class); - - /* Taskq being destroyed and all tasks drained */ - if (!(tq->tq_flags & TASKQ_ACTIVE)) { - t->tqent_id = TASKQID_INVALID; - goto out; - } - - if ((flags & TQ_NOQUEUE) && (tq->tq_nactive == tq->tq_nthreads)) { - /* Dynamic taskq may be able to spawn another thread */ - if (!(tq->tq_flags & TASKQ_DYNAMIC) || - taskq_thread_spawn(tq) == 0) - goto out2; - flags |= TQ_FRONT; - } - - spin_lock(&t->tqent_lock); - - /* - * Make sure the entry is not on some other taskq; it is important to - * ASSERT() under lock - */ - ASSERT(taskq_empty_ent(t)); - - /* - * Mark it as a prealloc'd task. This is important - * to ensure that we don't free it later. - */ - t->tqent_flags |= TQENT_FLAG_PREALLOC; - - /* Queue to the priority list instead of the pending list */ - if (flags & TQ_FRONT) - list_add_tail(&t->tqent_list, &tq->tq_prio_list); - else - list_add_tail(&t->tqent_list, &tq->tq_pend_list); - - t->tqent_id = tq->tq_next_id; - tq->tq_next_id++; - t->tqent_func = func; - t->tqent_arg = arg; - t->tqent_taskq = tq; - t->tqent_birth = jiffies; - - spin_unlock(&t->tqent_lock); - - wake_up(&tq->tq_work_waitq); -out: - /* Spawn additional taskq threads if required. */ - if (tq->tq_nactive == tq->tq_nthreads) - (void) taskq_thread_spawn(tq); -out2: - spin_unlock_irqrestore(&tq->tq_lock, irqflags); -} -EXPORT_SYMBOL(taskq_dispatch_ent); - -int -taskq_empty_ent(taskq_ent_t *t) -{ - return (list_empty(&t->tqent_list)); -} -EXPORT_SYMBOL(taskq_empty_ent); - -void -taskq_init_ent(taskq_ent_t *t) -{ - spin_lock_init(&t->tqent_lock); - init_waitqueue_head(&t->tqent_waitq); - timer_setup(&t->tqent_timer, NULL, 0); - INIT_LIST_HEAD(&t->tqent_list); - t->tqent_id = 0; - t->tqent_func = NULL; - t->tqent_arg = NULL; - t->tqent_flags = 0; - t->tqent_taskq = NULL; -} -EXPORT_SYMBOL(taskq_init_ent); - -/* - * Return the next pending task, preference is given to tasks on the - * priority list which were dispatched with TQ_FRONT. - */ -static taskq_ent_t * -taskq_next_ent(taskq_t *tq) -{ - struct list_head *list; - - if (!list_empty(&tq->tq_prio_list)) - list = &tq->tq_prio_list; - else if (!list_empty(&tq->tq_pend_list)) - list = &tq->tq_pend_list; - else - return (NULL); - - return (list_entry(list->next, taskq_ent_t, tqent_list)); -} - -/* - * Spawns a new thread for the specified taskq. - */ -static void -taskq_thread_spawn_task(void *arg) -{ - taskq_t *tq = (taskq_t *)arg; - unsigned long flags; - - if (taskq_thread_create(tq) == NULL) { - /* restore spawning count if failed */ - spin_lock_irqsave_nested(&tq->tq_lock, flags, - tq->tq_lock_class); - tq->tq_nspawn--; - spin_unlock_irqrestore(&tq->tq_lock, flags); - } -} - -/* - * Spawn addition threads for dynamic taskqs (TASKQ_DYNAMIC) the current - * number of threads is insufficient to handle the pending tasks. These - * new threads must be created by the dedicated dynamic_taskq to avoid - * deadlocks between thread creation and memory reclaim. The system_taskq - * which is also a dynamic taskq cannot be safely used for this. 
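- * (For this reason dynamic_taskq itself is created by spl_taskq_init()
- * as a single threaded, non-dynamic taskq.)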
- */ -static int -taskq_thread_spawn(taskq_t *tq) -{ - int spawning = 0; - - if (!(tq->tq_flags & TASKQ_DYNAMIC)) - return (0); - - if ((tq->tq_nthreads + tq->tq_nspawn < tq->tq_maxthreads) && - (tq->tq_flags & TASKQ_ACTIVE)) { - spawning = (++tq->tq_nspawn); - taskq_dispatch(dynamic_taskq, taskq_thread_spawn_task, - tq, TQ_NOSLEEP); - } - - return (spawning); -} - -/* - * Threads in a dynamic taskq should only exit once it has been completely - * drained and no other threads are actively servicing tasks. This prevents - * threads from being created and destroyed more than is required. - * - * The first thread is the thread list is treated as the primary thread. - * There is nothing special about the primary thread but in order to avoid - * all the taskq pids from changing we opt to make it long running. - */ -static int -taskq_thread_should_stop(taskq_t *tq, taskq_thread_t *tqt) -{ - if (!(tq->tq_flags & TASKQ_DYNAMIC)) - return (0); - - if (list_first_entry(&(tq->tq_thread_list), taskq_thread_t, - tqt_thread_list) == tqt) - return (0); - - return - ((tq->tq_nspawn == 0) && /* No threads are being spawned */ - (tq->tq_nactive == 0) && /* No threads are handling tasks */ - (tq->tq_nthreads > 1) && /* More than 1 thread is running */ - (!taskq_next_ent(tq)) && /* There are no pending tasks */ - (spl_taskq_thread_dynamic)); /* Dynamic taskqs are allowed */ -} - -static int -taskq_thread(void *args) -{ - DECLARE_WAITQUEUE(wait, current); - sigset_t blocked; - taskq_thread_t *tqt = args; - taskq_t *tq; - taskq_ent_t *t; - int seq_tasks = 0; - unsigned long flags; - taskq_ent_t dup_task = {}; - - ASSERT(tqt); - ASSERT(tqt->tqt_tq); - tq = tqt->tqt_tq; - current->flags |= PF_NOFREEZE; - - (void) spl_fstrans_mark(); - - sigfillset(&blocked); - sigprocmask(SIG_BLOCK, &blocked, NULL); - flush_signals(current); - kfpu_initialize(); - - tsd_set(taskq_tsd, tq); - spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class); - /* - * If we are dynamically spawned, decrease spawning count. Note that - * we could be created during taskq_create, in which case we shouldn't - * do the decrement. But it's fine because taskq_create will reset - * tq_nspawn later. - */ - if (tq->tq_flags & TASKQ_DYNAMIC) - tq->tq_nspawn--; - - /* Immediately exit if more threads than allowed were created. */ - if (tq->tq_nthreads >= tq->tq_maxthreads) - goto error; - - tq->tq_nthreads++; - list_add_tail(&tqt->tqt_thread_list, &tq->tq_thread_list); - wake_up(&tq->tq_wait_waitq); - set_current_state(TASK_INTERRUPTIBLE); - - while (!kthread_should_stop()) { - - if (list_empty(&tq->tq_pend_list) && - list_empty(&tq->tq_prio_list)) { - - if (taskq_thread_should_stop(tq, tqt)) { - wake_up_all(&tq->tq_wait_waitq); - break; - } - - add_wait_queue_exclusive(&tq->tq_work_waitq, &wait); - spin_unlock_irqrestore(&tq->tq_lock, flags); - - schedule(); - seq_tasks = 0; - - spin_lock_irqsave_nested(&tq->tq_lock, flags, - tq->tq_lock_class); - remove_wait_queue(&tq->tq_work_waitq, &wait); - } else { - __set_current_state(TASK_RUNNING); - } - - if ((t = taskq_next_ent(tq)) != NULL) { - list_del_init(&t->tqent_list); - - /* - * A TQENT_FLAG_PREALLOC task may be reused or freed - * during the task function call. Store tqent_id and - * tqent_flags here. - * - * Also use an on stack taskq_ent_t for tqt_task - * assignment in this case. We only populate the two - * fields used by the only user in taskq proc file. 
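- * Those two fields are tqent_func and tqent_arg, copied below.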
- */ - tqt->tqt_id = t->tqent_id; - tqt->tqt_flags = t->tqent_flags; - - if (t->tqent_flags & TQENT_FLAG_PREALLOC) { - dup_task.tqent_func = t->tqent_func; - dup_task.tqent_arg = t->tqent_arg; - t = &dup_task; - } - tqt->tqt_task = t; - - taskq_insert_in_order(tq, tqt); - tq->tq_nactive++; - spin_unlock_irqrestore(&tq->tq_lock, flags); - - /* Perform the requested task */ - t->tqent_func(t->tqent_arg); - - spin_lock_irqsave_nested(&tq->tq_lock, flags, - tq->tq_lock_class); - tq->tq_nactive--; - list_del_init(&tqt->tqt_active_list); - tqt->tqt_task = NULL; - - /* For prealloc'd tasks, we don't free anything. */ - if (!(tqt->tqt_flags & TQENT_FLAG_PREALLOC)) - task_done(tq, t); - - /* - * When the current lowest outstanding taskqid is - * done calculate the new lowest outstanding id - */ - if (tq->tq_lowest_id == tqt->tqt_id) { - tq->tq_lowest_id = taskq_lowest_id(tq); - ASSERT3S(tq->tq_lowest_id, >, tqt->tqt_id); - } - - /* Spawn additional taskq threads if required. */ - if ((++seq_tasks) > spl_taskq_thread_sequential && - taskq_thread_spawn(tq)) - seq_tasks = 0; - - tqt->tqt_id = TASKQID_INVALID; - tqt->tqt_flags = 0; - wake_up_all(&tq->tq_wait_waitq); - } else { - if (taskq_thread_should_stop(tq, tqt)) - break; - } - - set_current_state(TASK_INTERRUPTIBLE); - - } - - __set_current_state(TASK_RUNNING); - tq->tq_nthreads--; - list_del_init(&tqt->tqt_thread_list); -error: - kmem_free(tqt, sizeof (taskq_thread_t)); - spin_unlock_irqrestore(&tq->tq_lock, flags); - - tsd_set(taskq_tsd, NULL); - - return (0); -} - -static taskq_thread_t * -taskq_thread_create(taskq_t *tq) -{ - static int last_used_cpu = 0; - taskq_thread_t *tqt; - - tqt = kmem_alloc(sizeof (*tqt), KM_PUSHPAGE); - INIT_LIST_HEAD(&tqt->tqt_thread_list); - INIT_LIST_HEAD(&tqt->tqt_active_list); - tqt->tqt_tq = tq; - tqt->tqt_id = TASKQID_INVALID; - - tqt->tqt_thread = spl_kthread_create(taskq_thread, tqt, - "%s", tq->tq_name); - if (tqt->tqt_thread == NULL) { - kmem_free(tqt, sizeof (taskq_thread_t)); - return (NULL); - } - - if (spl_taskq_thread_bind) { - last_used_cpu = (last_used_cpu + 1) % num_online_cpus(); - kthread_bind(tqt->tqt_thread, last_used_cpu); - } - - if (spl_taskq_thread_priority) - set_user_nice(tqt->tqt_thread, PRIO_TO_NICE(tq->tq_pri)); - - wake_up_process(tqt->tqt_thread); - - return (tqt); -} - -taskq_t * -taskq_create(const char *name, int nthreads, pri_t pri, - int minalloc, int maxalloc, uint_t flags) -{ - taskq_t *tq; - taskq_thread_t *tqt; - int count = 0, rc = 0, i; - unsigned long irqflags; - - ASSERT(name != NULL); - ASSERT(minalloc >= 0); - ASSERT(maxalloc <= INT_MAX); - ASSERT(!(flags & (TASKQ_CPR_SAFE))); /* Unsupported */ - - /* Scale the number of threads using nthreads as a percentage */ - if (flags & TASKQ_THREADS_CPU_PCT) { - ASSERT(nthreads <= 100); - ASSERT(nthreads >= 0); - nthreads = MIN(nthreads, 100); - nthreads = MAX(nthreads, 0); - nthreads = MAX((num_online_cpus() * nthreads) / 100, 1); - } - - tq = kmem_alloc(sizeof (*tq), KM_PUSHPAGE); - if (tq == NULL) - return (NULL); - - spin_lock_init(&tq->tq_lock); - INIT_LIST_HEAD(&tq->tq_thread_list); - INIT_LIST_HEAD(&tq->tq_active_list); - tq->tq_name = strdup(name); - tq->tq_nactive = 0; - tq->tq_nthreads = 0; - tq->tq_nspawn = 0; - tq->tq_maxthreads = nthreads; - tq->tq_pri = pri; - tq->tq_minalloc = minalloc; - tq->tq_maxalloc = maxalloc; - tq->tq_nalloc = 0; - tq->tq_flags = (flags | TASKQ_ACTIVE); - tq->tq_next_id = TASKQID_INITIAL; - tq->tq_lowest_id = TASKQID_INITIAL; - INIT_LIST_HEAD(&tq->tq_free_list); - 
INIT_LIST_HEAD(&tq->tq_pend_list); - INIT_LIST_HEAD(&tq->tq_prio_list); - INIT_LIST_HEAD(&tq->tq_delay_list); - init_waitqueue_head(&tq->tq_work_waitq); - init_waitqueue_head(&tq->tq_wait_waitq); - tq->tq_lock_class = TQ_LOCK_GENERAL; - INIT_LIST_HEAD(&tq->tq_taskqs); - - if (flags & TASKQ_PREPOPULATE) { - spin_lock_irqsave_nested(&tq->tq_lock, irqflags, - tq->tq_lock_class); - - for (i = 0; i < minalloc; i++) - task_done(tq, task_alloc(tq, TQ_PUSHPAGE | TQ_NEW, - &irqflags)); - - spin_unlock_irqrestore(&tq->tq_lock, irqflags); - } - - if ((flags & TASKQ_DYNAMIC) && spl_taskq_thread_dynamic) - nthreads = 1; - - for (i = 0; i < nthreads; i++) { - tqt = taskq_thread_create(tq); - if (tqt == NULL) - rc = 1; - else - count++; - } - - /* Wait for all threads to be started before potential destroy */ - wait_event(tq->tq_wait_waitq, tq->tq_nthreads == count); - /* - * taskq_thread might have touched nspawn, but we don't want them to - * because they're not dynamically spawned. So we reset it to 0 - */ - tq->tq_nspawn = 0; - - if (rc) { - taskq_destroy(tq); - tq = NULL; - } else { - down_write(&tq_list_sem); - tq->tq_instance = taskq_find_by_name(name) + 1; - list_add_tail(&tq->tq_taskqs, &tq_list); - up_write(&tq_list_sem); - } - - return (tq); -} -EXPORT_SYMBOL(taskq_create); - -void -taskq_destroy(taskq_t *tq) -{ - struct task_struct *thread; - taskq_thread_t *tqt; - taskq_ent_t *t; - unsigned long flags; - - ASSERT(tq); - spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class); - tq->tq_flags &= ~TASKQ_ACTIVE; - spin_unlock_irqrestore(&tq->tq_lock, flags); - - /* - * When TASKQ_ACTIVE is clear new tasks may not be added nor may - * new worker threads be spawned for dynamic taskq. - */ - if (dynamic_taskq != NULL) - taskq_wait_outstanding(dynamic_taskq, 0); - - taskq_wait(tq); - - /* remove taskq from global list used by the kstats */ - down_write(&tq_list_sem); - list_del(&tq->tq_taskqs); - up_write(&tq_list_sem); - - spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class); - /* wait for spawning threads to insert themselves to the list */ - while (tq->tq_nspawn) { - spin_unlock_irqrestore(&tq->tq_lock, flags); - schedule_timeout_interruptible(1); - spin_lock_irqsave_nested(&tq->tq_lock, flags, - tq->tq_lock_class); - } - - /* - * Signal each thread to exit and block until it does. Each thread - * is responsible for removing itself from the list and freeing its - * taskq_thread_t. This allows for idle threads to opt to remove - * themselves from the taskq. They can be recreated as needed. 
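- * Note that tq_lock is dropped around each kthread_stop() call below
- * because the exiting thread must reacquire it in order to unlink and
- * free itself.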
- */ - while (!list_empty(&tq->tq_thread_list)) { - tqt = list_entry(tq->tq_thread_list.next, - taskq_thread_t, tqt_thread_list); - thread = tqt->tqt_thread; - spin_unlock_irqrestore(&tq->tq_lock, flags); - - kthread_stop(thread); - - spin_lock_irqsave_nested(&tq->tq_lock, flags, - tq->tq_lock_class); - } - - while (!list_empty(&tq->tq_free_list)) { - t = list_entry(tq->tq_free_list.next, taskq_ent_t, tqent_list); - - ASSERT(!(t->tqent_flags & TQENT_FLAG_PREALLOC)); - - list_del_init(&t->tqent_list); - task_free(tq, t); - } - - ASSERT0(tq->tq_nthreads); - ASSERT0(tq->tq_nalloc); - ASSERT0(tq->tq_nspawn); - ASSERT(list_empty(&tq->tq_thread_list)); - ASSERT(list_empty(&tq->tq_active_list)); - ASSERT(list_empty(&tq->tq_free_list)); - ASSERT(list_empty(&tq->tq_pend_list)); - ASSERT(list_empty(&tq->tq_prio_list)); - ASSERT(list_empty(&tq->tq_delay_list)); - - spin_unlock_irqrestore(&tq->tq_lock, flags); - - strfree(tq->tq_name); - kmem_free(tq, sizeof (taskq_t)); -} -EXPORT_SYMBOL(taskq_destroy); - - -static unsigned int spl_taskq_kick = 0; - -/* - * 2.6.36 API Change - * module_param_cb is introduced to take kernel_param_ops and - * module_param_call is marked as obsolete. Also set and get operations - * were changed to take a 'const struct kernel_param *'. - */ -static int -#ifdef module_param_cb -param_set_taskq_kick(const char *val, const struct kernel_param *kp) -#else -param_set_taskq_kick(const char *val, struct kernel_param *kp) -#endif -{ - int ret; - taskq_t *tq; - taskq_ent_t *t; - unsigned long flags; - - ret = param_set_uint(val, kp); - if (ret < 0 || !spl_taskq_kick) - return (ret); - /* reset value */ - spl_taskq_kick = 0; - - down_read(&tq_list_sem); - list_for_each_entry(tq, &tq_list, tq_taskqs) { - spin_lock_irqsave_nested(&tq->tq_lock, flags, - tq->tq_lock_class); - /* Check if the first pending is older than 5 seconds */ - t = taskq_next_ent(tq); - if (t && time_after(jiffies, t->tqent_birth + 5*HZ)) { - (void) taskq_thread_spawn(tq); - printk(KERN_INFO "spl: Kicked taskq %s/%d\n", - tq->tq_name, tq->tq_instance); - } - spin_unlock_irqrestore(&tq->tq_lock, flags); - } - up_read(&tq_list_sem); - return (ret); -} - -#ifdef module_param_cb -static const struct kernel_param_ops param_ops_taskq_kick = { - .set = param_set_taskq_kick, - .get = param_get_uint, -}; -module_param_cb(spl_taskq_kick, ¶m_ops_taskq_kick, &spl_taskq_kick, 0644); -#else -module_param_call(spl_taskq_kick, param_set_taskq_kick, param_get_uint, - &spl_taskq_kick, 0644); -#endif -MODULE_PARM_DESC(spl_taskq_kick, - "Write nonzero to kick stuck taskqs to spawn more threads"); - -int -spl_taskq_init(void) -{ - init_rwsem(&tq_list_sem); - tsd_create(&taskq_tsd, NULL); - - system_taskq = taskq_create("spl_system_taskq", MAX(boot_ncpus, 64), - maxclsyspri, boot_ncpus, INT_MAX, TASKQ_PREPOPULATE|TASKQ_DYNAMIC); - if (system_taskq == NULL) - return (1); - - system_delay_taskq = taskq_create("spl_delay_taskq", MAX(boot_ncpus, 4), - maxclsyspri, boot_ncpus, INT_MAX, TASKQ_PREPOPULATE|TASKQ_DYNAMIC); - if (system_delay_taskq == NULL) { - taskq_destroy(system_taskq); - return (1); - } - - dynamic_taskq = taskq_create("spl_dynamic_taskq", 1, - maxclsyspri, boot_ncpus, INT_MAX, TASKQ_PREPOPULATE); - if (dynamic_taskq == NULL) { - taskq_destroy(system_taskq); - taskq_destroy(system_delay_taskq); - return (1); - } - - /* - * This is used to annotate tq_lock, so - * taskq_dispatch -> taskq_thread_spawn -> taskq_dispatch - * does not trigger a lockdep warning re: possible recursive locking - */ - dynamic_taskq->tq_lock_class = 
TQ_LOCK_DYNAMIC; - - return (0); -} - -void -spl_taskq_fini(void) -{ - taskq_destroy(dynamic_taskq); - dynamic_taskq = NULL; - - taskq_destroy(system_delay_taskq); - system_delay_taskq = NULL; - - taskq_destroy(system_taskq); - system_taskq = NULL; - - tsd_destroy(&taskq_tsd); -} diff --git a/module/spl/spl-thread.c b/module/spl/spl-thread.c deleted file mode 100644 index 29de9252a..000000000 --- a/module/spl/spl-thread.c +++ /dev/null @@ -1,163 +0,0 @@ -/* - * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. - * Copyright (C) 2007 The Regents of the University of California. - * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). - * Written by Brian Behlendorf <[email protected]>. - * UCRL-CODE-235197 - * - * This file is part of the SPL, Solaris Porting Layer. - * For details, see <http://zfsonlinux.org/>. - * - * The SPL is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the - * Free Software Foundation; either version 2 of the License, or (at your - * option) any later version. - * - * The SPL is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - * for more details. - * - * You should have received a copy of the GNU General Public License along - * with the SPL. If not, see <http://www.gnu.org/licenses/>. - * - * Solaris Porting Layer (SPL) Thread Implementation. - */ - -#include <sys/thread.h> -#include <sys/kmem.h> -#include <sys/tsd.h> -#include <sys/simd.h> - -/* - * Thread interfaces - */ -typedef struct thread_priv_s { - unsigned long tp_magic; /* Magic */ - int tp_name_size; /* Name size */ - char *tp_name; /* Name (without _thread suffix) */ - void (*tp_func)(void *); /* Registered function */ - void *tp_args; /* Args to be passed to function */ - size_t tp_len; /* Len to be passed to function */ - int tp_state; /* State to start thread at */ - pri_t tp_pri; /* Priority to start threat at */ -} thread_priv_t; - -static int -thread_generic_wrapper(void *arg) -{ - thread_priv_t *tp = (thread_priv_t *)arg; - void (*func)(void *); - void *args; - - ASSERT(tp->tp_magic == TP_MAGIC); - func = tp->tp_func; - args = tp->tp_args; - set_current_state(tp->tp_state); - set_user_nice((kthread_t *)current, PRIO_TO_NICE(tp->tp_pri)); - kfpu_initialize(); - kmem_free(tp->tp_name, tp->tp_name_size); - kmem_free(tp, sizeof (thread_priv_t)); - - if (func) - func(args); - - return (0); -} - -void -__thread_exit(void) -{ - tsd_exit(); - complete_and_exit(NULL, 0); - /* Unreachable */ -} -EXPORT_SYMBOL(__thread_exit); - -/* - * thread_create() may block forever if it cannot create a thread or - * allocate memory. This is preferable to returning a NULL which Solaris - * style callers likely never check for... since it can't fail. 
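- * A hypothetical caller, using this function's signature directly
- * (my_worker and my_args are illustrative names; TS_RUN and defclsyspri
- * are assumed to be the usual SPL thread state and priority constants):
- *
- *     static void
- *     my_worker(void *arg)
- *     {
- *             ...
- *             thread_exit();
- *     }
- *
- *     kthread_t *t = __thread_create(NULL, 0, my_worker,
- *         "my_worker_thread", my_args, 0, NULL, TS_RUN, defclsyspri);
- *
- * The trailing "_thread" suffix is stripped from the name below when
- * naming the underlying kernel thread.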
- */ -kthread_t * -__thread_create(caddr_t stk, size_t stksize, thread_func_t func, - const char *name, void *args, size_t len, proc_t *pp, int state, pri_t pri) -{ - thread_priv_t *tp; - struct task_struct *tsk; - char *p; - - /* Option pp is simply ignored */ - /* Variable stack size unsupported */ - ASSERT(stk == NULL); - - tp = kmem_alloc(sizeof (thread_priv_t), KM_PUSHPAGE); - if (tp == NULL) - return (NULL); - - tp->tp_magic = TP_MAGIC; - tp->tp_name_size = strlen(name) + 1; - - tp->tp_name = kmem_alloc(tp->tp_name_size, KM_PUSHPAGE); - if (tp->tp_name == NULL) { - kmem_free(tp, sizeof (thread_priv_t)); - return (NULL); - } - - strncpy(tp->tp_name, name, tp->tp_name_size); - - /* - * Strip trailing "_thread" from passed name which will be the func - * name since the exposed API has no parameter for passing a name. - */ - p = strstr(tp->tp_name, "_thread"); - if (p) - p[0] = '\0'; - - tp->tp_func = func; - tp->tp_args = args; - tp->tp_len = len; - tp->tp_state = state; - tp->tp_pri = pri; - - tsk = spl_kthread_create(thread_generic_wrapper, (void *)tp, - "%s", tp->tp_name); - if (IS_ERR(tsk)) - return (NULL); - - wake_up_process(tsk); - return ((kthread_t *)tsk); -} -EXPORT_SYMBOL(__thread_create); - -/* - * spl_kthread_create - Wrapper providing pre-3.13 semantics for - * kthread_create() in which it is not killable and less likely - * to return -ENOMEM. - */ -struct task_struct * -spl_kthread_create(int (*func)(void *), void *data, const char namefmt[], ...) -{ - struct task_struct *tsk; - va_list args; - char name[TASK_COMM_LEN]; - - va_start(args, namefmt); - vsnprintf(name, sizeof (name), namefmt, args); - va_end(args); - do { - tsk = kthread_create(func, data, "%s", name); - if (IS_ERR(tsk)) { - if (signal_pending(current)) { - clear_thread_flag(TIF_SIGPENDING); - continue; - } - if (PTR_ERR(tsk) == -ENOMEM) - continue; - return (NULL); - } else { - return (tsk); - } - } while (1); -} -EXPORT_SYMBOL(spl_kthread_create); diff --git a/module/spl/spl-tsd.c b/module/spl/spl-tsd.c deleted file mode 100644 index 14342d5a6..000000000 --- a/module/spl/spl-tsd.c +++ /dev/null @@ -1,720 +0,0 @@ -/* - * Copyright (C) 2010 Lawrence Livermore National Security, LLC. - * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). - * Written by Brian Behlendorf <[email protected]>. - * UCRL-CODE-235197 - * - * This file is part of the SPL, Solaris Porting Layer. - * For details, see <http://zfsonlinux.org/>. - * - * The SPL is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the - * Free Software Foundation; either version 2 of the License, or (at your - * option) any later version. - * - * The SPL is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - * for more details. - * - * You should have received a copy of the GNU General Public License along - * with the SPL. If not, see <http://www.gnu.org/licenses/>. - * - * - * Solaris Porting Layer (SPL) Thread Specific Data Implementation. - * - * Thread specific data has implemented using a hash table, this avoids - * the need to add a member to the task structure and allows maximum - * portability between kernels. This implementation has been optimized - * to keep the tsd_set() and tsd_get() times as small as possible. - * - * The majority of the entries in the hash table are for specific tsd - * entries. 
These entries are hashed by the product of their key and - * pid because by design the key and pid are guaranteed to be unique. - * Their product also has the desirable properly that it will be uniformly - * distributed over the hash bins providing neither the pid nor key is zero. - * Under linux the zero pid is always the init process and thus won't be - * used, and this implementation is careful to never to assign a zero key. - * By default the hash table is sized to 512 bins which is expected to - * be sufficient for light to moderate usage of thread specific data. - * - * The hash table contains two additional type of entries. They first - * type is entry is called a 'key' entry and it is added to the hash during - * tsd_create(). It is used to store the address of the destructor function - * and it is used as an anchor point. All tsd entries which use the same - * key will be linked to this entry. This is used during tsd_destroy() to - * quickly call the destructor function for all tsd associated with the key. - * The 'key' entry may be looked up with tsd_hash_search() by passing the - * key you wish to lookup and DTOR_PID constant as the pid. - * - * The second type of entry is called a 'pid' entry and it is added to the - * hash the first time a process set a key. The 'pid' entry is also used - * as an anchor and all tsd for the process will be linked to it. This - * list is using during tsd_exit() to ensure all registered destructors - * are run for the process. The 'pid' entry may be looked up with - * tsd_hash_search() by passing the PID_KEY constant as the key, and - * the process pid. Note that tsd_exit() is called by thread_exit() - * so if your using the Solaris thread API you should not need to call - * tsd_exit() directly. - * - */ - -#include <sys/kmem.h> -#include <sys/thread.h> -#include <sys/tsd.h> -#include <linux/hash.h> - -typedef struct tsd_hash_bin { - spinlock_t hb_lock; - struct hlist_head hb_head; -} tsd_hash_bin_t; - -typedef struct tsd_hash_table { - spinlock_t ht_lock; - uint_t ht_bits; - uint_t ht_key; - tsd_hash_bin_t *ht_bins; -} tsd_hash_table_t; - -typedef struct tsd_hash_entry { - uint_t he_key; - pid_t he_pid; - dtor_func_t he_dtor; - void *he_value; - struct hlist_node he_list; - struct list_head he_key_list; - struct list_head he_pid_list; -} tsd_hash_entry_t; - -static tsd_hash_table_t *tsd_hash_table = NULL; - - -/* - * tsd_hash_search - searches hash table for tsd_hash_entry - * @table: hash table - * @key: search key - * @pid: search pid - */ -static tsd_hash_entry_t * -tsd_hash_search(tsd_hash_table_t *table, uint_t key, pid_t pid) -{ - struct hlist_node *node; - tsd_hash_entry_t *entry; - tsd_hash_bin_t *bin; - ulong_t hash; - - hash = hash_long((ulong_t)key * (ulong_t)pid, table->ht_bits); - bin = &table->ht_bins[hash]; - spin_lock(&bin->hb_lock); - hlist_for_each(node, &bin->hb_head) { - entry = list_entry(node, tsd_hash_entry_t, he_list); - if ((entry->he_key == key) && (entry->he_pid == pid)) { - spin_unlock(&bin->hb_lock); - return (entry); - } - } - - spin_unlock(&bin->hb_lock); - return (NULL); -} - -/* - * tsd_hash_dtor - call the destructor and free all entries on the list - * @work: list of hash entries - * - * For a list of entries which have all already been removed from the - * hash call their registered destructor then free the associated memory. 
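- * Note that the callers drop the table and bin locks before invoking
- * this function, so the destructors themselves run without any of the
- * hash locks held.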
- */ -static void -tsd_hash_dtor(struct hlist_head *work) -{ - tsd_hash_entry_t *entry; - - while (!hlist_empty(work)) { - entry = hlist_entry(work->first, tsd_hash_entry_t, he_list); - hlist_del(&entry->he_list); - - if (entry->he_dtor && entry->he_pid != DTOR_PID) - entry->he_dtor(entry->he_value); - - kmem_free(entry, sizeof (tsd_hash_entry_t)); - } -} - -/* - * tsd_hash_add - adds an entry to hash table - * @table: hash table - * @key: search key - * @pid: search pid - * - * The caller is responsible for ensuring the unique key/pid do not - * already exist in the hash table. This possible because all entries - * are thread specific thus a concurrent thread will never attempt to - * add this key/pid. Because multiple bins must be checked to add - * links to the dtor and pid entries the entire table is locked. - */ -static int -tsd_hash_add(tsd_hash_table_t *table, uint_t key, pid_t pid, void *value) -{ - tsd_hash_entry_t *entry, *dtor_entry, *pid_entry; - tsd_hash_bin_t *bin; - ulong_t hash; - int rc = 0; - - ASSERT3P(tsd_hash_search(table, key, pid), ==, NULL); - - /* New entry allocate structure, set value, and add to hash */ - entry = kmem_alloc(sizeof (tsd_hash_entry_t), KM_PUSHPAGE); - if (entry == NULL) - return (ENOMEM); - - entry->he_key = key; - entry->he_pid = pid; - entry->he_value = value; - INIT_HLIST_NODE(&entry->he_list); - INIT_LIST_HEAD(&entry->he_key_list); - INIT_LIST_HEAD(&entry->he_pid_list); - - spin_lock(&table->ht_lock); - - /* Destructor entry must exist for all valid keys */ - dtor_entry = tsd_hash_search(table, entry->he_key, DTOR_PID); - ASSERT3P(dtor_entry, !=, NULL); - entry->he_dtor = dtor_entry->he_dtor; - - /* Process entry must exist for all valid processes */ - pid_entry = tsd_hash_search(table, PID_KEY, entry->he_pid); - ASSERT3P(pid_entry, !=, NULL); - - hash = hash_long((ulong_t)key * (ulong_t)pid, table->ht_bits); - bin = &table->ht_bins[hash]; - spin_lock(&bin->hb_lock); - - /* Add to the hash, key, and pid lists */ - hlist_add_head(&entry->he_list, &bin->hb_head); - list_add(&entry->he_key_list, &dtor_entry->he_key_list); - list_add(&entry->he_pid_list, &pid_entry->he_pid_list); - - spin_unlock(&bin->hb_lock); - spin_unlock(&table->ht_lock); - - return (rc); -} - -/* - * tsd_hash_add_key - adds a destructor entry to the hash table - * @table: hash table - * @keyp: search key - * @dtor: key destructor - * - * For every unique key there is a single entry in the hash which is used - * as anchor. All other thread specific entries for this key are linked - * to this anchor via the 'he_key_list' list head. On return they keyp - * will be set to the next available key for the hash table. 
- */ -static int -tsd_hash_add_key(tsd_hash_table_t *table, uint_t *keyp, dtor_func_t dtor) -{ - tsd_hash_entry_t *tmp_entry, *entry; - tsd_hash_bin_t *bin; - ulong_t hash; - int keys_checked = 0; - - ASSERT3P(table, !=, NULL); - - /* Allocate entry to be used as a destructor for this key */ - entry = kmem_alloc(sizeof (tsd_hash_entry_t), KM_PUSHPAGE); - if (entry == NULL) - return (ENOMEM); - - /* Determine next available key value */ - spin_lock(&table->ht_lock); - do { - /* Limited to TSD_KEYS_MAX concurrent unique keys */ - if (table->ht_key++ > TSD_KEYS_MAX) - table->ht_key = 1; - - /* Ensure failure when all TSD_KEYS_MAX keys are in use */ - if (keys_checked++ >= TSD_KEYS_MAX) { - spin_unlock(&table->ht_lock); - return (ENOENT); - } - - tmp_entry = tsd_hash_search(table, table->ht_key, DTOR_PID); - } while (tmp_entry); - - /* Add destructor entry in to hash table */ - entry->he_key = *keyp = table->ht_key; - entry->he_pid = DTOR_PID; - entry->he_dtor = dtor; - entry->he_value = NULL; - INIT_HLIST_NODE(&entry->he_list); - INIT_LIST_HEAD(&entry->he_key_list); - INIT_LIST_HEAD(&entry->he_pid_list); - - hash = hash_long((ulong_t)*keyp * (ulong_t)DTOR_PID, table->ht_bits); - bin = &table->ht_bins[hash]; - spin_lock(&bin->hb_lock); - - hlist_add_head(&entry->he_list, &bin->hb_head); - - spin_unlock(&bin->hb_lock); - spin_unlock(&table->ht_lock); - - return (0); -} - -/* - * tsd_hash_add_pid - adds a process entry to the hash table - * @table: hash table - * @pid: search pid - * - * For every process there is a single entry in the hash which is used - * as anchor. All other thread specific entries for this process are - * linked to this anchor via the 'he_pid_list' list head. - */ -static int -tsd_hash_add_pid(tsd_hash_table_t *table, pid_t pid) -{ - tsd_hash_entry_t *entry; - tsd_hash_bin_t *bin; - ulong_t hash; - - /* Allocate entry to be used as the process reference */ - entry = kmem_alloc(sizeof (tsd_hash_entry_t), KM_PUSHPAGE); - if (entry == NULL) - return (ENOMEM); - - spin_lock(&table->ht_lock); - entry->he_key = PID_KEY; - entry->he_pid = pid; - entry->he_dtor = NULL; - entry->he_value = NULL; - INIT_HLIST_NODE(&entry->he_list); - INIT_LIST_HEAD(&entry->he_key_list); - INIT_LIST_HEAD(&entry->he_pid_list); - - hash = hash_long((ulong_t)PID_KEY * (ulong_t)pid, table->ht_bits); - bin = &table->ht_bins[hash]; - spin_lock(&bin->hb_lock); - - hlist_add_head(&entry->he_list, &bin->hb_head); - - spin_unlock(&bin->hb_lock); - spin_unlock(&table->ht_lock); - - return (0); -} - -/* - * tsd_hash_del - delete an entry from hash table, key, and pid lists - * @table: hash table - * @key: search key - * @pid: search pid - */ -static void -tsd_hash_del(tsd_hash_table_t *table, tsd_hash_entry_t *entry) -{ - hlist_del(&entry->he_list); - list_del_init(&entry->he_key_list); - list_del_init(&entry->he_pid_list); -} - -/* - * tsd_hash_table_init - allocate a hash table - * @bits: hash table size - * - * A hash table with 2^bits bins will be created, it may not be resized - * after the fact and must be free'd with tsd_hash_table_fini(). 
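- * (spl_tsd_init() below sizes the global table with
- * TSD_HASH_TABLE_BITS_DEFAULT, which corresponds to the 512 bin default
- * mentioned in the header comment above.)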
- */ -static tsd_hash_table_t * -tsd_hash_table_init(uint_t bits) -{ - tsd_hash_table_t *table; - int hash, size = (1 << bits); - - table = kmem_zalloc(sizeof (tsd_hash_table_t), KM_SLEEP); - if (table == NULL) - return (NULL); - - table->ht_bins = kmem_zalloc(sizeof (tsd_hash_bin_t) * size, KM_SLEEP); - if (table->ht_bins == NULL) { - kmem_free(table, sizeof (tsd_hash_table_t)); - return (NULL); - } - - for (hash = 0; hash < size; hash++) { - spin_lock_init(&table->ht_bins[hash].hb_lock); - INIT_HLIST_HEAD(&table->ht_bins[hash].hb_head); - } - - spin_lock_init(&table->ht_lock); - table->ht_bits = bits; - table->ht_key = 1; - - return (table); -} - -/* - * tsd_hash_table_fini - free a hash table - * @table: hash table - * - * Free a hash table allocated by tsd_hash_table_init(). If the hash - * table is not empty this function will call the proper destructor for - * all remaining entries before freeing the memory used by those entries. - */ -static void -tsd_hash_table_fini(tsd_hash_table_t *table) -{ - HLIST_HEAD(work); - tsd_hash_bin_t *bin; - tsd_hash_entry_t *entry; - int size, i; - - ASSERT3P(table, !=, NULL); - spin_lock(&table->ht_lock); - for (i = 0, size = (1 << table->ht_bits); i < size; i++) { - bin = &table->ht_bins[i]; - spin_lock(&bin->hb_lock); - while (!hlist_empty(&bin->hb_head)) { - entry = hlist_entry(bin->hb_head.first, - tsd_hash_entry_t, he_list); - tsd_hash_del(table, entry); - hlist_add_head(&entry->he_list, &work); - } - spin_unlock(&bin->hb_lock); - } - spin_unlock(&table->ht_lock); - - tsd_hash_dtor(&work); - kmem_free(table->ht_bins, sizeof (tsd_hash_bin_t)*(1<<table->ht_bits)); - kmem_free(table, sizeof (tsd_hash_table_t)); -} - -/* - * tsd_remove_entry - remove a tsd entry for this thread - * @entry: entry to remove - * - * Remove the thread specific data @entry for this thread. - * If this is the last entry for this thread, also remove the PID entry. - */ -static void -tsd_remove_entry(tsd_hash_entry_t *entry) -{ - HLIST_HEAD(work); - tsd_hash_table_t *table; - tsd_hash_entry_t *pid_entry; - tsd_hash_bin_t *pid_entry_bin, *entry_bin; - ulong_t hash; - - table = tsd_hash_table; - ASSERT3P(table, !=, NULL); - ASSERT3P(entry, !=, NULL); - - spin_lock(&table->ht_lock); - - hash = hash_long((ulong_t)entry->he_key * - (ulong_t)entry->he_pid, table->ht_bits); - entry_bin = &table->ht_bins[hash]; - - /* save the possible pid_entry */ - pid_entry = list_entry(entry->he_pid_list.next, tsd_hash_entry_t, - he_pid_list); - - /* remove entry */ - spin_lock(&entry_bin->hb_lock); - tsd_hash_del(table, entry); - hlist_add_head(&entry->he_list, &work); - spin_unlock(&entry_bin->hb_lock); - - /* if pid_entry is indeed pid_entry, then remove it if it's empty */ - if (pid_entry->he_key == PID_KEY && - list_empty(&pid_entry->he_pid_list)) { - hash = hash_long((ulong_t)pid_entry->he_key * - (ulong_t)pid_entry->he_pid, table->ht_bits); - pid_entry_bin = &table->ht_bins[hash]; - - spin_lock(&pid_entry_bin->hb_lock); - tsd_hash_del(table, pid_entry); - hlist_add_head(&pid_entry->he_list, &work); - spin_unlock(&pid_entry_bin->hb_lock); - } - - spin_unlock(&table->ht_lock); - - tsd_hash_dtor(&work); -} - -/* - * tsd_set - set thread specific data - * @key: lookup key - * @value: value to set - * - * Caller must prevent racing tsd_create() or tsd_destroy(), protected - * from racing tsd_get() or tsd_set() because it is thread specific. - * This function has been optimized to be fast for the update case. 
- * When setting the tsd initially it will be slower due to additional - * required locking and potential memory allocations. - */ -int -tsd_set(uint_t key, void *value) -{ - tsd_hash_table_t *table; - tsd_hash_entry_t *entry; - pid_t pid; - int rc; - /* mark remove if value is NULL */ - boolean_t remove = (value == NULL); - - table = tsd_hash_table; - pid = curthread->pid; - ASSERT3P(table, !=, NULL); - - if ((key == 0) || (key > TSD_KEYS_MAX)) - return (EINVAL); - - /* Entry already exists in hash table update value */ - entry = tsd_hash_search(table, key, pid); - if (entry) { - entry->he_value = value; - /* remove the entry */ - if (remove) - tsd_remove_entry(entry); - return (0); - } - - /* don't create entry if value is NULL */ - if (remove) - return (0); - - /* Add a process entry to the hash if not yet exists */ - entry = tsd_hash_search(table, PID_KEY, pid); - if (entry == NULL) { - rc = tsd_hash_add_pid(table, pid); - if (rc) - return (rc); - } - - rc = tsd_hash_add(table, key, pid, value); - return (rc); -} -EXPORT_SYMBOL(tsd_set); - -/* - * tsd_get - get thread specific data - * @key: lookup key - * - * Caller must prevent racing tsd_create() or tsd_destroy(). This - * implementation is designed to be fast and scalable, it does not - * lock the entire table only a single hash bin. - */ -void * -tsd_get(uint_t key) -{ - tsd_hash_entry_t *entry; - - ASSERT3P(tsd_hash_table, !=, NULL); - - if ((key == 0) || (key > TSD_KEYS_MAX)) - return (NULL); - - entry = tsd_hash_search(tsd_hash_table, key, curthread->pid); - if (entry == NULL) - return (NULL); - - return (entry->he_value); -} -EXPORT_SYMBOL(tsd_get); - -/* - * tsd_get_by_thread - get thread specific data for specified thread - * @key: lookup key - * @thread: thread to lookup - * - * Caller must prevent racing tsd_create() or tsd_destroy(). This - * implementation is designed to be fast and scalable, it does not - * lock the entire table only a single hash bin. - */ -void * -tsd_get_by_thread(uint_t key, kthread_t *thread) -{ - tsd_hash_entry_t *entry; - - ASSERT3P(tsd_hash_table, !=, NULL); - - if ((key == 0) || (key > TSD_KEYS_MAX)) - return (NULL); - - entry = tsd_hash_search(tsd_hash_table, key, thread->pid); - if (entry == NULL) - return (NULL); - - return (entry->he_value); -} -EXPORT_SYMBOL(tsd_get_by_thread); - -/* - * tsd_create - create thread specific data key - * @keyp: lookup key address - * @dtor: destructor called during tsd_destroy() or tsd_exit() - * - * Provided key must be set to 0 or it assumed to be already in use. - * The dtor is allowed to be NULL in which case no additional cleanup - * for the data is performed during tsd_destroy() or tsd_exit(). - * - * Caller must prevent racing tsd_set() or tsd_get(), this function is - * safe from racing tsd_create(), tsd_destroy(), and tsd_exit(). - */ -void -tsd_create(uint_t *keyp, dtor_func_t dtor) -{ - ASSERT3P(keyp, !=, NULL); - if (*keyp) - return; - - (void) tsd_hash_add_key(tsd_hash_table, keyp, dtor); -} -EXPORT_SYMBOL(tsd_create); - -/* - * tsd_destroy - destroy thread specific data - * @keyp: lookup key address - * - * Destroys the thread specific data on all threads which use this key. - * - * Caller must prevent racing tsd_set() or tsd_get(), this function is - * safe from racing tsd_create(), tsd_destroy(), and tsd_exit(). 
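- * Taken together, the interfaces above are typically used as in the
- * following sketch; my_key, my_dtor, and my_state_t are illustrative
- * names only:
- *
- *     static uint_t my_key = 0;
- *
- *     static void
- *     my_dtor(void *arg)
- *     {
- *             kmem_free(arg, sizeof (my_state_t));
- *     }
- *
- *     tsd_create(&my_key, my_dtor);
- *
- *     my_state_t *s = kmem_zalloc(sizeof (*s), KM_SLEEP);
- *     VERIFY0(tsd_set(my_key, s));
- *     s = tsd_get(my_key);
- *     ...
- *     tsd_destroy(&my_key);
- *
- * tsd_destroy() invokes my_dtor() on every thread's remaining value
- * before clearing the key back to 0.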
- */ -void -tsd_destroy(uint_t *keyp) -{ - HLIST_HEAD(work); - tsd_hash_table_t *table; - tsd_hash_entry_t *dtor_entry, *entry; - tsd_hash_bin_t *dtor_entry_bin, *entry_bin; - ulong_t hash; - - table = tsd_hash_table; - ASSERT3P(table, !=, NULL); - - spin_lock(&table->ht_lock); - dtor_entry = tsd_hash_search(table, *keyp, DTOR_PID); - if (dtor_entry == NULL) { - spin_unlock(&table->ht_lock); - return; - } - - /* - * All threads which use this key must be linked off of the - * DTOR_PID entry. They are removed from the hash table and - * linked in to a private working list to be destroyed. - */ - while (!list_empty(&dtor_entry->he_key_list)) { - entry = list_entry(dtor_entry->he_key_list.next, - tsd_hash_entry_t, he_key_list); - ASSERT3U(dtor_entry->he_key, ==, entry->he_key); - ASSERT3P(dtor_entry->he_dtor, ==, entry->he_dtor); - - hash = hash_long((ulong_t)entry->he_key * - (ulong_t)entry->he_pid, table->ht_bits); - entry_bin = &table->ht_bins[hash]; - - spin_lock(&entry_bin->hb_lock); - tsd_hash_del(table, entry); - hlist_add_head(&entry->he_list, &work); - spin_unlock(&entry_bin->hb_lock); - } - - hash = hash_long((ulong_t)dtor_entry->he_key * - (ulong_t)dtor_entry->he_pid, table->ht_bits); - dtor_entry_bin = &table->ht_bins[hash]; - - spin_lock(&dtor_entry_bin->hb_lock); - tsd_hash_del(table, dtor_entry); - hlist_add_head(&dtor_entry->he_list, &work); - spin_unlock(&dtor_entry_bin->hb_lock); - spin_unlock(&table->ht_lock); - - tsd_hash_dtor(&work); - *keyp = 0; -} -EXPORT_SYMBOL(tsd_destroy); - -/* - * tsd_exit - destroys all thread specific data for this thread - * - * Destroys all the thread specific data for this thread. - * - * Caller must prevent racing tsd_set() or tsd_get(), this function is - * safe from racing tsd_create(), tsd_destroy(), and tsd_exit(). - */ -void -tsd_exit(void) -{ - HLIST_HEAD(work); - tsd_hash_table_t *table; - tsd_hash_entry_t *pid_entry, *entry; - tsd_hash_bin_t *pid_entry_bin, *entry_bin; - ulong_t hash; - - table = tsd_hash_table; - ASSERT3P(table, !=, NULL); - - spin_lock(&table->ht_lock); - pid_entry = tsd_hash_search(table, PID_KEY, curthread->pid); - if (pid_entry == NULL) { - spin_unlock(&table->ht_lock); - return; - } - - /* - * All keys associated with this pid must be linked off of the - * PID_KEY entry. They are removed from the hash table and - * linked in to a private working list to be destroyed. 
- */ - - while (!list_empty(&pid_entry->he_pid_list)) { - entry = list_entry(pid_entry->he_pid_list.next, - tsd_hash_entry_t, he_pid_list); - ASSERT3U(pid_entry->he_pid, ==, entry->he_pid); - - hash = hash_long((ulong_t)entry->he_key * - (ulong_t)entry->he_pid, table->ht_bits); - entry_bin = &table->ht_bins[hash]; - - spin_lock(&entry_bin->hb_lock); - tsd_hash_del(table, entry); - hlist_add_head(&entry->he_list, &work); - spin_unlock(&entry_bin->hb_lock); - } - - hash = hash_long((ulong_t)pid_entry->he_key * - (ulong_t)pid_entry->he_pid, table->ht_bits); - pid_entry_bin = &table->ht_bins[hash]; - - spin_lock(&pid_entry_bin->hb_lock); - tsd_hash_del(table, pid_entry); - hlist_add_head(&pid_entry->he_list, &work); - spin_unlock(&pid_entry_bin->hb_lock); - spin_unlock(&table->ht_lock); - - tsd_hash_dtor(&work); -} -EXPORT_SYMBOL(tsd_exit); - -int -spl_tsd_init(void) -{ - tsd_hash_table = tsd_hash_table_init(TSD_HASH_TABLE_BITS_DEFAULT); - if (tsd_hash_table == NULL) - return (1); - - return (0); -} - -void -spl_tsd_fini(void) -{ - tsd_hash_table_fini(tsd_hash_table); - tsd_hash_table = NULL; -} diff --git a/module/spl/spl-vmem.c b/module/spl/spl-vmem.c deleted file mode 100644 index e1a84a911..000000000 --- a/module/spl/spl-vmem.c +++ /dev/null @@ -1,135 +0,0 @@ -/* - * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. - * Copyright (C) 2007 The Regents of the University of California. - * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). - * Written by Brian Behlendorf <[email protected]>. - * UCRL-CODE-235197 - * - * This file is part of the SPL, Solaris Porting Layer. - * For details, see <http://zfsonlinux.org/>. - * - * The SPL is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the - * Free Software Foundation; either version 2 of the License, or (at your - * option) any later version. - * - * The SPL is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - * for more details. - * - * You should have received a copy of the GNU General Public License along - * with the SPL. If not, see <http://www.gnu.org/licenses/>. - */ - -#include <sys/debug.h> -#include <sys/vmem.h> -#include <sys/kmem_cache.h> -#include <sys/shrinker.h> -#include <linux/module.h> - -vmem_t *heap_arena = NULL; -EXPORT_SYMBOL(heap_arena); - -vmem_t *zio_alloc_arena = NULL; -EXPORT_SYMBOL(zio_alloc_arena); - -vmem_t *zio_arena = NULL; -EXPORT_SYMBOL(zio_arena); - -#define VMEM_FLOOR_SIZE (4 * 1024 * 1024) /* 4MB floor */ - -/* - * Return approximate virtual memory usage based on these assumptions: - * - * 1) The major SPL consumer of virtual memory is the kmem cache. - * 2) Memory allocated with vmem_alloc() is short lived and can be ignored. - * 3) Allow a 4MB floor as a generous pad given normal consumption. - * 4) The spl_kmem_cache_sem only contends with cache create/destroy. 
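Read against those assumptions, the typemask contract implemented just below is: both flags together return the total vmalloc space, VMEM_ALLOC returns the (floored) estimate of kmem-cache consumption, and VMEM_FREE returns the remainder. A hedged sketch of a caller; the helper name is invented and the arena argument is ignored by this implementation.

#include <sys/vmem.h>
#include <linux/kernel.h>

static void
example_vmem_report(void)
{
	size_t total = vmem_size(NULL, VMEM_ALLOC | VMEM_FREE);	/* VMALLOC_TOTAL */
	size_t used = vmem_size(NULL, VMEM_ALLOC);	/* estimate, >= 4MB floor */
	size_t avail = vmem_size(NULL, VMEM_FREE);	/* total minus the estimate */

	printk(KERN_INFO "vmem: %zu of %zu bytes in use, %zu available\n",
	    used, total, avail);
}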
- */ -size_t -vmem_size(vmem_t *vmp, int typemask) -{ - spl_kmem_cache_t *skc; - size_t alloc = VMEM_FLOOR_SIZE; - - if ((typemask & VMEM_ALLOC) && (typemask & VMEM_FREE)) - return (VMALLOC_TOTAL); - - - down_read(&spl_kmem_cache_sem); - list_for_each_entry(skc, &spl_kmem_cache_list, skc_list) { - if (skc->skc_flags & KMC_VMEM) - alloc += skc->skc_slab_size * skc->skc_slab_total; - } - up_read(&spl_kmem_cache_sem); - - if (typemask & VMEM_ALLOC) - return (MIN(alloc, VMALLOC_TOTAL)); - else if (typemask & VMEM_FREE) - return (MAX(VMALLOC_TOTAL - alloc, 0)); - else - return (0); -} -EXPORT_SYMBOL(vmem_size); - -/* - * Public vmem_alloc(), vmem_zalloc() and vmem_free() interfaces. - */ -void * -spl_vmem_alloc(size_t size, int flags, const char *func, int line) -{ - ASSERT0(flags & ~KM_PUBLIC_MASK); - - flags |= KM_VMEM; - -#if !defined(DEBUG_KMEM) - return (spl_kmem_alloc_impl(size, flags, NUMA_NO_NODE)); -#elif !defined(DEBUG_KMEM_TRACKING) - return (spl_kmem_alloc_debug(size, flags, NUMA_NO_NODE)); -#else - return (spl_kmem_alloc_track(size, flags, func, line, NUMA_NO_NODE)); -#endif -} -EXPORT_SYMBOL(spl_vmem_alloc); - -void * -spl_vmem_zalloc(size_t size, int flags, const char *func, int line) -{ - ASSERT0(flags & ~KM_PUBLIC_MASK); - - flags |= (KM_VMEM | KM_ZERO); - -#if !defined(DEBUG_KMEM) - return (spl_kmem_alloc_impl(size, flags, NUMA_NO_NODE)); -#elif !defined(DEBUG_KMEM_TRACKING) - return (spl_kmem_alloc_debug(size, flags, NUMA_NO_NODE)); -#else - return (spl_kmem_alloc_track(size, flags, func, line, NUMA_NO_NODE)); -#endif -} -EXPORT_SYMBOL(spl_vmem_zalloc); - -void -spl_vmem_free(const void *buf, size_t size) -{ -#if !defined(DEBUG_KMEM) - return (spl_kmem_free_impl(buf, size)); -#elif !defined(DEBUG_KMEM_TRACKING) - return (spl_kmem_free_debug(buf, size)); -#else - return (spl_kmem_free_track(buf, size)); -#endif -} -EXPORT_SYMBOL(spl_vmem_free); - -int -spl_vmem_init(void) -{ - return (0); -} - -void -spl_vmem_fini(void) -{ -} diff --git a/module/spl/spl-vnode.c b/module/spl/spl-vnode.c deleted file mode 100644 index d9056c964..000000000 --- a/module/spl/spl-vnode.c +++ /dev/null @@ -1,719 +0,0 @@ -/* - * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. - * Copyright (C) 2007 The Regents of the University of California. - * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). - * Written by Brian Behlendorf <[email protected]>. - * UCRL-CODE-235197 - * - * This file is part of the SPL, Solaris Porting Layer. - * For details, see <http://zfsonlinux.org/>. - * - * The SPL is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the - * Free Software Foundation; either version 2 of the License, or (at your - * option) any later version. - * - * The SPL is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - * for more details. - * - * You should have received a copy of the GNU General Public License along - * with the SPL. If not, see <http://www.gnu.org/licenses/>. - * - * Solaris Porting Layer (SPL) Vnode Implementation. 
- */ - -#include <sys/cred.h> -#include <sys/vnode.h> -#include <sys/kmem_cache.h> -#include <linux/falloc.h> -#include <linux/fs.h> -#include <linux/uaccess.h> -#ifdef HAVE_FDTABLE_HEADER -#include <linux/fdtable.h> -#endif - -vnode_t *rootdir = (vnode_t *)0xabcd1234; -EXPORT_SYMBOL(rootdir); - -static spl_kmem_cache_t *vn_cache; -static spl_kmem_cache_t *vn_file_cache; - -static spinlock_t vn_file_lock; -static LIST_HEAD(vn_file_list); - -static int -spl_filp_fallocate(struct file *fp, int mode, loff_t offset, loff_t len) -{ - int error = -EOPNOTSUPP; - -#ifdef HAVE_FILE_FALLOCATE - if (fp->f_op->fallocate) - error = fp->f_op->fallocate(fp, mode, offset, len); -#else -#ifdef HAVE_INODE_FALLOCATE - if (fp->f_dentry && fp->f_dentry->d_inode && - fp->f_dentry->d_inode->i_op->fallocate) - error = fp->f_dentry->d_inode->i_op->fallocate( - fp->f_dentry->d_inode, mode, offset, len); -#endif /* HAVE_INODE_FALLOCATE */ -#endif /* HAVE_FILE_FALLOCATE */ - - return (error); -} - -static int -spl_filp_fsync(struct file *fp, int sync) -{ -#ifdef HAVE_2ARGS_VFS_FSYNC - return (vfs_fsync(fp, sync)); -#else - return (vfs_fsync(fp, (fp)->f_dentry, sync)); -#endif /* HAVE_2ARGS_VFS_FSYNC */ -} - -static ssize_t -spl_kernel_write(struct file *file, const void *buf, size_t count, loff_t *pos) -{ -#if defined(HAVE_KERNEL_WRITE_PPOS) - return (kernel_write(file, buf, count, pos)); -#else - mm_segment_t saved_fs; - ssize_t ret; - - saved_fs = get_fs(); - set_fs(KERNEL_DS); - - ret = vfs_write(file, (__force const char __user *)buf, count, pos); - - set_fs(saved_fs); - - return (ret); -#endif -} - -static ssize_t -spl_kernel_read(struct file *file, void *buf, size_t count, loff_t *pos) -{ -#if defined(HAVE_KERNEL_READ_PPOS) - return (kernel_read(file, buf, count, pos)); -#else - mm_segment_t saved_fs; - ssize_t ret; - - saved_fs = get_fs(); - set_fs(KERNEL_DS); - - ret = vfs_read(file, (void __user *)buf, count, pos); - - set_fs(saved_fs); - - return (ret); -#endif -} - -vtype_t -vn_mode_to_vtype(mode_t mode) -{ - if (S_ISREG(mode)) - return (VREG); - - if (S_ISDIR(mode)) - return (VDIR); - - if (S_ISCHR(mode)) - return (VCHR); - - if (S_ISBLK(mode)) - return (VBLK); - - if (S_ISFIFO(mode)) - return (VFIFO); - - if (S_ISLNK(mode)) - return (VLNK); - - if (S_ISSOCK(mode)) - return (VSOCK); - - return (VNON); -} /* vn_mode_to_vtype() */ -EXPORT_SYMBOL(vn_mode_to_vtype); - -mode_t -vn_vtype_to_mode(vtype_t vtype) -{ - if (vtype == VREG) - return (S_IFREG); - - if (vtype == VDIR) - return (S_IFDIR); - - if (vtype == VCHR) - return (S_IFCHR); - - if (vtype == VBLK) - return (S_IFBLK); - - if (vtype == VFIFO) - return (S_IFIFO); - - if (vtype == VLNK) - return (S_IFLNK); - - if (vtype == VSOCK) - return (S_IFSOCK); - - return (VNON); -} /* vn_vtype_to_mode() */ -EXPORT_SYMBOL(vn_vtype_to_mode); - -vnode_t * -vn_alloc(int flag) -{ - vnode_t *vp; - - vp = kmem_cache_alloc(vn_cache, flag); - if (vp != NULL) { - vp->v_file = NULL; - vp->v_type = 0; - } - - return (vp); -} /* vn_alloc() */ -EXPORT_SYMBOL(vn_alloc); - -void -vn_free(vnode_t *vp) -{ - kmem_cache_free(vn_cache, vp); -} /* vn_free() */ -EXPORT_SYMBOL(vn_free); - -int -vn_open(const char *path, uio_seg_t seg, int flags, int mode, vnode_t **vpp, - int x1, void *x2) -{ - struct file *fp; - struct kstat stat; - int rc, saved_umask = 0; - gfp_t saved_gfp; - vnode_t *vp; - - ASSERT(flags & (FWRITE | FREAD)); - ASSERT(seg == UIO_SYSSPACE); - ASSERT(vpp); - *vpp = NULL; - - if (!(flags & FCREAT) && (flags & FWRITE)) - flags |= FEXCL; - - /* - * Note for filp_open() 
the two low bits must be remapped to mean:
- *	01 - read-only  ->  00 read-only
- *	10 - write-only ->  01 write-only
- *	11 - read-write ->  10 read-write
- */
-	flags--;
-
-	if (flags & FCREAT)
-		saved_umask = xchg(&current->fs->umask, 0);
-
-	fp = filp_open(path, flags, mode);
-
-	if (flags & FCREAT)
-		(void) xchg(&current->fs->umask, saved_umask);
-
-	if (IS_ERR(fp))
-		return (-PTR_ERR(fp));
-
-#if defined(HAVE_4ARGS_VFS_GETATTR)
-	rc = vfs_getattr(&fp->f_path, &stat, STATX_TYPE, AT_STATX_SYNC_AS_STAT);
-#elif defined(HAVE_2ARGS_VFS_GETATTR)
-	rc = vfs_getattr(&fp->f_path, &stat);
-#else
-	rc = vfs_getattr(fp->f_path.mnt, fp->f_dentry, &stat);
-#endif
-	if (rc) {
-		filp_close(fp, 0);
-		return (-rc);
-	}
-
-	vp = vn_alloc(KM_SLEEP);
-	if (!vp) {
-		filp_close(fp, 0);
-		return (ENOMEM);
-	}
-
-	saved_gfp = mapping_gfp_mask(fp->f_mapping);
-	mapping_set_gfp_mask(fp->f_mapping, saved_gfp & ~(__GFP_IO|__GFP_FS));
-
-	mutex_enter(&vp->v_lock);
-	vp->v_type = vn_mode_to_vtype(stat.mode);
-	vp->v_file = fp;
-	vp->v_gfp_mask = saved_gfp;
-	*vpp = vp;
-	mutex_exit(&vp->v_lock);
-
-	return (0);
-} /* vn_open() */
-EXPORT_SYMBOL(vn_open);
-
-int
-vn_openat(const char *path, uio_seg_t seg, int flags, int mode,
-    vnode_t **vpp, int x1, void *x2, vnode_t *vp, int fd)
-{
-	char *realpath;
-	int len, rc;
-
-	ASSERT(vp == rootdir);
-
-	len = strlen(path) + 2;
-	realpath = kmalloc(len, kmem_flags_convert(KM_SLEEP));
-	if (!realpath)
-		return (ENOMEM);
-
-	(void) snprintf(realpath, len, "/%s", path);
-	rc = vn_open(realpath, seg, flags, mode, vpp, x1, x2);
-	kfree(realpath);
-
-	return (rc);
-} /* vn_openat() */
-EXPORT_SYMBOL(vn_openat);
-
-int
-vn_rdwr(uio_rw_t uio, vnode_t *vp, void *addr, ssize_t len, offset_t off,
-    uio_seg_t seg, int ioflag, rlim64_t x2, void *x3, ssize_t *residp)
-{
-	struct file *fp = vp->v_file;
-	loff_t offset = off;
-	int rc;
-
-	ASSERT(uio == UIO_WRITE || uio == UIO_READ);
-	ASSERT(seg == UIO_SYSSPACE);
-	ASSERT((ioflag & ~FAPPEND) == 0);
-
-	if (ioflag & FAPPEND)
-		offset = fp->f_pos;
-
-	if (uio & UIO_WRITE)
-		rc = spl_kernel_write(fp, addr, len, &offset);
-	else
-		rc = spl_kernel_read(fp, addr, len, &offset);
-
-	fp->f_pos = offset;
-
-	if (rc < 0)
-		return (-rc);
-
-	if (residp) {
-		*residp = len - rc;
-	} else {
-		if (rc != len)
-			return (EIO);
-	}
-
-	return (0);
-} /* vn_rdwr() */
-EXPORT_SYMBOL(vn_rdwr);
-
-int
-vn_close(vnode_t *vp, int flags, int x1, int x2, void *x3, void *x4)
-{
-	int rc;
-
-	ASSERT(vp);
-	ASSERT(vp->v_file);
-
-	mapping_set_gfp_mask(vp->v_file->f_mapping, vp->v_gfp_mask);
-	rc = filp_close(vp->v_file, 0);
-	vn_free(vp);
-
-	return (-rc);
-} /* vn_close() */
-EXPORT_SYMBOL(vn_close);
-
-/*
- * vn_seek() does not actually seek it only performs bounds checking on the
- * proposed seek.  We perform minimal checking and allow vn_rdwr() to catch
- * anything more serious.
- */
-int
-vn_seek(vnode_t *vp, offset_t ooff, offset_t *noffp, void *ct)
-{
-	return ((*noffp < 0 || *noffp > MAXOFFSET_T) ?
EINVAL : 0); -} -EXPORT_SYMBOL(vn_seek); - -int -vn_getattr(vnode_t *vp, vattr_t *vap, int flags, void *x3, void *x4) -{ - struct file *fp; - struct kstat stat; - int rc; - - ASSERT(vp); - ASSERT(vp->v_file); - ASSERT(vap); - - fp = vp->v_file; - -#if defined(HAVE_4ARGS_VFS_GETATTR) - rc = vfs_getattr(&fp->f_path, &stat, STATX_BASIC_STATS, - AT_STATX_SYNC_AS_STAT); -#elif defined(HAVE_2ARGS_VFS_GETATTR) - rc = vfs_getattr(&fp->f_path, &stat); -#else - rc = vfs_getattr(fp->f_path.mnt, fp->f_dentry, &stat); -#endif - if (rc) - return (-rc); - - vap->va_type = vn_mode_to_vtype(stat.mode); - vap->va_mode = stat.mode; - vap->va_uid = KUID_TO_SUID(stat.uid); - vap->va_gid = KGID_TO_SGID(stat.gid); - vap->va_fsid = 0; - vap->va_nodeid = stat.ino; - vap->va_nlink = stat.nlink; - vap->va_size = stat.size; - vap->va_blksize = stat.blksize; - vap->va_atime = stat.atime; - vap->va_mtime = stat.mtime; - vap->va_ctime = stat.ctime; - vap->va_rdev = stat.rdev; - vap->va_nblocks = stat.blocks; - - return (0); -} -EXPORT_SYMBOL(vn_getattr); - -int -vn_fsync(vnode_t *vp, int flags, void *x3, void *x4) -{ - int datasync = 0; - int error; - int fstrans; - - ASSERT(vp); - ASSERT(vp->v_file); - - if (flags & FDSYNC) - datasync = 1; - - /* - * May enter XFS which generates a warning when PF_FSTRANS is set. - * To avoid this the flag is cleared over vfs_sync() and then reset. - */ - fstrans = __spl_pf_fstrans_check(); - if (fstrans) - current->flags &= ~(__SPL_PF_FSTRANS); - - error = -spl_filp_fsync(vp->v_file, datasync); - if (fstrans) - current->flags |= __SPL_PF_FSTRANS; - - return (error); -} /* vn_fsync() */ -EXPORT_SYMBOL(vn_fsync); - -int vn_space(vnode_t *vp, int cmd, struct flock *bfp, int flag, - offset_t offset, void *x6, void *x7) -{ - int error = EOPNOTSUPP; -#ifdef FALLOC_FL_PUNCH_HOLE - int fstrans; -#endif - - if (cmd != F_FREESP || bfp->l_whence != SEEK_SET) - return (EOPNOTSUPP); - - ASSERT(vp); - ASSERT(vp->v_file); - ASSERT(bfp->l_start >= 0 && bfp->l_len > 0); - -#ifdef FALLOC_FL_PUNCH_HOLE - /* - * May enter XFS which generates a warning when PF_FSTRANS is set. - * To avoid this the flag is cleared over vfs_sync() and then reset. - */ - fstrans = __spl_pf_fstrans_check(); - if (fstrans) - current->flags &= ~(__SPL_PF_FSTRANS); - - /* - * When supported by the underlying file system preferentially - * use the fallocate() callback to preallocate the space. - */ - error = -spl_filp_fallocate(vp->v_file, - FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, - bfp->l_start, bfp->l_len); - - if (fstrans) - current->flags |= __SPL_PF_FSTRANS; - - if (error == 0) - return (0); -#endif - -#ifdef HAVE_INODE_TRUNCATE_RANGE - if (vp->v_file->f_dentry && vp->v_file->f_dentry->d_inode && - vp->v_file->f_dentry->d_inode->i_op && - vp->v_file->f_dentry->d_inode->i_op->truncate_range) { - off_t end = bfp->l_start + bfp->l_len; - /* - * Judging from the code in shmem_truncate_range(), - * it seems the kernel expects the end offset to be - * inclusive and aligned to the end of a page. 
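Taken together, vn_open(), vn_rdwr() and vn_close() above follow the Solaris calling convention, with several placeholder arguments that this port ignores. A minimal read-path sketch; the path, buffer and helper name are illustrative only, not part of this file.

#include <sys/vnode.h>

static int
example_vn_read(void *buf, ssize_t len)
{
	vnode_t *vp;
	ssize_t resid = 0;
	int error;

	error = vn_open("/etc/hostid", UIO_SYSSPACE, FREAD, 0, &vp, 0, NULL);
	if (error)
		return (error);

	error = vn_rdwr(UIO_READ, vp, buf, len, 0, UIO_SYSSPACE, 0,
	    RLIM64_INFINITY, NULL, &resid);

	(void) vn_close(vp, FREAD, 0, 0, NULL, NULL);

	return (error);
}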
- */ - if (end % PAGE_SIZE != 0) { - end &= ~(off_t)(PAGE_SIZE - 1); - if (end <= bfp->l_start) - return (0); - } - --end; - - vp->v_file->f_dentry->d_inode->i_op->truncate_range( - vp->v_file->f_dentry->d_inode, bfp->l_start, end); - - return (0); - } -#endif - - return (error); -} -EXPORT_SYMBOL(vn_space); - -/* Function must be called while holding the vn_file_lock */ -static file_t * -file_find(int fd, struct task_struct *task) -{ - file_t *fp; - - list_for_each_entry(fp, &vn_file_list, f_list) { - if (fd == fp->f_fd && fp->f_task == task) { - ASSERT(atomic_read(&fp->f_ref) != 0); - return (fp); - } - } - - return (NULL); -} /* file_find() */ - -file_t * -vn_getf(int fd) -{ - struct kstat stat; - struct file *lfp; - file_t *fp; - vnode_t *vp; - int rc = 0; - - if (fd < 0) - return (NULL); - - /* Already open just take an extra reference */ - spin_lock(&vn_file_lock); - - fp = file_find(fd, current); - if (fp) { - lfp = fget(fd); - fput(fp->f_file); - /* - * areleasef() can cause us to see a stale reference when - * userspace has reused a file descriptor before areleasef() - * has run. fput() the stale reference and replace it. We - * retain the original reference count such that the concurrent - * areleasef() will decrement its reference and terminate. - */ - if (lfp != fp->f_file) { - fp->f_file = lfp; - fp->f_vnode->v_file = lfp; - } - atomic_inc(&fp->f_ref); - spin_unlock(&vn_file_lock); - return (fp); - } - - spin_unlock(&vn_file_lock); - - /* File was not yet opened create the object and setup */ - fp = kmem_cache_alloc(vn_file_cache, KM_SLEEP); - if (fp == NULL) - goto out; - - mutex_enter(&fp->f_lock); - - fp->f_fd = fd; - fp->f_task = current; - fp->f_offset = 0; - atomic_inc(&fp->f_ref); - - lfp = fget(fd); - if (lfp == NULL) - goto out_mutex; - - vp = vn_alloc(KM_SLEEP); - if (vp == NULL) - goto out_fget; - -#if defined(HAVE_4ARGS_VFS_GETATTR) - rc = vfs_getattr(&lfp->f_path, &stat, STATX_TYPE, - AT_STATX_SYNC_AS_STAT); -#elif defined(HAVE_2ARGS_VFS_GETATTR) - rc = vfs_getattr(&lfp->f_path, &stat); -#else - rc = vfs_getattr(lfp->f_path.mnt, lfp->f_dentry, &stat); -#endif - if (rc) - goto out_vnode; - - mutex_enter(&vp->v_lock); - vp->v_type = vn_mode_to_vtype(stat.mode); - vp->v_file = lfp; - mutex_exit(&vp->v_lock); - - fp->f_vnode = vp; - fp->f_file = lfp; - - /* Put it on the tracking list */ - spin_lock(&vn_file_lock); - list_add(&fp->f_list, &vn_file_list); - spin_unlock(&vn_file_lock); - - mutex_exit(&fp->f_lock); - return (fp); - -out_vnode: - vn_free(vp); -out_fget: - fput(lfp); -out_mutex: - mutex_exit(&fp->f_lock); - kmem_cache_free(vn_file_cache, fp); -out: - return (NULL); -} /* getf() */ -EXPORT_SYMBOL(getf); - -static void releasef_locked(file_t *fp) -{ - ASSERT(fp->f_file); - ASSERT(fp->f_vnode); - - /* Unlinked from list, no refs, safe to free outside mutex */ - fput(fp->f_file); - vn_free(fp->f_vnode); - - kmem_cache_free(vn_file_cache, fp); -} - -void -vn_releasef(int fd) -{ - areleasef(fd, P_FINFO(current)); -} -EXPORT_SYMBOL(releasef); - -void -vn_areleasef(int fd, uf_info_t *fip) -{ - file_t *fp; - struct task_struct *task = (struct task_struct *)fip; - - if (fd < 0) - return; - - spin_lock(&vn_file_lock); - fp = file_find(fd, task); - if (fp) { - atomic_dec(&fp->f_ref); - if (atomic_read(&fp->f_ref) > 0) { - spin_unlock(&vn_file_lock); - return; - } - - list_del(&fp->f_list); - releasef_locked(fp); - } - spin_unlock(&vn_file_lock); -} /* releasef() */ -EXPORT_SYMBOL(areleasef); - -static int -vn_cache_constructor(void *buf, void *cdrarg, int kmflags) -{ 
- struct vnode *vp = buf; - - mutex_init(&vp->v_lock, NULL, MUTEX_DEFAULT, NULL); - - return (0); -} /* vn_cache_constructor() */ - -static void -vn_cache_destructor(void *buf, void *cdrarg) -{ - struct vnode *vp = buf; - - mutex_destroy(&vp->v_lock); -} /* vn_cache_destructor() */ - -static int -vn_file_cache_constructor(void *buf, void *cdrarg, int kmflags) -{ - file_t *fp = buf; - - atomic_set(&fp->f_ref, 0); - mutex_init(&fp->f_lock, NULL, MUTEX_DEFAULT, NULL); - INIT_LIST_HEAD(&fp->f_list); - - return (0); -} /* vn_file_cache_constructor() */ - -static void -vn_file_cache_destructor(void *buf, void *cdrarg) -{ - file_t *fp = buf; - - mutex_destroy(&fp->f_lock); -} /* vn_file_cache_destructor() */ - -int -spl_vn_init(void) -{ - spin_lock_init(&vn_file_lock); - - vn_cache = kmem_cache_create("spl_vn_cache", - sizeof (struct vnode), 64, vn_cache_constructor, - vn_cache_destructor, NULL, NULL, NULL, 0); - - vn_file_cache = kmem_cache_create("spl_vn_file_cache", - sizeof (file_t), 64, vn_file_cache_constructor, - vn_file_cache_destructor, NULL, NULL, NULL, 0); - - return (0); -} /* spl_vn_init() */ - -void -spl_vn_fini(void) -{ - file_t *fp, *next_fp; - int leaked = 0; - - spin_lock(&vn_file_lock); - - list_for_each_entry_safe(fp, next_fp, &vn_file_list, f_list) { - list_del(&fp->f_list); - releasef_locked(fp); - leaked++; - } - - spin_unlock(&vn_file_lock); - - if (leaked > 0) - printk(KERN_WARNING "WARNING: %d vnode files leaked\n", leaked); - - kmem_cache_destroy(vn_file_cache); - kmem_cache_destroy(vn_cache); -} /* spl_vn_fini() */ diff --git a/module/spl/spl-xdr.c b/module/spl/spl-xdr.c deleted file mode 100644 index 1dd31ffc1..000000000 --- a/module/spl/spl-xdr.c +++ /dev/null @@ -1,513 +0,0 @@ -/* - * Copyright (c) 2008-2010 Sun Microsystems, Inc. - * Written by Ricardo Correia <[email protected]> - * - * This file is part of the SPL, Solaris Porting Layer. - * For details, see <http://zfsonlinux.org/>. - * - * The SPL is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the - * Free Software Foundation; either version 2 of the License, or (at your - * option) any later version. - * - * The SPL is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - * for more details. - * - * You should have received a copy of the GNU General Public License along - * with the SPL. If not, see <http://www.gnu.org/licenses/>. - * - * Solaris Porting Layer (SPL) XDR Implementation. - */ - -#include <linux/string.h> -#include <sys/kmem.h> -#include <sys/debug.h> -#include <sys/types.h> -#include <sys/sysmacros.h> -#include <rpc/xdr.h> - -/* - * SPL's XDR mem implementation. - * - * This is used by libnvpair to serialize/deserialize the name-value pair data - * structures into byte arrays in a well-defined and portable manner. - * - * These data structures are used by the DMU/ZFS to flexibly manipulate various - * information in memory and later serialize it/deserialize it to disk. - * Examples of usages include the pool configuration, lists of pool and dataset - * properties, etc. - * - * Reference documentation for the XDR representation and XDR operations can be - * found in RFC 1832 and xdr(3), respectively. 
- * - * === Implementation shortcomings === - * - * It is assumed that the following C types have the following sizes: - * - * char/unsigned char: 1 byte - * short/unsigned short: 2 bytes - * int/unsigned int: 4 bytes - * longlong_t/u_longlong_t: 8 bytes - * - * The C standard allows these types to be larger (and in the case of ints, - * shorter), so if that is the case on some compiler/architecture, the build - * will fail (on purpose). - * - * If someone wants to fix the code to work properly on such environments, then: - * - * 1) Preconditions should be added to xdrmem_enc functions to make sure the - * caller doesn't pass arguments which exceed the expected range. - * 2) Functions which take signed integers should be changed to properly do - * sign extension. - * 3) For ints with less than 32 bits, well.. I suspect you'll have bigger - * problems than this implementation. - * - * It is also assumed that: - * - * 1) Chars have 8 bits. - * 2) We can always do 32-bit-aligned int memory accesses and byte-aligned - * memcpy, memset and memcmp. - * 3) Arrays passed to xdr_array() are packed and the compiler/architecture - * supports element-sized-aligned memory accesses. - * 4) Negative integers are natively stored in two's complement binary - * representation. - * - * No checks are done for the 4 assumptions above, though. - * - * === Caller expectations === - * - * Existing documentation does not describe the semantics of XDR operations very - * well. Therefore, some assumptions about failure semantics will be made and - * will be described below: - * - * 1) If any encoding operation fails (e.g., due to lack of buffer space), the - * the stream should be considered valid only up to the encoding operation - * previous to the one that first failed. However, the stream size as returned - * by xdr_control() cannot be considered to be strictly correct (it may be - * bigger). - * - * Putting it another way, if there is an encoding failure it's undefined - * whether anything is added to the stream in that operation and therefore - * neither xdr_control() nor future encoding operations on the same stream can - * be relied upon to produce correct results. - * - * 2) If a decoding operation fails, it's undefined whether anything will be - * decoded into passed buffers/pointers during that operation, or what the - * values on those buffers will look like. - * - * Future decoding operations on the same stream will also have similar - * undefined behavior. - * - * 3) When the first decoding operation fails it is OK to trust the results of - * previous decoding operations on the same stream, as long as the caller - * expects a failure to be possible (e.g. due to end-of-stream). - * - * However, this is highly discouraged because the caller should know the - * stream size and should be coded to expect any decoding failure to be data - * corruption due to hardware, accidental or even malicious causes, which should - * be handled gracefully in all cases. - * - * In very rare situations where there are strong reasons to believe the data - * can be trusted to be valid and non-tampered with, then the caller may assume - * a decoding failure to be a bug (e.g. due to mismatched data types) and may - * fail non-gracefully. - * - * 4) Non-zero padding bytes will cause the decoding operation to fail. - * - * 5) Zero bytes on string types will also cause the decoding operation to fail. 
- * - * 6) It is assumed that either the pointer to the stream buffer given by the - * caller is 32-bit aligned or the architecture supports non-32-bit-aligned int - * memory accesses. - * - * 7) The stream buffer and encoding/decoding buffers/ptrs should not overlap. - * - * 8) If a caller passes pointers to non-kernel memory (e.g., pointers to user - * space or MMIO space), the computer may explode. - */ - -static struct xdr_ops xdrmem_encode_ops; -static struct xdr_ops xdrmem_decode_ops; - -void -xdrmem_create(XDR *xdrs, const caddr_t addr, const uint_t size, - const enum xdr_op op) -{ - switch (op) { - case XDR_ENCODE: - xdrs->x_ops = &xdrmem_encode_ops; - break; - case XDR_DECODE: - xdrs->x_ops = &xdrmem_decode_ops; - break; - default: - xdrs->x_ops = NULL; /* Let the caller know we failed */ - return; - } - - xdrs->x_op = op; - xdrs->x_addr = addr; - xdrs->x_addr_end = addr + size; - - if (xdrs->x_addr_end < xdrs->x_addr) { - xdrs->x_ops = NULL; - } -} -EXPORT_SYMBOL(xdrmem_create); - -static bool_t -xdrmem_control(XDR *xdrs, int req, void *info) -{ - struct xdr_bytesrec *rec = (struct xdr_bytesrec *)info; - - if (req != XDR_GET_BYTES_AVAIL) - return (FALSE); - - rec->xc_is_last_record = TRUE; /* always TRUE in xdrmem streams */ - rec->xc_num_avail = xdrs->x_addr_end - xdrs->x_addr; - - return (TRUE); -} - -static bool_t -xdrmem_enc_bytes(XDR *xdrs, caddr_t cp, const uint_t cnt) -{ - uint_t size = roundup(cnt, 4); - uint_t pad; - - if (size < cnt) - return (FALSE); /* Integer overflow */ - - if (xdrs->x_addr > xdrs->x_addr_end) - return (FALSE); - - if (xdrs->x_addr_end - xdrs->x_addr < size) - return (FALSE); - - memcpy(xdrs->x_addr, cp, cnt); - - xdrs->x_addr += cnt; - - pad = size - cnt; - if (pad > 0) { - memset(xdrs->x_addr, 0, pad); - xdrs->x_addr += pad; - } - - return (TRUE); -} - -static bool_t -xdrmem_dec_bytes(XDR *xdrs, caddr_t cp, const uint_t cnt) -{ - static uint32_t zero = 0; - uint_t size = roundup(cnt, 4); - uint_t pad; - - if (size < cnt) - return (FALSE); /* Integer overflow */ - - if (xdrs->x_addr > xdrs->x_addr_end) - return (FALSE); - - if (xdrs->x_addr_end - xdrs->x_addr < size) - return (FALSE); - - memcpy(cp, xdrs->x_addr, cnt); - xdrs->x_addr += cnt; - - pad = size - cnt; - if (pad > 0) { - /* An inverted memchr() would be useful here... */ - if (memcmp(&zero, xdrs->x_addr, pad) != 0) - return (FALSE); - - xdrs->x_addr += pad; - } - - return (TRUE); -} - -static bool_t -xdrmem_enc_uint32(XDR *xdrs, uint32_t val) -{ - if (xdrs->x_addr + sizeof (uint32_t) > xdrs->x_addr_end) - return (FALSE); - - *((uint32_t *)xdrs->x_addr) = cpu_to_be32(val); - - xdrs->x_addr += sizeof (uint32_t); - - return (TRUE); -} - -static bool_t -xdrmem_dec_uint32(XDR *xdrs, uint32_t *val) -{ - if (xdrs->x_addr + sizeof (uint32_t) > xdrs->x_addr_end) - return (FALSE); - - *val = be32_to_cpu(*((uint32_t *)xdrs->x_addr)); - - xdrs->x_addr += sizeof (uint32_t); - - return (TRUE); -} - -static bool_t -xdrmem_enc_char(XDR *xdrs, char *cp) -{ - uint32_t val; - - BUILD_BUG_ON(sizeof (char) != 1); - val = *((unsigned char *) cp); - - return (xdrmem_enc_uint32(xdrs, val)); -} - -static bool_t -xdrmem_dec_char(XDR *xdrs, char *cp) -{ - uint32_t val; - - BUILD_BUG_ON(sizeof (char) != 1); - - if (!xdrmem_dec_uint32(xdrs, &val)) - return (FALSE); - - /* - * If any of the 3 other bytes are non-zero then val will be greater - * than 0xff and we fail because according to the RFC, this block does - * not have a char encoded in it. 
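Consumers reach these routines through the x_ops table that xdrmem_create() installs above (libnvpair does so when serializing nvlists). A hedged round-trip sketch; the buffer size, value and helper name are arbitrary and only meant to show the dispatch.

#include <rpc/xdr.h>
#include <sys/debug.h>

static int
example_xdr_roundtrip(void)
{
	char buf[16];
	XDR xdrs;
	unsigned in = 0xdeadbeef, out = 0;

	/* encode one 32-bit value into the stream buffer */
	xdrmem_create(&xdrs, buf, sizeof (buf), XDR_ENCODE);
	if (xdrs.x_ops == NULL || !xdrs.x_ops->xdr_u_int(&xdrs, &in))
		return (EINVAL);

	/* decode it back through the decode ops table */
	xdrmem_create(&xdrs, buf, sizeof (buf), XDR_DECODE);
	if (xdrs.x_ops == NULL || !xdrs.x_ops->xdr_u_int(&xdrs, &out))
		return (EINVAL);

	ASSERT3U(out, ==, in);
	return (0);
}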
- */ - if (val > 0xff) - return (FALSE); - - *((unsigned char *) cp) = val; - - return (TRUE); -} - -static bool_t -xdrmem_enc_ushort(XDR *xdrs, unsigned short *usp) -{ - BUILD_BUG_ON(sizeof (unsigned short) != 2); - - return (xdrmem_enc_uint32(xdrs, *usp)); -} - -static bool_t -xdrmem_dec_ushort(XDR *xdrs, unsigned short *usp) -{ - uint32_t val; - - BUILD_BUG_ON(sizeof (unsigned short) != 2); - - if (!xdrmem_dec_uint32(xdrs, &val)) - return (FALSE); - - /* - * Short ints are not in the RFC, but we assume similar logic as in - * xdrmem_dec_char(). - */ - if (val > 0xffff) - return (FALSE); - - *usp = val; - - return (TRUE); -} - -static bool_t -xdrmem_enc_uint(XDR *xdrs, unsigned *up) -{ - BUILD_BUG_ON(sizeof (unsigned) != 4); - - return (xdrmem_enc_uint32(xdrs, *up)); -} - -static bool_t -xdrmem_dec_uint(XDR *xdrs, unsigned *up) -{ - BUILD_BUG_ON(sizeof (unsigned) != 4); - - return (xdrmem_dec_uint32(xdrs, (uint32_t *)up)); -} - -static bool_t -xdrmem_enc_ulonglong(XDR *xdrs, u_longlong_t *ullp) -{ - BUILD_BUG_ON(sizeof (u_longlong_t) != 8); - - if (!xdrmem_enc_uint32(xdrs, *ullp >> 32)) - return (FALSE); - - return (xdrmem_enc_uint32(xdrs, *ullp & 0xffffffff)); -} - -static bool_t -xdrmem_dec_ulonglong(XDR *xdrs, u_longlong_t *ullp) -{ - uint32_t low, high; - - BUILD_BUG_ON(sizeof (u_longlong_t) != 8); - - if (!xdrmem_dec_uint32(xdrs, &high)) - return (FALSE); - if (!xdrmem_dec_uint32(xdrs, &low)) - return (FALSE); - - *ullp = ((u_longlong_t)high << 32) | low; - - return (TRUE); -} - -static bool_t -xdr_enc_array(XDR *xdrs, caddr_t *arrp, uint_t *sizep, const uint_t maxsize, - const uint_t elsize, const xdrproc_t elproc) -{ - uint_t i; - caddr_t addr = *arrp; - - if (*sizep > maxsize || *sizep > UINT_MAX / elsize) - return (FALSE); - - if (!xdrmem_enc_uint(xdrs, sizep)) - return (FALSE); - - for (i = 0; i < *sizep; i++) { - if (!elproc(xdrs, addr)) - return (FALSE); - addr += elsize; - } - - return (TRUE); -} - -static bool_t -xdr_dec_array(XDR *xdrs, caddr_t *arrp, uint_t *sizep, const uint_t maxsize, - const uint_t elsize, const xdrproc_t elproc) -{ - uint_t i, size; - bool_t alloc = FALSE; - caddr_t addr; - - if (!xdrmem_dec_uint(xdrs, sizep)) - return (FALSE); - - size = *sizep; - - if (size > maxsize || size > UINT_MAX / elsize) - return (FALSE); - - /* - * The Solaris man page says: "If *arrp is NULL when decoding, - * xdr_array() allocates memory and *arrp points to it". - */ - if (*arrp == NULL) { - BUILD_BUG_ON(sizeof (uint_t) > sizeof (size_t)); - - *arrp = kmem_alloc(size * elsize, KM_NOSLEEP); - if (*arrp == NULL) - return (FALSE); - - alloc = TRUE; - } - - addr = *arrp; - - for (i = 0; i < size; i++) { - if (!elproc(xdrs, addr)) { - if (alloc) - kmem_free(*arrp, size * elsize); - return (FALSE); - } - addr += elsize; - } - - return (TRUE); -} - -static bool_t -xdr_enc_string(XDR *xdrs, char **sp, const uint_t maxsize) -{ - size_t slen = strlen(*sp); - uint_t len; - - if (slen > maxsize) - return (FALSE); - - len = slen; - - if (!xdrmem_enc_uint(xdrs, &len)) - return (FALSE); - - return (xdrmem_enc_bytes(xdrs, *sp, len)); -} - -static bool_t -xdr_dec_string(XDR *xdrs, char **sp, const uint_t maxsize) -{ - uint_t size; - bool_t alloc = FALSE; - - if (!xdrmem_dec_uint(xdrs, &size)) - return (FALSE); - - if (size > maxsize || size > UINT_MAX - 1) - return (FALSE); - - /* - * Solaris man page: "If *sp is NULL when decoding, xdr_string() - * allocates memory and *sp points to it". 
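That allocation behaviour matters for ownership: when the caller passes a NULL pointer the decoder allocates the string itself, and the caller must free it. A short sketch under that assumption; the helper name is hypothetical and the stream is assumed to be positioned at an encoded string.

static char *
example_xdr_read_string(XDR *xdrs, uint_t maxsize)
{
	char *s = NULL;

	if (!xdrs->x_ops->xdr_string(xdrs, &s, maxsize))
		return (NULL);

	/* allocated as strlen(s) + 1 bytes; free with kmem_free(s, strlen(s) + 1) */
	return (s);
}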
- */ - if (*sp == NULL) { - BUILD_BUG_ON(sizeof (uint_t) > sizeof (size_t)); - - *sp = kmem_alloc(size + 1, KM_NOSLEEP); - if (*sp == NULL) - return (FALSE); - - alloc = TRUE; - } - - if (!xdrmem_dec_bytes(xdrs, *sp, size)) - goto fail; - - if (memchr(*sp, 0, size) != NULL) - goto fail; - - (*sp)[size] = '\0'; - - return (TRUE); - -fail: - if (alloc) - kmem_free(*sp, size + 1); - - return (FALSE); -} - -static struct xdr_ops xdrmem_encode_ops = { - .xdr_control = xdrmem_control, - .xdr_char = xdrmem_enc_char, - .xdr_u_short = xdrmem_enc_ushort, - .xdr_u_int = xdrmem_enc_uint, - .xdr_u_longlong_t = xdrmem_enc_ulonglong, - .xdr_opaque = xdrmem_enc_bytes, - .xdr_string = xdr_enc_string, - .xdr_array = xdr_enc_array -}; - -static struct xdr_ops xdrmem_decode_ops = { - .xdr_control = xdrmem_control, - .xdr_char = xdrmem_dec_char, - .xdr_u_short = xdrmem_dec_ushort, - .xdr_u_int = xdrmem_dec_uint, - .xdr_u_longlong_t = xdrmem_dec_ulonglong, - .xdr_opaque = xdrmem_dec_bytes, - .xdr_string = xdr_dec_string, - .xdr_array = xdr_dec_array -}; diff --git a/module/spl/spl-zlib.c b/module/spl/spl-zlib.c deleted file mode 100644 index 62423343c..000000000 --- a/module/spl/spl-zlib.c +++ /dev/null @@ -1,217 +0,0 @@ -/* - * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. - * Copyright (C) 2007 The Regents of the University of California. - * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). - * Written by Brian Behlendorf <[email protected]>. - * UCRL-CODE-235197 - * - * This file is part of the SPL, Solaris Porting Layer. - * For details, see <http://zfsonlinux.org/>. - * - * The SPL is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the - * Free Software Foundation; either version 2 of the License, or (at your - * option) any later version. - * - * The SPL is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - * for more details. - * - * You should have received a copy of the GNU General Public License along - * with the SPL. If not, see <http://www.gnu.org/licenses/>. - * - * - * z_compress_level/z_uncompress are nearly identical copies of the - * compress2/uncompress functions provided by the official zlib package - * available at http://zlib.net/. The only changes made we to slightly - * adapt the functions called to match the linux kernel implementation - * of zlib. The full zlib license follows: - * - * zlib.h -- interface of the 'zlib' general purpose compression library - * version 1.2.5, April 19th, 2010 - * - * Copyright (C) 1995-2010 Jean-loup Gailly and Mark Adler - * - * This software is provided 'as-is', without any express or implied - * warranty. In no event will the authors be held liable for any damages - * arising from the use of this software. - * - * Permission is granted to anyone to use this software for any purpose, - * including commercial applications, and to alter it and redistribute it - * freely, subject to the following restrictions: - * - * 1. The origin of this software must not be misrepresented; you must not - * claim that you wrote the original software. If you use this software - * in a product, an acknowledgment in the product documentation would be - * appreciated but is not required. - * 2. 
Altered source versions must be plainly marked as such, and must not be - * misrepresented as being the original software. - * 3. This notice may not be removed or altered from any source distribution. - * - * Jean-loup Gailly - * Mark Adler - */ - - -#include <sys/kmem.h> -#include <sys/kmem_cache.h> -#include <sys/zmod.h> - -static spl_kmem_cache_t *zlib_workspace_cache; - -/* - * A kmem_cache is used for the zlib workspaces to avoid having to vmalloc - * and vfree for every call. Using a kmem_cache also has the advantage - * that improves the odds that the memory used will be local to this cpu. - * To further improve things it might be wise to create a dedicated per-cpu - * workspace for use. This would take some additional care because we then - * must disable preemption around the critical section, and verify that - * zlib_deflate* and zlib_inflate* never internally call schedule(). - */ -static void * -zlib_workspace_alloc(int flags) -{ - return (kmem_cache_alloc(zlib_workspace_cache, flags & ~(__GFP_FS))); -} - -static void -zlib_workspace_free(void *workspace) -{ - kmem_cache_free(zlib_workspace_cache, workspace); -} - -/* - * Compresses the source buffer into the destination buffer. The level - * parameter has the same meaning as in deflateInit. sourceLen is the byte - * length of the source buffer. Upon entry, destLen is the total size of the - * destination buffer, which must be at least 0.1% larger than sourceLen plus - * 12 bytes. Upon exit, destLen is the actual size of the compressed buffer. - * - * compress2 returns Z_OK if success, Z_MEM_ERROR if there was not enough - * memory, Z_BUF_ERROR if there was not enough room in the output buffer, - * Z_STREAM_ERROR if the level parameter is invalid. - */ -int -z_compress_level(void *dest, size_t *destLen, const void *source, - size_t sourceLen, int level) -{ - z_stream stream; - int err; - - stream.next_in = (Byte *)source; - stream.avail_in = (uInt)sourceLen; - stream.next_out = dest; - stream.avail_out = (uInt)*destLen; - - if ((size_t)stream.avail_out != *destLen) - return (Z_BUF_ERROR); - - stream.workspace = zlib_workspace_alloc(KM_SLEEP); - if (!stream.workspace) - return (Z_MEM_ERROR); - - err = zlib_deflateInit(&stream, level); - if (err != Z_OK) { - zlib_workspace_free(stream.workspace); - return (err); - } - - err = zlib_deflate(&stream, Z_FINISH); - if (err != Z_STREAM_END) { - zlib_deflateEnd(&stream); - zlib_workspace_free(stream.workspace); - return (err == Z_OK ? Z_BUF_ERROR : err); - } - *destLen = stream.total_out; - - err = zlib_deflateEnd(&stream); - zlib_workspace_free(stream.workspace); - - return (err); -} -EXPORT_SYMBOL(z_compress_level); - -/* - * Decompresses the source buffer into the destination buffer. sourceLen is - * the byte length of the source buffer. Upon entry, destLen is the total - * size of the destination buffer, which must be large enough to hold the - * entire uncompressed data. (The size of the uncompressed data must have - * been saved previously by the compressor and transmitted to the decompressor - * by some mechanism outside the scope of this compression library.) - * Upon exit, destLen is the actual size of the compressed buffer. - * This function can be used to decompress a whole file at once if the - * input file is mmap'ed. - * - * uncompress returns Z_OK if success, Z_MEM_ERROR if there was not - * enough memory, Z_BUF_ERROR if there was not enough room in the output - * buffer, or Z_DATA_ERROR if the input data was corrupted. 
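A hedged round-trip sketch of the two entry points documented above; the 0.1% + 12 byte output bound comes from the z_compress_level() comment, and the helper name and buffers are illustrative only.

#include <sys/vmem.h>
#include <sys/zmod.h>
#include <sys/debug.h>

static int
example_zlib_roundtrip(const void *src, size_t srclen, int level)
{
	size_t cbound = srclen + srclen / 1000 + 12;	/* worst-case compressed size */
	size_t clen = cbound;
	size_t dlen = srclen;
	void *cbuf = vmem_alloc(cbound, KM_SLEEP);
	void *dbuf = vmem_alloc(srclen, KM_SLEEP);
	int err;

	err = z_compress_level(cbuf, &clen, src, srclen, level);
	if (err == Z_OK)
		err = z_uncompress(dbuf, &dlen, cbuf, clen);
	if (err == Z_OK)
		ASSERT3U(dlen, ==, srclen);	/* round trip restores the length */

	vmem_free(dbuf, srclen);
	vmem_free(cbuf, cbound);

	return (err);
}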
- */ -int -z_uncompress(void *dest, size_t *destLen, const void *source, size_t sourceLen) -{ - z_stream stream; - int err; - - stream.next_in = (Byte *)source; - stream.avail_in = (uInt)sourceLen; - stream.next_out = dest; - stream.avail_out = (uInt)*destLen; - - if ((size_t)stream.avail_out != *destLen) - return (Z_BUF_ERROR); - - stream.workspace = zlib_workspace_alloc(KM_SLEEP); - if (!stream.workspace) - return (Z_MEM_ERROR); - - err = zlib_inflateInit(&stream); - if (err != Z_OK) { - zlib_workspace_free(stream.workspace); - return (err); - } - - err = zlib_inflate(&stream, Z_FINISH); - if (err != Z_STREAM_END) { - zlib_inflateEnd(&stream); - zlib_workspace_free(stream.workspace); - - if (err == Z_NEED_DICT || - (err == Z_BUF_ERROR && stream.avail_in == 0)) - return (Z_DATA_ERROR); - - return (err); - } - *destLen = stream.total_out; - - err = zlib_inflateEnd(&stream); - zlib_workspace_free(stream.workspace); - - return (err); -} -EXPORT_SYMBOL(z_uncompress); - -int -spl_zlib_init(void) -{ - int size; - - size = MAX(spl_zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL), - zlib_inflate_workspacesize()); - - zlib_workspace_cache = kmem_cache_create( - "spl_zlib_workspace_cache", - size, 0, NULL, NULL, NULL, NULL, NULL, - KMC_VMEM); - if (!zlib_workspace_cache) - return (1); - - return (0); -} - -void -spl_zlib_fini(void) -{ - kmem_cache_destroy(zlib_workspace_cache); - zlib_workspace_cache = NULL; -} |