Diffstat (limited to 'module/spl')
-rw-r--r-- | module/spl/THIRDPARTYLICENSE.gplv2 | 339
-rw-r--r-- | module/spl/THIRDPARTYLICENSE.gplv2.descrip | 1
-rw-r--r-- | module/spl/spl-atomic.c | 36
-rw-r--r-- | module/spl/spl-condvar.c | 410
-rw-r--r-- | module/spl/spl-cred.c | 200
-rw-r--r-- | module/spl/spl-err.c | 133
-rw-r--r-- | module/spl/spl-generic.c | 775
-rw-r--r-- | module/spl/spl-kmem-cache.c | 1769
-rw-r--r-- | module/spl/spl-kmem.c | 567
-rw-r--r-- | module/spl/spl-kobj.c | 86
-rw-r--r-- | module/spl/spl-kstat.c | 733
-rw-r--r-- | module/spl/spl-mutex.c | 30
-rw-r--r-- | module/spl/spl-proc.c | 782
-rw-r--r-- | module/spl/spl-rwlock.c | 114
-rw-r--r-- | module/spl/spl-taskq.c | 1305
-rw-r--r-- | module/spl/spl-thread.c | 160
-rw-r--r-- | module/spl/spl-tsd.c | 720
-rw-r--r-- | module/spl/spl-vmem.c | 135
-rw-r--r-- | module/spl/spl-vnode.c | 779
-rw-r--r-- | module/spl/spl-xdr.c | 515
-rw-r--r-- | module/spl/spl-zlib.c | 217
21 files changed, 9806 insertions, 0 deletions
diff --git a/module/spl/THIRDPARTYLICENSE.gplv2 b/module/spl/THIRDPARTYLICENSE.gplv2 new file mode 100644 index 000000000..d159169d1 --- /dev/null +++ b/module/spl/THIRDPARTYLICENSE.gplv2 @@ -0,0 +1,339 @@ + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Lesser General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. 
The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. 
+ +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. (This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. + +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. 
Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. 
If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + <one line to give the program's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) year name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 
+ This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. + + <signature of Ty Coon>, 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Lesser General +Public License instead of this License. diff --git a/module/spl/THIRDPARTYLICENSE.gplv2.descrip b/module/spl/THIRDPARTYLICENSE.gplv2.descrip new file mode 100644 index 000000000..78535a8ee --- /dev/null +++ b/module/spl/THIRDPARTYLICENSE.gplv2.descrip @@ -0,0 +1 @@ +COMPATIBILITY LAYER FOR OPENZFS ON LINUX diff --git a/module/spl/spl-atomic.c b/module/spl/spl-atomic.c new file mode 100644 index 000000000..47ed1886e --- /dev/null +++ b/module/spl/spl-atomic.c @@ -0,0 +1,36 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf <[email protected]>. + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see <http://zfsonlinux.org/>. + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see <http://www.gnu.org/licenses/>. + * + * Solaris Porting Layer (SPL) Atomic Implementation. + */ + +#include <sys/atomic.h> + +#ifdef ATOMIC_SPINLOCK +/* Global atomic lock declarations */ +DEFINE_SPINLOCK(atomic32_lock); +DEFINE_SPINLOCK(atomic64_lock); + +EXPORT_SYMBOL(atomic32_lock); +EXPORT_SYMBOL(atomic64_lock); +#endif /* ATOMIC_SPINLOCK */ diff --git a/module/spl/spl-condvar.c b/module/spl/spl-condvar.c new file mode 100644 index 000000000..f0060bbdc --- /dev/null +++ b/module/spl/spl-condvar.c @@ -0,0 +1,410 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf <[email protected]>. + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see <http://zfsonlinux.org/>. 
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Solaris Porting Layer (SPL) Condition Variables Implementation.
+ */
+
+#include <sys/condvar.h>
+#include <sys/time.h>
+#include <linux/hrtimer.h>
+
+void
+__cv_init(kcondvar_t *cvp, char *name, kcv_type_t type, void *arg)
+{
+	ASSERT(cvp);
+	ASSERT(name == NULL);
+	ASSERT(type == CV_DEFAULT);
+	ASSERT(arg == NULL);
+
+	cvp->cv_magic = CV_MAGIC;
+	init_waitqueue_head(&cvp->cv_event);
+	init_waitqueue_head(&cvp->cv_destroy);
+	atomic_set(&cvp->cv_waiters, 0);
+	atomic_set(&cvp->cv_refs, 1);
+	cvp->cv_mutex = NULL;
+}
+EXPORT_SYMBOL(__cv_init);
+
+static int
+cv_destroy_wakeup(kcondvar_t *cvp)
+{
+	if (!atomic_read(&cvp->cv_waiters) && !atomic_read(&cvp->cv_refs)) {
+		ASSERT(cvp->cv_mutex == NULL);
+		ASSERT(!waitqueue_active(&cvp->cv_event));
+		return (1);
+	}
+
+	return (0);
+}
+
+void
+__cv_destroy(kcondvar_t *cvp)
+{
+	ASSERT(cvp);
+	ASSERT(cvp->cv_magic == CV_MAGIC);
+
+	cvp->cv_magic = CV_DESTROY;
+	atomic_dec(&cvp->cv_refs);
+
+	/* Block until all waiters are woken and references dropped. */
+	while (cv_destroy_wakeup(cvp) == 0)
+		wait_event_timeout(cvp->cv_destroy, cv_destroy_wakeup(cvp), 1);
+
+	ASSERT3P(cvp->cv_mutex, ==, NULL);
+	ASSERT3S(atomic_read(&cvp->cv_refs), ==, 0);
+	ASSERT3S(atomic_read(&cvp->cv_waiters), ==, 0);
+	ASSERT3S(waitqueue_active(&cvp->cv_event), ==, 0);
+}
+EXPORT_SYMBOL(__cv_destroy);
+
+static void
+cv_wait_common(kcondvar_t *cvp, kmutex_t *mp, int state, int io)
+{
+	DEFINE_WAIT(wait);
+	kmutex_t *m;
+
+	ASSERT(cvp);
+	ASSERT(mp);
+	ASSERT(cvp->cv_magic == CV_MAGIC);
+	ASSERT(mutex_owned(mp));
+	atomic_inc(&cvp->cv_refs);
+
+	m = ACCESS_ONCE(cvp->cv_mutex);
+	if (!m)
+		m = xchg(&cvp->cv_mutex, mp);
+	/* Ensure the same mutex is used by all callers */
+	ASSERT(m == NULL || m == mp);
+
+	prepare_to_wait_exclusive(&cvp->cv_event, &wait, state);
+	atomic_inc(&cvp->cv_waiters);
+
+	/*
+	 * The mutex should be dropped after prepare_to_wait(); this
+	 * ensures we're linked in to the waiters list and avoids the
+	 * race where 'cvp->cv_waiters > 0' but the list is empty.
+	 */
+	mutex_exit(mp);
+	if (io)
+		io_schedule();
+	else
+		schedule();
+
+	/* No more waiters; a different mutex could now be used */
+	if (atomic_dec_and_test(&cvp->cv_waiters)) {
+		/*
+		 * This is set without any lock, so it's racy. But this is
+		 * just for debug anyway, so make it best-effort.
+		 */
+		cvp->cv_mutex = NULL;
+		wake_up(&cvp->cv_destroy);
+	}
+
+	finish_wait(&cvp->cv_event, &wait);
+	atomic_dec(&cvp->cv_refs);
+
+	/*
+	 * Hold the mutex only after we release the cvp; otherwise we could
+	 * deadlock with a thread holding the mutex and calling cv_destroy().
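+	 * Concretely: had the mutex been taken before our cv_refs
+	 * reference was dropped, a thread already holding the mutex
+	 * could call cv_destroy(), which blocks until cv_refs reaches
+	 * zero, while we in turn block in mutex_enter(); neither side
+	 * would make progress.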
+	 */
+	mutex_enter(mp);
+}
+
+void
+__cv_wait(kcondvar_t *cvp, kmutex_t *mp)
+{
+	cv_wait_common(cvp, mp, TASK_UNINTERRUPTIBLE, 0);
+}
+EXPORT_SYMBOL(__cv_wait);
+
+void
+__cv_wait_io(kcondvar_t *cvp, kmutex_t *mp)
+{
+	cv_wait_common(cvp, mp, TASK_UNINTERRUPTIBLE, 1);
+}
+EXPORT_SYMBOL(__cv_wait_io);
+
+void
+__cv_wait_sig(kcondvar_t *cvp, kmutex_t *mp)
+{
+	cv_wait_common(cvp, mp, TASK_INTERRUPTIBLE, 0);
+}
+EXPORT_SYMBOL(__cv_wait_sig);
+
+#if defined(HAVE_IO_SCHEDULE_TIMEOUT)
+#define	spl_io_schedule_timeout(t)	io_schedule_timeout(t)
+#else
+static void
+__cv_wakeup(unsigned long data)
+{
+	wake_up_process((struct task_struct *)data);
+}
+
+static long
+spl_io_schedule_timeout(long time_left)
+{
+	long expire_time = jiffies + time_left;
+	struct timer_list timer;
+
+	init_timer(&timer);
+	setup_timer(&timer, __cv_wakeup, (unsigned long)current);
+	timer.expires = expire_time;
+	add_timer(&timer);
+
+	io_schedule();
+
+	del_timer_sync(&timer);
+	time_left = expire_time - jiffies;
+
+	return (time_left < 0 ? 0 : time_left);
+}
+#endif
+
+/*
+ * 'expire_time' argument is an absolute wall clock time in jiffies.
+ * Return value is time left (expire_time - now) or -1 if timeout occurred.
+ */
+static clock_t
+__cv_timedwait_common(kcondvar_t *cvp, kmutex_t *mp, clock_t expire_time,
+    int state, int io)
+{
+	DEFINE_WAIT(wait);
+	kmutex_t *m;
+	clock_t time_left;
+
+	ASSERT(cvp);
+	ASSERT(mp);
+	ASSERT(cvp->cv_magic == CV_MAGIC);
+	ASSERT(mutex_owned(mp));
+
+	/* XXX - Does not handle jiffie wrap properly */
+	time_left = expire_time - jiffies;
+	if (time_left <= 0)
+		return (-1);
+
+	atomic_inc(&cvp->cv_refs);
+	m = ACCESS_ONCE(cvp->cv_mutex);
+	if (!m)
+		m = xchg(&cvp->cv_mutex, mp);
+	/* Ensure the same mutex is used by all callers */
+	ASSERT(m == NULL || m == mp);
+
+	prepare_to_wait_exclusive(&cvp->cv_event, &wait, state);
+	atomic_inc(&cvp->cv_waiters);
+
+	/*
+	 * The mutex should be dropped after prepare_to_wait(); this
+	 * ensures we're linked in to the waiters list and avoids the
+	 * race where 'cvp->cv_waiters > 0' but the list is empty.
+	 */
+	mutex_exit(mp);
+	if (io)
+		time_left = spl_io_schedule_timeout(time_left);
+	else
+		time_left = schedule_timeout(time_left);
+
+	/* No more waiters; a different mutex could now be used */
+	if (atomic_dec_and_test(&cvp->cv_waiters)) {
+		/*
+		 * This is set without any lock, so it's racy. But this is
+		 * just for debug anyway, so make it best-effort.
+		 */
+		cvp->cv_mutex = NULL;
+		wake_up(&cvp->cv_destroy);
+	}
+
+	finish_wait(&cvp->cv_event, &wait);
+	atomic_dec(&cvp->cv_refs);
+
+	/*
+	 * Hold the mutex only after we release the cvp; otherwise we could
+	 * deadlock with a thread holding the mutex and calling cv_destroy().
+	 */
+	mutex_enter(mp);
+	return (time_left > 0 ? time_left : -1);
+}
+
+clock_t
+__cv_timedwait(kcondvar_t *cvp, kmutex_t *mp, clock_t exp_time)
+{
+	return (__cv_timedwait_common(cvp, mp, exp_time,
+	    TASK_UNINTERRUPTIBLE, 0));
+}
+EXPORT_SYMBOL(__cv_timedwait);
+
+clock_t
+__cv_timedwait_io(kcondvar_t *cvp, kmutex_t *mp, clock_t exp_time)
+{
+	return (__cv_timedwait_common(cvp, mp, exp_time,
+	    TASK_UNINTERRUPTIBLE, 1));
+}
+EXPORT_SYMBOL(__cv_timedwait_io);
+
+clock_t
+__cv_timedwait_sig(kcondvar_t *cvp, kmutex_t *mp, clock_t exp_time)
+{
+	return (__cv_timedwait_common(cvp, mp, exp_time,
+	    TASK_INTERRUPTIBLE, 0));
+}
+EXPORT_SYMBOL(__cv_timedwait_sig);
+
+/*
+ * 'expire_time' argument is an absolute clock time in nanoseconds.
+ * Return value is time left (expire_time - now) or -1 if timeout occurred.
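+ *
+ * For example, a caller of the cv_timedwait_hires() wrapper below that
+ * bounds a wait at one second of wall clock time would pass an absolute
+ * deadline (a sketch only; 'cv' and 'lock' are hypothetical):
+ *
+ *	hrtime_t deadline = gethrtime() + NANOSEC;
+ *	(void) cv_timedwait_hires(&cv, &lock, deadline, 0,
+ *	    CALLOUT_FLAG_ABSOLUTE);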
+ */
+static clock_t
+__cv_timedwait_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t expire_time,
+    int state)
+{
+	DEFINE_WAIT(wait);
+	kmutex_t *m;
+	hrtime_t time_left;
+	ktime_t ktime_left;
+
+	ASSERT(cvp);
+	ASSERT(mp);
+	ASSERT(cvp->cv_magic == CV_MAGIC);
+	ASSERT(mutex_owned(mp));
+
+	time_left = expire_time - gethrtime();
+	if (time_left <= 0)
+		return (-1);
+
+	atomic_inc(&cvp->cv_refs);
+	m = ACCESS_ONCE(cvp->cv_mutex);
+	if (!m)
+		m = xchg(&cvp->cv_mutex, mp);
+	/* Ensure the same mutex is used by all callers */
+	ASSERT(m == NULL || m == mp);
+
+	prepare_to_wait_exclusive(&cvp->cv_event, &wait, state);
+	atomic_inc(&cvp->cv_waiters);
+
+	/*
+	 * The mutex should be dropped after prepare_to_wait(); this
+	 * ensures we're linked in to the waiters list and avoids the
+	 * race where 'cvp->cv_waiters > 0' but the list is empty.
+	 */
+	mutex_exit(mp);
+	/*
+	 * Allow a 100 us range to give the kernel an opportunity to
+	 * coalesce interrupts.
+	 */
+	ktime_left = ktime_set(0, time_left);
+	schedule_hrtimeout_range(&ktime_left, 100 * NSEC_PER_USEC,
+	    HRTIMER_MODE_REL);
+
+	/* No more waiters; a different mutex could now be used */
+	if (atomic_dec_and_test(&cvp->cv_waiters)) {
+		/*
+		 * This is set without any lock, so it's racy. But this is
+		 * just for debug anyway, so make it best-effort.
+		 */
+		cvp->cv_mutex = NULL;
+		wake_up(&cvp->cv_destroy);
+	}
+
+	finish_wait(&cvp->cv_event, &wait);
+	atomic_dec(&cvp->cv_refs);
+
+	mutex_enter(mp);
+	time_left = expire_time - gethrtime();
+	return (time_left > 0 ? NSEC_TO_TICK(time_left) : -1);
+}
+
+/*
+ * Compatibility wrapper for the cv_timedwait_hires() Illumos interface.
+ */
+static clock_t
+cv_timedwait_hires_common(kcondvar_t *cvp, kmutex_t *mp, hrtime_t tim,
+    hrtime_t res, int flag, int state)
+{
+	if (res > 1) {
+		/*
+		 * Align expiration to the specified resolution.
+		 */
+		if (flag & CALLOUT_FLAG_ROUNDUP)
+			tim += res - 1;
+		tim = (tim / res) * res;
+	}
+
+	if (!(flag & CALLOUT_FLAG_ABSOLUTE))
+		tim += gethrtime();
+
+	return (__cv_timedwait_hires(cvp, mp, tim, state));
+}
+
+clock_t
+cv_timedwait_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t tim, hrtime_t res,
+    int flag)
+{
+	return (cv_timedwait_hires_common(cvp, mp, tim, res, flag,
+	    TASK_UNINTERRUPTIBLE));
+}
+EXPORT_SYMBOL(cv_timedwait_hires);
+
+clock_t
+cv_timedwait_sig_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t tim,
+    hrtime_t res, int flag)
+{
+	return (cv_timedwait_hires_common(cvp, mp, tim, res, flag,
+	    TASK_INTERRUPTIBLE));
+}
+EXPORT_SYMBOL(cv_timedwait_sig_hires);
+
+void
+__cv_signal(kcondvar_t *cvp)
+{
+	ASSERT(cvp);
+	ASSERT(cvp->cv_magic == CV_MAGIC);
+	atomic_inc(&cvp->cv_refs);
+
+	/*
+	 * All waiters are added with WQ_FLAG_EXCLUSIVE so only one
+	 * waiter will be set runnable with each call to wake_up().
+	 * Additionally wake_up() holds a spin_lock associated with
+	 * the wait queue to ensure we don't race waking up processes.
+	 */
+	if (atomic_read(&cvp->cv_waiters) > 0)
+		wake_up(&cvp->cv_event);
+
+	atomic_dec(&cvp->cv_refs);
+}
+EXPORT_SYMBOL(__cv_signal);
+
+void
+__cv_broadcast(kcondvar_t *cvp)
+{
+	ASSERT(cvp);
+	ASSERT(cvp->cv_magic == CV_MAGIC);
+	atomic_inc(&cvp->cv_refs);
+
+	/*
+	 * wake_up_all() will wake up all waiters even those which
+	 * have the WQ_FLAG_EXCLUSIVE flag set.
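+	 *
+	 * A typical producer publishes its state change under the mutex
+	 * before broadcasting (a sketch; 'lock', 'cv' and the flag are
+	 * hypothetical):
+	 *
+	 *	mutex_enter(&lock);
+	 *	shutting_down = B_TRUE;
+	 *	cv_broadcast(&cv);
+	 *	mutex_exit(&lock);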
+	 */
+	if (atomic_read(&cvp->cv_waiters) > 0)
+		wake_up_all(&cvp->cv_event);
+
+	atomic_dec(&cvp->cv_refs);
+}
+EXPORT_SYMBOL(__cv_broadcast);
diff --git a/module/spl/spl-cred.c b/module/spl/spl-cred.c
new file mode 100644
index 000000000..ea3e903f9
--- /dev/null
+++ b/module/spl/spl-cred.c
@@ -0,0 +1,200 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <[email protected]>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Solaris Porting Layer (SPL) Credential Implementation.
+ */
+
+#include <sys/cred.h>
+
+static int
+#ifdef HAVE_KUIDGID_T
+cr_groups_search(const struct group_info *group_info, kgid_t grp)
+#else
+cr_groups_search(const struct group_info *group_info, gid_t grp)
+#endif
+{
+	unsigned int left, right, mid;
+	int cmp;
+
+	if (!group_info)
+		return (0);
+
+	left = 0;
+	right = group_info->ngroups;
+	while (left < right) {
+		mid = (left + right) / 2;
+		cmp = KGID_TO_SGID(grp) -
+		    KGID_TO_SGID(GROUP_AT(group_info, mid));
+
+		if (cmp > 0)
+			left = mid + 1;
+		else if (cmp < 0)
+			right = mid;
+		else
+			return (1);
+	}
+	return (0);
+}
+
+/* Hold a reference on the credential */
+void
+crhold(cred_t *cr)
+{
+	(void) get_cred((const cred_t *)cr);
+}
+
+/* Release a reference on the credential */
+void
+crfree(cred_t *cr)
+{
+	put_cred((const cred_t *)cr);
+}
+
+/* Return the number of supplemental groups */
+int
+crgetngroups(const cred_t *cr)
+{
+	struct group_info *gi;
+	int rc;
+
+	gi = cr->group_info;
+	rc = gi->ngroups;
+#ifndef HAVE_GROUP_INFO_GID
+	/*
+	 * For Linux <= 4.8, crgetgroups() will only return gi->blocks[0],
+	 * which contains only the first NGROUPS_PER_BLOCK groups.
+	 */
+	if (rc > NGROUPS_PER_BLOCK) {
+		WARN_ON_ONCE(1);
+		rc = NGROUPS_PER_BLOCK;
+	}
+#endif
+	return (rc);
+}
+
+/*
+ * Return an array of supplemental gids. The returned address is safe
+ * to use as long as the caller has taken a reference with crhold().
+ *
+ * Linux 4.9 API change: group_info changed from a 2d array via ->blocks
+ * to a 1d array via ->gid.
+ */
+gid_t *
+crgetgroups(const cred_t *cr)
+{
+	struct group_info *gi;
+	gid_t *gids = NULL;
+
+	gi = cr->group_info;
+#ifdef HAVE_GROUP_INFO_GID
+	gids = KGIDP_TO_SGIDP(gi->gid);
+#else
+	if (gi->nblocks > 0)
+		gids = KGIDP_TO_SGIDP(gi->blocks[0]);
+#endif
+	return (gids);
+}
+
+/*
+ * Check if the passed gid is available in the supplied credential.
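+ *
+ * For example, an access check against the caller's credential (a
+ * sketch; the gid and the error handling are hypothetical):
+ *
+ *	if (!groupmember(gid, CRED()))
+ *		return (EPERM);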
+ */
+int
+groupmember(gid_t gid, const cred_t *cr)
+{
+	struct group_info *gi;
+	int rc;
+
+	gi = cr->group_info;
+	rc = cr_groups_search(gi, SGID_TO_KGID(gid));
+
+	return (rc);
+}
+
+/* Return the effective user id */
+uid_t
+crgetuid(const cred_t *cr)
+{
+	return (KUID_TO_SUID(cr->euid));
+}
+
+/* Return the real user id */
+uid_t
+crgetruid(const cred_t *cr)
+{
+	return (KUID_TO_SUID(cr->uid));
+}
+
+/* Return the saved user id */
+uid_t
+crgetsuid(const cred_t *cr)
+{
+	return (KUID_TO_SUID(cr->suid));
+}
+
+/* Return the filesystem user id */
+uid_t
+crgetfsuid(const cred_t *cr)
+{
+	return (KUID_TO_SUID(cr->fsuid));
+}
+
+/* Return the effective group id */
+gid_t
+crgetgid(const cred_t *cr)
+{
+	return (KGID_TO_SGID(cr->egid));
+}
+
+/* Return the real group id */
+gid_t
+crgetrgid(const cred_t *cr)
+{
+	return (KGID_TO_SGID(cr->gid));
+}
+
+/* Return the saved group id */
+gid_t
+crgetsgid(const cred_t *cr)
+{
+	return (KGID_TO_SGID(cr->sgid));
+}
+
+/* Return the filesystem group id */
+gid_t
+crgetfsgid(const cred_t *cr)
+{
+	return (KGID_TO_SGID(cr->fsgid));
+}
+
+EXPORT_SYMBOL(crhold);
+EXPORT_SYMBOL(crfree);
+EXPORT_SYMBOL(crgetuid);
+EXPORT_SYMBOL(crgetruid);
+EXPORT_SYMBOL(crgetsuid);
+EXPORT_SYMBOL(crgetfsuid);
+EXPORT_SYMBOL(crgetgid);
+EXPORT_SYMBOL(crgetrgid);
+EXPORT_SYMBOL(crgetsgid);
+EXPORT_SYMBOL(crgetfsgid);
+EXPORT_SYMBOL(crgetngroups);
+EXPORT_SYMBOL(crgetgroups);
+EXPORT_SYMBOL(groupmember);
diff --git a/module/spl/spl-err.c b/module/spl/spl-err.c
new file mode 100644
index 000000000..6b71296e8
--- /dev/null
+++ b/module/spl/spl-err.c
@@ -0,0 +1,133 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <[email protected]>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Solaris Porting Layer (SPL) Error Implementation.
+ */
+
+#include <sys/sysmacros.h>
+#include <sys/cmn_err.h>
+#include <linux/ratelimit.h>
+
+/*
+ * It is often useful to actually have the panic crash the node so you
+ * can then get notified of the event, get the crashdump for later
+ * analysis and other such goodies. By default, however, we keep the
+ * existing behavior and do not panic the node.
+ */
+/* BEGIN CSTYLED */
+unsigned int spl_panic_halt;
+module_param(spl_panic_halt, uint, 0644);
+MODULE_PARM_DESC(spl_panic_halt, "Cause kernel panic on assertion failures");
+/* END CSTYLED */
+
+/*
+ * Limit the number of stack traces dumped to not more than 5 every
+ * 60 seconds to prevent denial-of-service attacks from debug code.
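+ *
+ * DEFINE_RATELIMIT_STATE(name, interval, burst) permits at most 'burst'
+ * events in any 'interval' jiffies, so the (60 * HZ, 5) state below
+ * allows no more than five dumps in a 60 second window.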
+ */ +DEFINE_RATELIMIT_STATE(dumpstack_ratelimit_state, 60 * HZ, 5); + +void +spl_dumpstack(void) +{ + if (__ratelimit(&dumpstack_ratelimit_state)) { + printk("Showing stack for process %d\n", current->pid); + dump_stack(); + } +} +EXPORT_SYMBOL(spl_dumpstack); + +int +spl_panic(const char *file, const char *func, int line, const char *fmt, ...) +{ + const char *newfile; + char msg[MAXMSGLEN]; + va_list ap; + + newfile = strrchr(file, '/'); + if (newfile != NULL) + newfile = newfile + 1; + else + newfile = file; + + va_start(ap, fmt); + (void) vsnprintf(msg, sizeof (msg), fmt, ap); + va_end(ap); + + printk(KERN_EMERG "%s", msg); + printk(KERN_EMERG "PANIC at %s:%d:%s()\n", newfile, line, func); + if (spl_panic_halt) + panic("%s", msg); + + spl_dumpstack(); + + /* Halt the thread to facilitate further debugging */ + set_current_state(TASK_UNINTERRUPTIBLE); + while (1) + schedule(); + + /* Unreachable */ + return (1); +} +EXPORT_SYMBOL(spl_panic); + +void +vcmn_err(int ce, const char *fmt, va_list ap) +{ + char msg[MAXMSGLEN]; + + vsnprintf(msg, MAXMSGLEN - 1, fmt, ap); + + switch (ce) { + case CE_IGNORE: + break; + case CE_CONT: + printk("%s", msg); + break; + case CE_NOTE: + printk(KERN_NOTICE "NOTICE: %s\n", msg); + break; + case CE_WARN: + printk(KERN_WARNING "WARNING: %s\n", msg); + break; + case CE_PANIC: + printk(KERN_EMERG "PANIC: %s\n", msg); + spl_dumpstack(); + + /* Halt the thread to facilitate further debugging */ + set_current_state(TASK_UNINTERRUPTIBLE); + while (1) + schedule(); + } +} /* vcmn_err() */ +EXPORT_SYMBOL(vcmn_err); + +void +cmn_err(int ce, const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + vcmn_err(ce, fmt, ap); + va_end(ap); +} /* cmn_err() */ +EXPORT_SYMBOL(cmn_err); diff --git a/module/spl/spl-generic.c b/module/spl/spl-generic.c new file mode 100644 index 000000000..b38fe254c --- /dev/null +++ b/module/spl/spl-generic.c @@ -0,0 +1,775 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf <[email protected]>. + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see <http://zfsonlinux.org/>. + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see <http://www.gnu.org/licenses/>. + * + * Solaris Porting Layer (SPL) Generic Implementation. 
+ */ + +#include <sys/sysmacros.h> +#include <sys/systeminfo.h> +#include <sys/vmsystm.h> +#include <sys/kobj.h> +#include <sys/kmem.h> +#include <sys/kmem_cache.h> +#include <sys/vmem.h> +#include <sys/mutex.h> +#include <sys/rwlock.h> +#include <sys/taskq.h> +#include <sys/tsd.h> +#include <sys/zmod.h> +#include <sys/debug.h> +#include <sys/proc.h> +#include <sys/kstat.h> +#include <sys/file.h> +#include <linux/ctype.h> +#include <sys/disp.h> +#include <sys/random.h> +#include <sys/strings.h> +#include <linux/kmod.h> + +char spl_version[32] = "SPL v" SPL_META_VERSION "-" SPL_META_RELEASE; +EXPORT_SYMBOL(spl_version); + +/* BEGIN CSTYLED */ +unsigned long spl_hostid = 0; +EXPORT_SYMBOL(spl_hostid); +module_param(spl_hostid, ulong, 0644); +MODULE_PARM_DESC(spl_hostid, "The system hostid."); +/* END CSTYLED */ + +proc_t p0; +EXPORT_SYMBOL(p0); + +/* + * Xorshift Pseudo Random Number Generator based on work by Sebastiano Vigna + * + * "Further scramblings of Marsaglia's xorshift generators" + * http://vigna.di.unimi.it/ftp/papers/xorshiftplus.pdf + * + * random_get_pseudo_bytes() is an API function on Illumos whose sole purpose + * is to provide bytes containing random numbers. It is mapped to /dev/urandom + * on Illumos, which uses a "FIPS 186-2 algorithm". No user of the SPL's + * random_get_pseudo_bytes() needs bytes that are of cryptographic quality, so + * we can implement it using a fast PRNG that we seed using Linux' actual + * equivalent to random_get_pseudo_bytes(). We do this by providing each CPU + * with an independent seed so that all calls to random_get_pseudo_bytes() are + * free of atomic instructions. + * + * A consequence of using a fast PRNG is that using random_get_pseudo_bytes() + * to generate words larger than 128 bits will paradoxically be limited to + * `2^128 - 1` possibilities. This is because we have a sequence of `2^128 - 1` + * 128-bit words and selecting the first will implicitly select the second. If + * a caller finds this behavior undesireable, random_get_bytes() should be used + * instead. + * + * XXX: Linux interrupt handlers that trigger within the critical section + * formed by `s[1] = xp[1];` and `xp[0] = s[0];` and call this function will + * see the same numbers. Nothing in the code currently calls this in an + * interrupt handler, so this is considered to be okay. If that becomes a + * problem, we could create a set of per-cpu variables for interrupt handlers + * and use them when in_interrupt() from linux/preempt_mask.h evaluates to + * true. 
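+ *
+ * Example caller (a sketch only; the variable is hypothetical):
+ *
+ *	uint64_t nonce;
+ *	(void) random_get_pseudo_bytes((uint8_t *)&nonce, sizeof (nonce));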
+ */ +static DEFINE_PER_CPU(uint64_t[2], spl_pseudo_entropy); + +/* + * spl_rand_next()/spl_rand_jump() are copied from the following CC-0 licensed + * file: + * + * http://xorshift.di.unimi.it/xorshift128plus.c + */ + +static inline uint64_t +spl_rand_next(uint64_t *s) +{ + uint64_t s1 = s[0]; + const uint64_t s0 = s[1]; + s[0] = s0; + s1 ^= s1 << 23; // a + s[1] = s1 ^ s0 ^ (s1 >> 18) ^ (s0 >> 5); // b, c + return (s[1] + s0); +} + +static inline void +spl_rand_jump(uint64_t *s) +{ + static const uint64_t JUMP[] = + { 0x8a5cd789635d2dff, 0x121fd2155c472f96 }; + + uint64_t s0 = 0; + uint64_t s1 = 0; + int i, b; + for (i = 0; i < sizeof (JUMP) / sizeof (*JUMP); i++) + for (b = 0; b < 64; b++) { + if (JUMP[i] & 1ULL << b) { + s0 ^= s[0]; + s1 ^= s[1]; + } + (void) spl_rand_next(s); + } + + s[0] = s0; + s[1] = s1; +} + +int +random_get_pseudo_bytes(uint8_t *ptr, size_t len) +{ + uint64_t *xp, s[2]; + + ASSERT(ptr); + + xp = get_cpu_var(spl_pseudo_entropy); + + s[0] = xp[0]; + s[1] = xp[1]; + + while (len) { + union { + uint64_t ui64; + uint8_t byte[sizeof (uint64_t)]; + }entropy; + int i = MIN(len, sizeof (uint64_t)); + + len -= i; + entropy.ui64 = spl_rand_next(s); + + while (i--) + *ptr++ = entropy.byte[i]; + } + + xp[0] = s[0]; + xp[1] = s[1]; + + put_cpu_var(spl_pseudo_entropy); + + return (0); +} + + +EXPORT_SYMBOL(random_get_pseudo_bytes); + +#if BITS_PER_LONG == 32 +/* + * Support 64/64 => 64 division on a 32-bit platform. While the kernel + * provides a div64_u64() function for this we do not use it because the + * implementation is flawed. There are cases which return incorrect + * results as late as linux-2.6.35. Until this is fixed upstream the + * spl must provide its own implementation. + * + * This implementation is a slightly modified version of the algorithm + * proposed by the book 'Hacker's Delight'. The original source can be + * found here and is available for use without restriction. + * + * http://www.hackersdelight.org/HDcode/newCode/divDouble.c + */ + +/* + * Calculate number of leading of zeros for a 64-bit value. + */ +static int +nlz64(uint64_t x) +{ + register int n = 0; + + if (x == 0) + return (64); + + if (x <= 0x00000000FFFFFFFFULL) { n = n + 32; x = x << 32; } + if (x <= 0x0000FFFFFFFFFFFFULL) { n = n + 16; x = x << 16; } + if (x <= 0x00FFFFFFFFFFFFFFULL) { n = n + 8; x = x << 8; } + if (x <= 0x0FFFFFFFFFFFFFFFULL) { n = n + 4; x = x << 4; } + if (x <= 0x3FFFFFFFFFFFFFFFULL) { n = n + 2; x = x << 2; } + if (x <= 0x7FFFFFFFFFFFFFFFULL) { n = n + 1; } + + return (n); +} + +/* + * Newer kernels have a div_u64() function but we define our own + * to simplify portibility between kernel versions. + */ +static inline uint64_t +__div_u64(uint64_t u, uint32_t v) +{ + (void) do_div(u, v); + return (u); +} + +/* + * Implementation of 64-bit unsigned division for 32-bit machines. + * + * First the procedure takes care of the case in which the divisor is a + * 32-bit quantity. There are two subcases: (1) If the left half of the + * dividend is less than the divisor, one execution of do_div() is all that + * is required (overflow is not possible). (2) Otherwise it does two + * divisions, using the grade school method. + */ +uint64_t +__udivdi3(uint64_t u, uint64_t v) +{ + uint64_t u0, u1, v1, q0, q1, k; + int n; + + if (v >> 32 == 0) { // If v < 2**32: + if (u >> 32 < v) { // If u/v cannot overflow, + return (__div_u64(u, v)); // just do one division. + } else { // If u/v would overflow: + u1 = u >> 32; // Break u into two halves. 
+ u0 = u & 0xFFFFFFFF; + q1 = __div_u64(u1, v); // First quotient digit. + k = u1 - q1 * v; // First remainder, < v. + u0 += (k << 32); + q0 = __div_u64(u0, v); // Seconds quotient digit. + return ((q1 << 32) + q0); + } + } else { // If v >= 2**32: + n = nlz64(v); // 0 <= n <= 31. + v1 = (v << n) >> 32; // Normalize divisor, MSB is 1. + u1 = u >> 1; // To ensure no overflow. + q1 = __div_u64(u1, v1); // Get quotient from + q0 = (q1 << n) >> 31; // Undo normalization and + // division of u by 2. + if (q0 != 0) // Make q0 correct or + q0 = q0 - 1; // too small by 1. + if ((u - q0 * v) >= v) + q0 = q0 + 1; // Now q0 is correct. + + return (q0); + } +} +EXPORT_SYMBOL(__udivdi3); + +/* BEGIN CSTYLED */ +#ifndef abs64 +#define abs64(x) ({ uint64_t t = (x) >> 63; ((x) ^ t) - t; }) +#endif +/* END CSTYLED */ + +/* + * Implementation of 64-bit signed division for 32-bit machines. + */ +int64_t +__divdi3(int64_t u, int64_t v) +{ + int64_t q, t; + q = __udivdi3(abs64(u), abs64(v)); + t = (u ^ v) >> 63; // If u, v have different + return ((q ^ t) - t); // signs, negate q. +} +EXPORT_SYMBOL(__divdi3); + +/* + * Implementation of 64-bit unsigned modulo for 32-bit machines. + */ +uint64_t +__umoddi3(uint64_t dividend, uint64_t divisor) +{ + return (dividend - (divisor * __udivdi3(dividend, divisor))); +} +EXPORT_SYMBOL(__umoddi3); + +/* + * Implementation of 64-bit unsigned division/modulo for 32-bit machines. + */ +uint64_t +__udivmoddi4(uint64_t n, uint64_t d, uint64_t *r) +{ + uint64_t q = __udivdi3(n, d); + if (r) + *r = n - d * q; + return (q); +} +EXPORT_SYMBOL(__udivmoddi4); + +/* + * Implementation of 64-bit signed division/modulo for 32-bit machines. + */ +int64_t +__divmoddi4(int64_t n, int64_t d, int64_t *r) +{ + int64_t q, rr; + boolean_t nn = B_FALSE; + boolean_t nd = B_FALSE; + if (n < 0) { + nn = B_TRUE; + n = -n; + } + if (d < 0) { + nd = B_TRUE; + d = -d; + } + + q = __udivmoddi4(n, d, (uint64_t *)&rr); + + if (nn != nd) + q = -q; + if (nn) + rr = -rr; + if (r) + *r = rr; + return (q); +} +EXPORT_SYMBOL(__divmoddi4); + +#if defined(__arm) || defined(__arm__) +/* + * Implementation of 64-bit (un)signed division for 32-bit arm machines. + * + * Run-time ABI for the ARM Architecture (page 20). A pair of (unsigned) + * long longs is returned in {{r0, r1}, {r2,r3}}, the quotient in {r0, r1}, + * and the remainder in {r2, r3}. The return type is specifically left + * set to 'void' to ensure the compiler does not overwrite these registers + * during the return. 
All results are in registers as per the ABI.
+ */
+void
+__aeabi_uldivmod(uint64_t u, uint64_t v)
+{
+	uint64_t res;
+	uint64_t mod;
+
+	res = __udivdi3(u, v);
+	mod = __umoddi3(u, v);
+	{
+		register uint32_t r0 asm("r0") = (res & 0xFFFFFFFF);
+		register uint32_t r1 asm("r1") = (res >> 32);
+		register uint32_t r2 asm("r2") = (mod & 0xFFFFFFFF);
+		register uint32_t r3 asm("r3") = (mod >> 32);
+
+		/* BEGIN CSTYLED */
+		asm volatile(""
+		    : "+r"(r0), "+r"(r1), "+r"(r2), "+r"(r3)	/* output */
+		    : "r"(r0), "r"(r1), "r"(r2), "r"(r3));	/* input */
+		/* END CSTYLED */
+
+		return;	/* r0; */
+	}
+}
+EXPORT_SYMBOL(__aeabi_uldivmod);
+
+void
+__aeabi_ldivmod(int64_t u, int64_t v)
+{
+	int64_t res;
+	uint64_t mod;
+
+	res = __divdi3(u, v);
+	mod = __umoddi3(u, v);
+	{
+		register uint32_t r0 asm("r0") = (res & 0xFFFFFFFF);
+		register uint32_t r1 asm("r1") = (res >> 32);
+		register uint32_t r2 asm("r2") = (mod & 0xFFFFFFFF);
+		register uint32_t r3 asm("r3") = (mod >> 32);
+
+		/* BEGIN CSTYLED */
+		asm volatile(""
+		    : "+r"(r0), "+r"(r1), "+r"(r2), "+r"(r3)	/* output */
+		    : "r"(r0), "r"(r1), "r"(r2), "r"(r3));	/* input */
+		/* END CSTYLED */
+
+		return;	/* r0; */
+	}
+}
+EXPORT_SYMBOL(__aeabi_ldivmod);
+#endif /* __arm || __arm__ */
+#endif /* BITS_PER_LONG */
+
+/*
+ * NOTE: The strtoxx behavior is solely based on my reading of the Solaris
+ * ddi_strtol(9F) man page. I have not verified the behavior of these
+ * functions against their Solaris counterparts. It is possible that I
+ * may have misinterpreted the man page or the man page is incorrect.
+ */
+int ddi_strtoul(const char *, char **, int, unsigned long *);
+int ddi_strtol(const char *, char **, int, long *);
+int ddi_strtoull(const char *, char **, int, unsigned long long *);
+int ddi_strtoll(const char *, char **, int, long long *);
+
+#define define_ddi_strtoux(type, valtype) \
+int ddi_strtou##type(const char *str, char **endptr, \
+    int base, valtype *result) \
+{ \
+	valtype last_value, value = 0; \
+	char *ptr = (char *)str; \
+	int flag = 1, digit; \
+ \
+	if (strlen(ptr) == 0) \
+		return (EINVAL); \
+ \
+	/* Auto-detect base based on prefix */ \
+	if (!base) { \
+		if (str[0] == '0') { \
+			if (tolower(str[1]) == 'x' && isxdigit(str[2])) { \
+				base = 16; /* hex */ \
+				ptr += 2; \
+			} else if (str[1] >= '0' && str[1] < '8') { \
+				base = 8; /* octal */ \
+				ptr += 1; \
+			} else { \
+				return (EINVAL); \
+			} \
+		} else { \
+			base = 10; /* decimal */ \
+		} \
+	} \
+ \
+	while (1) { \
+		if (isdigit(*ptr)) \
+			digit = *ptr - '0'; \
+		else if (isalpha(*ptr)) \
+			digit = tolower(*ptr) - 'a' + 10; \
+		else \
+			break; \
+ \
+		if (digit >= base) \
+			break; \
+ \
+		last_value = value; \
+		value = value * base + digit; \
+		if (last_value > value) /* Overflow */ \
+			return (ERANGE); \
+ \
+		flag = 1; \
+		ptr++; \
+	} \
+ \
+	if (flag) \
+		*result = value; \
+ \
+	if (endptr) \
+		*endptr = (char *)(flag ?
ptr : str); \ + \ + return (0); \ +} \ + +#define define_ddi_strtox(type, valtype) \ +int ddi_strto##type(const char *str, char **endptr, \ + int base, valtype *result) \ +{ \ + int rc; \ + \ + if (*str == '-') { \ + rc = ddi_strtou##type(str + 1, endptr, base, result); \ + if (!rc) { \ + if (*endptr == str + 1) \ + *endptr = (char *)str; \ + else \ + *result = -*result; \ + } \ + } else { \ + rc = ddi_strtou##type(str, endptr, base, result); \ + } \ + \ + return (rc); \ +} + +define_ddi_strtoux(l, unsigned long) +define_ddi_strtox(l, long) +define_ddi_strtoux(ll, unsigned long long) +define_ddi_strtox(ll, long long) + +EXPORT_SYMBOL(ddi_strtoul); +EXPORT_SYMBOL(ddi_strtol); +EXPORT_SYMBOL(ddi_strtoll); +EXPORT_SYMBOL(ddi_strtoull); + +int +ddi_copyin(const void *from, void *to, size_t len, int flags) +{ + /* Fake ioctl() issued by kernel, 'from' is a kernel address */ + if (flags & FKIOCTL) { + memcpy(to, from, len); + return (0); + } + + return (copyin(from, to, len)); +} +EXPORT_SYMBOL(ddi_copyin); + +int +ddi_copyout(const void *from, void *to, size_t len, int flags) +{ + /* Fake ioctl() issued by kernel, 'from' is a kernel address */ + if (flags & FKIOCTL) { + memcpy(to, from, len); + return (0); + } + + return (copyout(from, to, len)); +} +EXPORT_SYMBOL(ddi_copyout); + +/* + * Read the unique system identifier from the /etc/hostid file. + * + * The behavior of /usr/bin/hostid on Linux systems with the + * regular eglibc and coreutils is: + * + * 1. Generate the value if the /etc/hostid file does not exist + * or if the /etc/hostid file is less than four bytes in size. + * + * 2. If the /etc/hostid file is at least 4 bytes, then return + * the first four bytes [0..3] in native endian order. + * + * 3. Always ignore bytes [4..] if they exist in the file. + * + * Only the first four bytes are significant, even on systems that + * have a 64-bit word size. + * + * See: + * + * eglibc: sysdeps/unix/sysv/linux/gethostid.c + * coreutils: src/hostid.c + * + * Notes: + * + * The /etc/hostid file on Solaris is a text file that often reads: + * + * # DO NOT EDIT + * "0123456789" + * + * Directly copying this file to Linux results in a constant + * hostid of 4f442023 because the default comment constitutes + * the first four bytes of the file. + * + */ + +char *spl_hostid_path = HW_HOSTID_PATH; +module_param(spl_hostid_path, charp, 0444); +MODULE_PARM_DESC(spl_hostid_path, "The system hostid file (/etc/hostid)"); + +static int +hostid_read(uint32_t *hostid) +{ + uint64_t size; + struct _buf *file; + uint32_t value = 0; + int error; + + file = kobj_open_file(spl_hostid_path); + if (file == (struct _buf *)-1) + return (ENOENT); + + error = kobj_get_filesize(file, &size); + if (error) { + kobj_close_file(file); + return (error); + } + + if (size < sizeof (HW_HOSTID_MASK)) { + kobj_close_file(file); + return (EINVAL); + } + + /* + * Read directly into the variable like eglibc does. + * Short reads are okay; native behavior is preserved. + */ + error = kobj_read_file(file, (char *)&value, sizeof (value), 0); + if (error < 0) { + kobj_close_file(file); + return (EIO); + } + + /* Mask down to 32 bits like coreutils does. */ + *hostid = (value & HW_HOSTID_MASK); + kobj_close_file(file); + + return (0); +} + +/* + * Return the system hostid. Preferentially use the spl_hostid module option + * when set, otherwise use the value in the /etc/hostid file. 
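+ *
+ * For example, the hostid may be pinned at module load time rather than
+ * read from /etc/hostid (a usage sketch; the value is arbitrary):
+ *
+ *	modprobe spl spl_hostid=0x00bab10c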
+ */ +uint32_t +zone_get_hostid(void *zone) +{ + uint32_t hostid; + + ASSERT3P(zone, ==, NULL); + + if (spl_hostid != 0) + return ((uint32_t)(spl_hostid & HW_HOSTID_MASK)); + + if (hostid_read(&hostid) == 0) + return (hostid); + + return (0); +} +EXPORT_SYMBOL(zone_get_hostid); + +static int +spl_kvmem_init(void) +{ + int rc = 0; + + rc = spl_kmem_init(); + if (rc) + return (rc); + + rc = spl_vmem_init(); + if (rc) { + spl_kmem_fini(); + return (rc); + } + + return (rc); +} + +/* + * We initialize the random number generator with 128 bits of entropy from the + * system random number generator. In the improbable case that we have a zero + * seed, we fallback to the system jiffies, unless it is also zero, in which + * situation we use a preprogrammed seed. We step forward by 2^64 iterations to + * initialize each of the per-cpu seeds so that the sequences generated on each + * CPU are guaranteed to never overlap in practice. + */ +static void __init +spl_random_init(void) +{ + uint64_t s[2]; + int i; + + get_random_bytes(s, sizeof (s)); + + if (s[0] == 0 && s[1] == 0) { + if (jiffies != 0) { + s[0] = jiffies; + s[1] = ~0 - jiffies; + } else { + (void) memcpy(s, "improbable seed", sizeof (s)); + } + printk("SPL: get_random_bytes() returned 0 " + "when generating random seed. Setting initial seed to " + "0x%016llx%016llx.", cpu_to_be64(s[0]), cpu_to_be64(s[1])); + } + + for_each_possible_cpu(i) { + uint64_t *wordp = per_cpu(spl_pseudo_entropy, i); + + spl_rand_jump(s); + + wordp[0] = s[0]; + wordp[1] = s[1]; + } +} + +static void +spl_kvmem_fini(void) +{ + spl_vmem_fini(); + spl_kmem_fini(); +} + +static int __init +spl_init(void) +{ + int rc = 0; + + bzero(&p0, sizeof (proc_t)); + spl_random_init(); + + if ((rc = spl_kvmem_init())) + goto out1; + + if ((rc = spl_mutex_init())) + goto out2; + + if ((rc = spl_rw_init())) + goto out3; + + if ((rc = spl_tsd_init())) + goto out4; + + if ((rc = spl_taskq_init())) + goto out5; + + if ((rc = spl_kmem_cache_init())) + goto out6; + + if ((rc = spl_vn_init())) + goto out7; + + if ((rc = spl_proc_init())) + goto out8; + + if ((rc = spl_kstat_init())) + goto out9; + + if ((rc = spl_zlib_init())) + goto out10; + + printk(KERN_NOTICE "SPL: Loaded module v%s-%s%s\n", SPL_META_VERSION, + SPL_META_RELEASE, SPL_DEBUG_STR); + return (rc); + +out10: + spl_kstat_fini(); +out9: + spl_proc_fini(); +out8: + spl_vn_fini(); +out7: + spl_kmem_cache_fini(); +out6: + spl_taskq_fini(); +out5: + spl_tsd_fini(); +out4: + spl_rw_fini(); +out3: + spl_mutex_fini(); +out2: + spl_kvmem_fini(); +out1: + printk(KERN_NOTICE "SPL: Failed to Load Solaris Porting Layer " + "v%s-%s%s, rc = %d\n", SPL_META_VERSION, SPL_META_RELEASE, + SPL_DEBUG_STR, rc); + + return (rc); +} + +static void __exit +spl_fini(void) +{ + printk(KERN_NOTICE "SPL: Unloaded module v%s-%s%s\n", + SPL_META_VERSION, SPL_META_RELEASE, SPL_DEBUG_STR); + spl_zlib_fini(); + spl_kstat_fini(); + spl_proc_fini(); + spl_vn_fini(); + spl_kmem_cache_fini(); + spl_taskq_fini(); + spl_tsd_fini(); + spl_rw_fini(); + spl_mutex_fini(); + spl_kvmem_fini(); +} + +module_init(spl_init); +module_exit(spl_fini); + +MODULE_DESCRIPTION("Solaris Porting Layer"); +MODULE_AUTHOR(SPL_META_AUTHOR); +MODULE_LICENSE(SPL_META_LICENSE); +MODULE_VERSION(SPL_META_VERSION "-" SPL_META_RELEASE); diff --git a/module/spl/spl-kmem-cache.c b/module/spl/spl-kmem-cache.c new file mode 100644 index 000000000..5492c6a46 --- /dev/null +++ b/module/spl/spl-kmem-cache.c @@ -0,0 +1,1769 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, 
LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf <[email protected]>. + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see <http://zfsonlinux.org/>. + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <sys/kmem.h> +#include <sys/kmem_cache.h> +#include <sys/shrinker.h> +#include <sys/taskq.h> +#include <sys/timer.h> +#include <sys/vmem.h> +#include <sys/wait.h> +#include <linux/slab.h> +#include <linux/swap.h> +#include <linux/prefetch.h> + +/* + * Within the scope of spl-kmem.c file the kmem_cache_* definitions + * are removed to allow access to the real Linux slab allocator. + */ +#undef kmem_cache_destroy +#undef kmem_cache_create +#undef kmem_cache_alloc +#undef kmem_cache_free + + +/* + * Linux 3.16 replaced smp_mb__{before,after}_{atomic,clear}_{dec,inc,bit}() + * with smp_mb__{before,after}_atomic() because they were redundant. This is + * only used inside our SLAB allocator, so we implement an internal wrapper + * here to give us smp_mb__{before,after}_atomic() on older kernels. + */ +#ifndef smp_mb__before_atomic +#define smp_mb__before_atomic(x) smp_mb__before_clear_bit(x) +#endif + +#ifndef smp_mb__after_atomic +#define smp_mb__after_atomic(x) smp_mb__after_clear_bit(x) +#endif + +/* + * Cache expiration was implemented because it was part of the default Solaris + * kmem_cache behavior. The idea is that per-cpu objects which haven't been + * accessed in several seconds should be returned to the cache. On the other + * hand Linux slabs never move objects back to the slabs unless there is + * memory pressure on the system. By default the Linux method is enabled + * because it has been shown to improve responsiveness on low memory systems. + * This policy may be changed by setting KMC_EXPIRE_AGE or KMC_EXPIRE_MEM. + */ +/* BEGIN CSTYLED */ +unsigned int spl_kmem_cache_expire = KMC_EXPIRE_MEM; +EXPORT_SYMBOL(spl_kmem_cache_expire); +module_param(spl_kmem_cache_expire, uint, 0644); +MODULE_PARM_DESC(spl_kmem_cache_expire, "By age (0x1) or low memory (0x2)"); + +/* + * Cache magazines are an optimization designed to minimize the cost of + * allocating memory. They do this by keeping a per-cpu cache of recently + * freed objects, which can then be reallocated without taking a lock. This + * can improve performance on highly contended caches. However, because + * objects in magazines will prevent otherwise empty slabs from being + * immediately released this may not be ideal for low memory machines. + * + * For this reason spl_kmem_cache_magazine_size can be used to set a maximum + * magazine size. When this value is set to 0 the magazine size will be + * automatically determined based on the object size. Otherwise magazines + * will be limited to 2-256 objects per magazine (i.e per cpu). 
Magazines
+ * may never be entirely disabled in this implementation.
+ */
+unsigned int spl_kmem_cache_magazine_size = 0;
+module_param(spl_kmem_cache_magazine_size, uint, 0444);
+MODULE_PARM_DESC(spl_kmem_cache_magazine_size,
+	"Default magazine size (2-256), set automatically (0)");
+
+/*
+ * The default behavior is to report the number of objects remaining in the
+ * cache. This allows the Linux VM to repeatedly reclaim objects from the
+ * cache when memory is low to satisfy other memory allocations.
+ * Alternatively, setting this value to KMC_RECLAIM_ONCE limits how
+ * aggressively the cache is reclaimed. This may increase the likelihood of
+ * out-of-memory events.
+ */
+unsigned int spl_kmem_cache_reclaim = 0 /* KMC_RECLAIM_ONCE */;
+module_param(spl_kmem_cache_reclaim, uint, 0644);
+MODULE_PARM_DESC(spl_kmem_cache_reclaim, "Single reclaim pass (0x1)");
+
+unsigned int spl_kmem_cache_obj_per_slab = SPL_KMEM_CACHE_OBJ_PER_SLAB;
+module_param(spl_kmem_cache_obj_per_slab, uint, 0644);
+MODULE_PARM_DESC(spl_kmem_cache_obj_per_slab, "Number of objects per slab");
+
+unsigned int spl_kmem_cache_obj_per_slab_min = SPL_KMEM_CACHE_OBJ_PER_SLAB_MIN;
+module_param(spl_kmem_cache_obj_per_slab_min, uint, 0644);
+MODULE_PARM_DESC(spl_kmem_cache_obj_per_slab_min,
+	"Minimal number of objects per slab");
+
+unsigned int spl_kmem_cache_max_size = SPL_KMEM_CACHE_MAX_SIZE;
+module_param(spl_kmem_cache_max_size, uint, 0644);
+MODULE_PARM_DESC(spl_kmem_cache_max_size, "Maximum size of slab in MB");
+
+/*
+ * For small objects the Linux slab allocator should be used to make the most
+ * efficient use of the memory. However, large objects are not supported by
+ * the Linux slab and therefore the SPL implementation is preferred. A cutoff
+ * of 16K was determined to be optimal for architectures using 4K pages.
+ */
+#if PAGE_SIZE == 4096
+unsigned int spl_kmem_cache_slab_limit = 16384;
+#else
+unsigned int spl_kmem_cache_slab_limit = 0;
+#endif
+module_param(spl_kmem_cache_slab_limit, uint, 0644);
+MODULE_PARM_DESC(spl_kmem_cache_slab_limit,
+	"Objects less than N bytes use the Linux slab");
+
+/*
+ * This value defaults to a threshold designed to avoid allocations which
+ * have been deemed costly by the kernel.
+ */
+unsigned int spl_kmem_cache_kmem_limit =
+	((1 << (PAGE_ALLOC_COSTLY_ORDER - 1)) * PAGE_SIZE) /
+	SPL_KMEM_CACHE_OBJ_PER_SLAB;
+module_param(spl_kmem_cache_kmem_limit, uint, 0644);
+MODULE_PARM_DESC(spl_kmem_cache_kmem_limit,
+	"Objects less than N bytes use kmalloc");
+
+/*
+ * The number of threads available to allocate new slabs for caches. This
+ * should not need to be tuned but it is available for performance analysis.
+ */
+unsigned int spl_kmem_cache_kmem_threads = 4;
+module_param(spl_kmem_cache_kmem_threads, uint, 0444);
+MODULE_PARM_DESC(spl_kmem_cache_kmem_threads,
+	"Number of spl_kmem_cache threads");
+/* END CSTYLED */
+
+/*
+ * Slab allocation interfaces
+ *
+ * While the Linux slab implementation was inspired by the Solaris
+ * implementation I cannot use it to emulate the Solaris APIs. I
+ * require two features which are not provided by the Linux slab.
+ *
+ * 1) Constructors AND destructors. Recent versions of the Linux
+ *    kernel have removed support for destructors. This is a deal
+ *    breaker for the SPL which contains particularly expensive
+ *    initializers for mutexes, condition variables, etc. We also
+ *    require a minimal level of cleanup for these data types, unlike
+ *    many Linux data types which do not need to be explicitly destroyed.
+ *
+ * 2) Virtual address space backed slab. Callers of the Solaris slab
+ *    expect it to work well for both small and very large allocations.
+ *    Because of memory fragmentation the Linux slab, which is backed
+ *    by kmalloc'ed memory, performs very badly when confronted with
+ *    large numbers of large allocations. Basing the slab on the
+ *    virtual address space removes the need for contiguous pages
+ *    and greatly improves performance for large allocations.
+ *
+ * For these reasons, the SPL has its own slab implementation with
+ * the needed features. It is not as highly optimized as either the
+ * Solaris or Linux slabs, but it should get me most of what is
+ * needed until it can be optimized or obsoleted by another approach.
+ *
+ * One serious concern I do have about this method is the relatively
+ * small virtual address space on 32-bit arches. This will seriously
+ * constrain the size of the slab caches and their performance.
+ */
+
+struct list_head spl_kmem_cache_list;   /* List of caches */
+struct rw_semaphore spl_kmem_cache_sem; /* Cache list lock */
+taskq_t *spl_kmem_cache_taskq;          /* Task queue for ageing / reclaim */
+
+static void spl_cache_shrink(spl_kmem_cache_t *skc, void *obj);
+
+SPL_SHRINKER_CALLBACK_FWD_DECLARE(spl_kmem_cache_generic_shrinker);
+SPL_SHRINKER_DECLARE(spl_kmem_cache_shrinker,
+	spl_kmem_cache_generic_shrinker, KMC_DEFAULT_SEEKS);
+
+static void *
+kv_alloc(spl_kmem_cache_t *skc, int size, int flags)
+{
+	gfp_t lflags = kmem_flags_convert(flags);
+	void *ptr;
+
+	if (skc->skc_flags & KMC_KMEM) {
+		ASSERT(ISP2(size));
+		ptr = (void *)__get_free_pages(lflags, get_order(size));
+	} else {
+		ptr = __vmalloc(size, lflags | __GFP_HIGHMEM, PAGE_KERNEL);
+	}
+
+	/* Resulting allocated memory will be page aligned */
+	ASSERT(IS_P2ALIGNED(ptr, PAGE_SIZE));
+
+	return (ptr);
+}
+
+static void
+kv_free(spl_kmem_cache_t *skc, void *ptr, int size)
+{
+	ASSERT(IS_P2ALIGNED(ptr, PAGE_SIZE));
+
+	/*
+	 * The Linux direct reclaim path uses this out of band value to
+	 * determine if forward progress is being made. Normally this is
+	 * incremented by kmem_freepages() which is part of the various
+	 * Linux slab implementations. However, since we are using none
+	 * of that infrastructure we are responsible for incrementing it.
+	 */
+	if (current->reclaim_state)
+		current->reclaim_state->reclaimed_slab += size >> PAGE_SHIFT;
+
+	if (skc->skc_flags & KMC_KMEM) {
+		ASSERT(ISP2(size));
+		free_pages((unsigned long)ptr, get_order(size));
+	} else {
+		vfree(ptr);
+	}
+}
+
+/*
+ * Required space for each aligned sks.
+ */
+static inline uint32_t
+spl_sks_size(spl_kmem_cache_t *skc)
+{
+	return (P2ROUNDUP_TYPED(sizeof (spl_kmem_slab_t),
+	    skc->skc_obj_align, uint32_t));
+}
+
+/*
+ * Required space for each aligned object.
+ */
+static inline uint32_t
+spl_obj_size(spl_kmem_cache_t *skc)
+{
+	uint32_t align = skc->skc_obj_align;
+
+	return (P2ROUNDUP_TYPED(skc->skc_obj_size, align, uint32_t) +
+	    P2ROUNDUP_TYPED(sizeof (spl_kmem_obj_t), align, uint32_t));
+}
+
+/*
+ * Look up the spl_kmem_obj_t for a given object.
+ */
+static inline spl_kmem_obj_t *
+spl_sko_from_obj(spl_kmem_cache_t *skc, void *obj)
+{
+	return (obj + P2ROUNDUP_TYPED(skc->skc_obj_size,
+	    skc->skc_obj_align, uint32_t));
+}
+
+/*
+ * Required space for each offslab object taking into account alignment
+ * restrictions and the power-of-two requirement of kv_alloc().
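+ *
+ * For example (assuming a 32-byte spl_kmem_obj_t and 32-byte alignment),
+ * a 20000 byte object has spl_obj_size() = 20032, so fls64() returns 15
+ * and the offslab allocation is rounded up to 1 << 16 = 64 KiB.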
+ */ +static inline uint32_t +spl_offslab_size(spl_kmem_cache_t *skc) +{ + return (1UL << (fls64(spl_obj_size(skc)) + 1)); +} + +/* + * It's important that we pack the spl_kmem_obj_t structure and the + * actual objects in to one large address space to minimize the number + * of calls to the allocator. It is far better to do a few large + * allocations and then subdivide it ourselves. Now which allocator + * we use requires balancing a few trade offs. + * + * For small objects we use kmem_alloc() because as long as you are + * only requesting a small number of pages (ideally just one) its cheap. + * However, when you start requesting multiple pages with kmem_alloc() + * it gets increasingly expensive since it requires contiguous pages. + * For this reason we shift to vmem_alloc() for slabs of large objects + * which removes the need for contiguous pages. We do not use + * vmem_alloc() in all cases because there is significant locking + * overhead in __get_vm_area_node(). This function takes a single + * global lock when acquiring an available virtual address range which + * serializes all vmem_alloc()'s for all slab caches. Using slightly + * different allocation functions for small and large objects should + * give us the best of both worlds. + * + * KMC_ONSLAB KMC_OFFSLAB + * + * +------------------------+ +-----------------+ + * | spl_kmem_slab_t --+-+ | | spl_kmem_slab_t |---+-+ + * | skc_obj_size <-+ | | +-----------------+ | | + * | spl_kmem_obj_t | | | | + * | skc_obj_size <---+ | +-----------------+ | | + * | spl_kmem_obj_t | | | skc_obj_size | <-+ | + * | ... v | | spl_kmem_obj_t | | + * +------------------------+ +-----------------+ v + */ +static spl_kmem_slab_t * +spl_slab_alloc(spl_kmem_cache_t *skc, int flags) +{ + spl_kmem_slab_t *sks; + spl_kmem_obj_t *sko, *n; + void *base, *obj; + uint32_t obj_size, offslab_size = 0; + int i, rc = 0; + + base = kv_alloc(skc, skc->skc_slab_size, flags); + if (base == NULL) + return (NULL); + + sks = (spl_kmem_slab_t *)base; + sks->sks_magic = SKS_MAGIC; + sks->sks_objs = skc->skc_slab_objs; + sks->sks_age = jiffies; + sks->sks_cache = skc; + INIT_LIST_HEAD(&sks->sks_list); + INIT_LIST_HEAD(&sks->sks_free_list); + sks->sks_ref = 0; + obj_size = spl_obj_size(skc); + + if (skc->skc_flags & KMC_OFFSLAB) + offslab_size = spl_offslab_size(skc); + + for (i = 0; i < sks->sks_objs; i++) { + if (skc->skc_flags & KMC_OFFSLAB) { + obj = kv_alloc(skc, offslab_size, flags); + if (!obj) { + rc = -ENOMEM; + goto out; + } + } else { + obj = base + spl_sks_size(skc) + (i * obj_size); + } + + ASSERT(IS_P2ALIGNED(obj, skc->skc_obj_align)); + sko = spl_sko_from_obj(skc, obj); + sko->sko_addr = obj; + sko->sko_magic = SKO_MAGIC; + sko->sko_slab = sks; + INIT_LIST_HEAD(&sko->sko_list); + list_add_tail(&sko->sko_list, &sks->sks_free_list); + } + +out: + if (rc) { + if (skc->skc_flags & KMC_OFFSLAB) + list_for_each_entry_safe(sko, + n, &sks->sks_free_list, sko_list) { + kv_free(skc, sko->sko_addr, offslab_size); + } + + kv_free(skc, base, skc->skc_slab_size); + sks = NULL; + } + + return (sks); +} + +/* + * Remove a slab from complete or partial list, it must be called with + * the 'skc->skc_lock' held but the actual free must be performed + * outside the lock to prevent deadlocking on vmem addresses. 
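+ * (Freeing a vmem backed address may sleep and takes the same global
+ * virtual address space locks as __vmalloc(), which is why it must
+ * never happen while holding the spin lock.)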
+ */ +static void +spl_slab_free(spl_kmem_slab_t *sks, + struct list_head *sks_list, struct list_head *sko_list) +{ + spl_kmem_cache_t *skc; + + ASSERT(sks->sks_magic == SKS_MAGIC); + ASSERT(sks->sks_ref == 0); + + skc = sks->sks_cache; + ASSERT(skc->skc_magic == SKC_MAGIC); + + /* + * Update slab/objects counters in the cache, then remove the + * slab from the skc->skc_partial_list. Finally add the slab + * and all its objects in to the private work lists where the + * destructors will be called and the memory freed to the system. + */ + skc->skc_obj_total -= sks->sks_objs; + skc->skc_slab_total--; + list_del(&sks->sks_list); + list_add(&sks->sks_list, sks_list); + list_splice_init(&sks->sks_free_list, sko_list); +} + +/* + * Reclaim empty slabs at the end of the partial list. + */ +static void +spl_slab_reclaim(spl_kmem_cache_t *skc) +{ + spl_kmem_slab_t *sks, *m; + spl_kmem_obj_t *sko, *n; + LIST_HEAD(sks_list); + LIST_HEAD(sko_list); + uint32_t size = 0; + + /* + * Empty slabs and objects must be moved to a private list so they + * can be safely freed outside the spin lock. All empty slabs are + * at the end of skc->skc_partial_list, therefore once a non-empty + * slab is found we can stop scanning. + */ + spin_lock(&skc->skc_lock); + list_for_each_entry_safe_reverse(sks, m, + &skc->skc_partial_list, sks_list) { + + if (sks->sks_ref > 0) + break; + + spl_slab_free(sks, &sks_list, &sko_list); + } + spin_unlock(&skc->skc_lock); + + /* + * The following two loops ensure all the object destructors are + * run, any offslab objects are freed, and the slabs themselves + * are freed. This is all done outside the skc->skc_lock since + * this allows the destructor to sleep, and allows us to perform + * a conditional reschedule when a freeing a large number of + * objects and slabs back to the system. + */ + if (skc->skc_flags & KMC_OFFSLAB) + size = spl_offslab_size(skc); + + list_for_each_entry_safe(sko, n, &sko_list, sko_list) { + ASSERT(sko->sko_magic == SKO_MAGIC); + + if (skc->skc_flags & KMC_OFFSLAB) + kv_free(skc, sko->sko_addr, size); + } + + list_for_each_entry_safe(sks, m, &sks_list, sks_list) { + ASSERT(sks->sks_magic == SKS_MAGIC); + kv_free(skc, sks, skc->skc_slab_size); + } +} + +static spl_kmem_emergency_t * +spl_emergency_search(struct rb_root *root, void *obj) +{ + struct rb_node *node = root->rb_node; + spl_kmem_emergency_t *ske; + unsigned long address = (unsigned long)obj; + + while (node) { + ske = container_of(node, spl_kmem_emergency_t, ske_node); + + if (address < ske->ske_obj) + node = node->rb_left; + else if (address > ske->ske_obj) + node = node->rb_right; + else + return (ske); + } + + return (NULL); +} + +static int +spl_emergency_insert(struct rb_root *root, spl_kmem_emergency_t *ske) +{ + struct rb_node **new = &(root->rb_node), *parent = NULL; + spl_kmem_emergency_t *ske_tmp; + unsigned long address = ske->ske_obj; + + while (*new) { + ske_tmp = container_of(*new, spl_kmem_emergency_t, ske_node); + + parent = *new; + if (address < ske_tmp->ske_obj) + new = &((*new)->rb_left); + else if (address > ske_tmp->ske_obj) + new = &((*new)->rb_right); + else + return (0); + } + + rb_link_node(&ske->ske_node, parent, new); + rb_insert_color(&ske->ske_node, root); + + return (1); +} + +/* + * Allocate a single emergency object and track it in a red black tree. 
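+ * The tree is indexed by the object address, allowing
+ * spl_emergency_free() to distinguish emergency objects from ordinary
+ * slab objects in O(log n) time.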
+ */ +static int +spl_emergency_alloc(spl_kmem_cache_t *skc, int flags, void **obj) +{ + gfp_t lflags = kmem_flags_convert(flags); + spl_kmem_emergency_t *ske; + int order = get_order(skc->skc_obj_size); + int empty; + + /* Last chance use a partial slab if one now exists */ + spin_lock(&skc->skc_lock); + empty = list_empty(&skc->skc_partial_list); + spin_unlock(&skc->skc_lock); + if (!empty) + return (-EEXIST); + + ske = kmalloc(sizeof (*ske), lflags); + if (ske == NULL) + return (-ENOMEM); + + ske->ske_obj = __get_free_pages(lflags, order); + if (ske->ske_obj == 0) { + kfree(ske); + return (-ENOMEM); + } + + spin_lock(&skc->skc_lock); + empty = spl_emergency_insert(&skc->skc_emergency_tree, ske); + if (likely(empty)) { + skc->skc_obj_total++; + skc->skc_obj_emergency++; + if (skc->skc_obj_emergency > skc->skc_obj_emergency_max) + skc->skc_obj_emergency_max = skc->skc_obj_emergency; + } + spin_unlock(&skc->skc_lock); + + if (unlikely(!empty)) { + free_pages(ske->ske_obj, order); + kfree(ske); + return (-EINVAL); + } + + *obj = (void *)ske->ske_obj; + + return (0); +} + +/* + * Locate the passed object in the red black tree and free it. + */ +static int +spl_emergency_free(spl_kmem_cache_t *skc, void *obj) +{ + spl_kmem_emergency_t *ske; + int order = get_order(skc->skc_obj_size); + + spin_lock(&skc->skc_lock); + ske = spl_emergency_search(&skc->skc_emergency_tree, obj); + if (ske) { + rb_erase(&ske->ske_node, &skc->skc_emergency_tree); + skc->skc_obj_emergency--; + skc->skc_obj_total--; + } + spin_unlock(&skc->skc_lock); + + if (ske == NULL) + return (-ENOENT); + + free_pages(ske->ske_obj, order); + kfree(ske); + + return (0); +} + +/* + * Release objects from the per-cpu magazine back to their slab. The flush + * argument contains the max number of entries to remove from the magazine. + */ +static void +__spl_cache_flush(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flush) +{ + int i, count = MIN(flush, skm->skm_avail); + + ASSERT(skc->skc_magic == SKC_MAGIC); + ASSERT(skm->skm_magic == SKM_MAGIC); + + for (i = 0; i < count; i++) + spl_cache_shrink(skc, skm->skm_objs[i]); + + skm->skm_avail -= count; + memmove(skm->skm_objs, &(skm->skm_objs[count]), + sizeof (void *) * skm->skm_avail); +} + +static void +spl_cache_flush(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flush) +{ + spin_lock(&skc->skc_lock); + __spl_cache_flush(skc, skm, flush); + spin_unlock(&skc->skc_lock); +} + +static void +spl_magazine_age(void *data) +{ + spl_kmem_cache_t *skc = (spl_kmem_cache_t *)data; + spl_kmem_magazine_t *skm = skc->skc_mag[smp_processor_id()]; + + ASSERT(skm->skm_magic == SKM_MAGIC); + ASSERT(skm->skm_cpu == smp_processor_id()); + ASSERT(irqs_disabled()); + + /* There are no available objects or they are too young to age out */ + if ((skm->skm_avail == 0) || + time_before(jiffies, skm->skm_age + skc->skc_delay * HZ)) + return; + + /* + * Because we're executing in interrupt context we may have + * interrupted the holder of this lock. To avoid a potential + * deadlock return if the lock is contended. + */ + if (!spin_trylock(&skc->skc_lock)) + return; + + __spl_cache_flush(skc, skm, skm->skm_refill); + spin_unlock(&skc->skc_lock); +} + +/* + * Called regularly to keep a downward pressure on the cache. + * + * Objects older than skc->skc_delay seconds in the per-cpu magazines will + * be returned to the caches. This is done to prevent idle magazines from + * holding memory which could be better used elsewhere. The delay is + * present to prevent thrashing the magazine. 
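+ *
+ * For example, assuming the default SPL_KMEM_CACHE_DELAY of 15 seconds,
+ * the task re-dispatches itself roughly every 5 seconds (skc_delay / 3)
+ * and only flushes magazines which have not been touched for a full
+ * skc_delay interval.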
+ * + * The newly released objects may result in empty partial slabs. Those + * slabs should be released to the system. Otherwise moving the objects + * out of the magazines is just wasted work. + */ +static void +spl_cache_age(void *data) +{ + spl_kmem_cache_t *skc = (spl_kmem_cache_t *)data; + taskqid_t id = 0; + + ASSERT(skc->skc_magic == SKC_MAGIC); + + /* Dynamically disabled at run time */ + if (!(spl_kmem_cache_expire & KMC_EXPIRE_AGE)) + return; + + atomic_inc(&skc->skc_ref); + + if (!(skc->skc_flags & KMC_NOMAGAZINE)) + on_each_cpu(spl_magazine_age, skc, 1); + + spl_slab_reclaim(skc); + + while (!test_bit(KMC_BIT_DESTROY, &skc->skc_flags) && !id) { + id = taskq_dispatch_delay( + spl_kmem_cache_taskq, spl_cache_age, skc, TQ_SLEEP, + ddi_get_lbolt() + skc->skc_delay / 3 * HZ); + + /* Destroy issued after dispatch immediately cancel it */ + if (test_bit(KMC_BIT_DESTROY, &skc->skc_flags) && id) + taskq_cancel_id(spl_kmem_cache_taskq, id); + } + + spin_lock(&skc->skc_lock); + skc->skc_taskqid = id; + spin_unlock(&skc->skc_lock); + + atomic_dec(&skc->skc_ref); +} + +/* + * Size a slab based on the size of each aligned object plus spl_kmem_obj_t. + * When on-slab we want to target spl_kmem_cache_obj_per_slab. However, + * for very small objects we may end up with more than this so as not + * to waste space in the minimal allocation of a single page. Also for + * very large objects we may use as few as spl_kmem_cache_obj_per_slab_min, + * lower than this and we will fail. + */ +static int +spl_slab_size(spl_kmem_cache_t *skc, uint32_t *objs, uint32_t *size) +{ + uint32_t sks_size, obj_size, max_size, tgt_size, tgt_objs; + + if (skc->skc_flags & KMC_OFFSLAB) { + tgt_objs = spl_kmem_cache_obj_per_slab; + tgt_size = P2ROUNDUP(sizeof (spl_kmem_slab_t), PAGE_SIZE); + + if ((skc->skc_flags & KMC_KMEM) && + (spl_obj_size(skc) > (SPL_MAX_ORDER_NR_PAGES * PAGE_SIZE))) + return (-ENOSPC); + } else { + sks_size = spl_sks_size(skc); + obj_size = spl_obj_size(skc); + max_size = (spl_kmem_cache_max_size * 1024 * 1024); + tgt_size = (spl_kmem_cache_obj_per_slab * obj_size + sks_size); + + /* + * KMC_KMEM slabs are allocated by __get_free_pages() which + * rounds up to the nearest order. Knowing this the size + * should be rounded up to the next power of two with a hard + * maximum defined by the maximum allowed allocation order. + */ + if (skc->skc_flags & KMC_KMEM) { + max_size = SPL_MAX_ORDER_NR_PAGES * PAGE_SIZE; + tgt_size = MIN(max_size, + PAGE_SIZE * (1 << MAX(get_order(tgt_size) - 1, 1))); + } + + if (tgt_size <= max_size) { + tgt_objs = (tgt_size - sks_size) / obj_size; + } else { + tgt_objs = (max_size - sks_size) / obj_size; + tgt_size = (tgt_objs * obj_size) + sks_size; + } + } + + if (tgt_objs == 0) + return (-ENOSPC); + + *objs = tgt_objs; + *size = tgt_size; + + return (0); +} + +/* + * Make a guess at reasonable per-cpu magazine size based on the size of + * each object and the cost of caching N of them in each magazine. Long + * term this should really adapt based on an observed usage heuristic. 
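+ *
+ * For example, with 4 KiB pages: a 512 byte object gets a 256 entry
+ * magazine, a 64 KiB object a 64 entry magazine, and a 2 MiB object
+ * only a 4 entry magazine.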
+ */
+static int
+spl_magazine_size(spl_kmem_cache_t *skc)
+{
+	uint32_t obj_size = spl_obj_size(skc);
+	int size;
+
+	if (spl_kmem_cache_magazine_size > 0)
+		return (MAX(MIN(spl_kmem_cache_magazine_size, 256), 2));
+
+	/* Per-magazine sizes below assume a 4 KiB page size */
+	if (obj_size > (PAGE_SIZE * 256))
+		size = 4;	/* Minimum 4 MiB per-magazine */
+	else if (obj_size > (PAGE_SIZE * 32))
+		size = 16;	/* Minimum 2 MiB per-magazine */
+	else if (obj_size > (PAGE_SIZE))
+		size = 64;	/* Minimum 256 KiB per-magazine */
+	else if (obj_size > (PAGE_SIZE / 4))
+		size = 128;	/* Minimum 128 KiB per-magazine */
+	else
+		size = 256;
+
+	return (size);
+}
+
+/*
+ * Allocate a per-cpu magazine to associate with a specific core.
+ */
+static spl_kmem_magazine_t *
+spl_magazine_alloc(spl_kmem_cache_t *skc, int cpu)
+{
+	spl_kmem_magazine_t *skm;
+	int size = sizeof (spl_kmem_magazine_t) +
+	    sizeof (void *) * skc->skc_mag_size;
+
+	skm = kmalloc_node(size, GFP_KERNEL, cpu_to_node(cpu));
+	if (skm) {
+		skm->skm_magic = SKM_MAGIC;
+		skm->skm_avail = 0;
+		skm->skm_size = skc->skc_mag_size;
+		skm->skm_refill = skc->skc_mag_refill;
+		skm->skm_cache = skc;
+		skm->skm_age = jiffies;
+		skm->skm_cpu = cpu;
+	}
+
+	return (skm);
+}
+
+/*
+ * Free a per-cpu magazine associated with a specific core.
+ */
+static void
+spl_magazine_free(spl_kmem_magazine_t *skm)
+{
+	ASSERT(skm->skm_magic == SKM_MAGIC);
+	ASSERT(skm->skm_avail == 0);
+	kfree(skm);
+}
+
+/*
+ * Create all per-cpu magazines of reasonable sizes.
+ */
+static int
+spl_magazine_create(spl_kmem_cache_t *skc)
+{
+	int i;
+
+	if (skc->skc_flags & KMC_NOMAGAZINE)
+		return (0);
+
+	skc->skc_mag = kzalloc(sizeof (spl_kmem_magazine_t *) *
+	    num_possible_cpus(), kmem_flags_convert(KM_SLEEP));
+	skc->skc_mag_size = spl_magazine_size(skc);
+	skc->skc_mag_refill = (skc->skc_mag_size + 1) / 2;
+
+	for_each_possible_cpu(i) {
+		skc->skc_mag[i] = spl_magazine_alloc(skc, i);
+		if (!skc->skc_mag[i]) {
+			for (i--; i >= 0; i--)
+				spl_magazine_free(skc->skc_mag[i]);
+
+			kfree(skc->skc_mag);
+			return (-ENOMEM);
+		}
+	}
+
+	return (0);
+}
+
+/*
+ * Destroy all per-cpu magazines.
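+ * Each magazine is flushed back to its slabs before being freed.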
+ */
+static void
+spl_magazine_destroy(spl_kmem_cache_t *skc)
+{
+	spl_kmem_magazine_t *skm;
+	int i;
+
+	if (skc->skc_flags & KMC_NOMAGAZINE)
+		return;
+
+	for_each_possible_cpu(i) {
+		skm = skc->skc_mag[i];
+		spl_cache_flush(skc, skm, skm->skm_avail);
+		spl_magazine_free(skm);
+	}
+
+	kfree(skc->skc_mag);
+}
+
+/*
+ * Create an object cache based on the following arguments:
+ * name		cache name
+ * size		cache object size
+ * align	cache object alignment
+ * ctor		cache object constructor
+ * dtor		cache object destructor
+ * reclaim	cache object reclaim
+ * priv		cache private data for ctor/dtor/reclaim
+ * vmp		unused, must be NULL
+ * flags
+ *	KMC_NOTOUCH	Disable cache object aging (unsupported)
+ *	KMC_NODEBUG	Disable debugging (unsupported)
+ *	KMC_NOHASH	Disable hashing (unsupported)
+ *	KMC_QCACHE	Disable qcache (unsupported)
+ *	KMC_NOMAGAZINE	Enabled for kmem/vmem, disabled for Linux slab
+ *	KMC_KMEM	Force kmem backed cache
+ *	KMC_VMEM	Force vmem backed cache
+ *	KMC_SLAB	Force Linux slab backed cache
+ *	KMC_OFFSLAB	Locate objects off the slab
+ */
+spl_kmem_cache_t *
+spl_kmem_cache_create(char *name, size_t size, size_t align,
+    spl_kmem_ctor_t ctor, spl_kmem_dtor_t dtor, spl_kmem_reclaim_t reclaim,
+    void *priv, void *vmp, int flags)
+{
+	gfp_t lflags = kmem_flags_convert(KM_SLEEP);
+	spl_kmem_cache_t *skc;
+	int rc;
+
+	/*
+	 * Unsupported flags
+	 */
+	ASSERT0(flags & KMC_NOMAGAZINE);
+	ASSERT0(flags & KMC_NOHASH);
+	ASSERT0(flags & KMC_QCACHE);
+	ASSERT(vmp == NULL);
+
+	might_sleep();
+
+	skc = kzalloc(sizeof (*skc), lflags);
+	if (skc == NULL)
+		return (NULL);
+
+	skc->skc_magic = SKC_MAGIC;
+	skc->skc_name_size = strlen(name) + 1;
+	skc->skc_name = (char *)kmalloc(skc->skc_name_size, lflags);
+	if (skc->skc_name == NULL) {
+		kfree(skc);
+		return (NULL);
+	}
+	strncpy(skc->skc_name, name, skc->skc_name_size);
+
+	skc->skc_ctor = ctor;
+	skc->skc_dtor = dtor;
+	skc->skc_reclaim = reclaim;
+	skc->skc_private = priv;
+	skc->skc_vmp = vmp;
+	skc->skc_linux_cache = NULL;
+	skc->skc_flags = flags;
+	skc->skc_obj_size = size;
+	skc->skc_obj_align = SPL_KMEM_CACHE_ALIGN;
+	skc->skc_delay = SPL_KMEM_CACHE_DELAY;
+	skc->skc_reap = SPL_KMEM_CACHE_REAP;
+	atomic_set(&skc->skc_ref, 0);
+
+	INIT_LIST_HEAD(&skc->skc_list);
+	INIT_LIST_HEAD(&skc->skc_complete_list);
+	INIT_LIST_HEAD(&skc->skc_partial_list);
+	skc->skc_emergency_tree = RB_ROOT;
+	spin_lock_init(&skc->skc_lock);
+	init_waitqueue_head(&skc->skc_waitq);
+	skc->skc_slab_fail = 0;
+	skc->skc_slab_create = 0;
+	skc->skc_slab_destroy = 0;
+	skc->skc_slab_total = 0;
+	skc->skc_slab_alloc = 0;
+	skc->skc_slab_max = 0;
+	skc->skc_obj_total = 0;
+	skc->skc_obj_alloc = 0;
+	skc->skc_obj_max = 0;
+	skc->skc_obj_deadlock = 0;
+	skc->skc_obj_emergency = 0;
+	skc->skc_obj_emergency_max = 0;
+
+	/*
+	 * Verify the requested alignment restriction is sane.
+	 */
+	if (align) {
+		VERIFY(ISP2(align));
+		VERIFY3U(align, >=, SPL_KMEM_CACHE_ALIGN);
+		VERIFY3U(align, <=, PAGE_SIZE);
+		skc->skc_obj_align = align;
+	}
+
+	/*
+	 * When no specific type of slab is requested (kmem, vmem, or
+	 * linuxslab) then select a cache type based on the object size
+	 * and default tunables.
+	 */
+	if (!(skc->skc_flags & (KMC_KMEM | KMC_VMEM | KMC_SLAB))) {
+
+		/*
+		 * Objects smaller than spl_kmem_cache_slab_limit can
+		 * use the Linux slab for better space-efficiency. On
+		 * 4 KiB-page systems this limit defaults to 16K; setting
+		 * it to zero disables this behavior entirely.
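+		 * For example, with the default 16384 byte limit on a
+		 * 4 KiB-page system an 8192 byte cache is backed by the
+		 * Linux slab, while larger caches fall through to the
+		 * kmem/vmem checks below.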
+		 */
+		if (spl_kmem_cache_slab_limit &&
+		    size <= (size_t)spl_kmem_cache_slab_limit)
+			skc->skc_flags |= KMC_SLAB;
+
+		/*
+		 * Small objects, less than spl_kmem_cache_kmem_limit
+		 * per object, should use kmem because their slabs are
+		 * small.
+		 */
+		else if (spl_obj_size(skc) <= spl_kmem_cache_kmem_limit)
+			skc->skc_flags |= KMC_KMEM;
+
+		/*
+		 * All other objects are considered large and are placed
+		 * on vmem backed slabs.
+		 */
+		else
+			skc->skc_flags |= KMC_VMEM;
+	}
+
+	/*
+	 * Given the type of slab allocate the required resources.
+	 */
+	if (skc->skc_flags & (KMC_KMEM | KMC_VMEM)) {
+		rc = spl_slab_size(skc,
+		    &skc->skc_slab_objs, &skc->skc_slab_size);
+		if (rc)
+			goto out;
+
+		rc = spl_magazine_create(skc);
+		if (rc)
+			goto out;
+	} else {
+		unsigned long slabflags = 0;
+
+		if (size > (SPL_MAX_KMEM_ORDER_NR_PAGES * PAGE_SIZE)) {
+			rc = EINVAL;
+			goto out;
+		}
+
+#if defined(SLAB_USERCOPY)
+		/*
+		 * Required for PAX-enabled kernels if the slab is to be
+		 * used for copying between user and kernel space.
+		 */
+		slabflags |= SLAB_USERCOPY;
+#endif
+
+#if defined(HAVE_KMEM_CACHE_CREATE_USERCOPY)
+		/*
+		 * Newer grsec patchset uses kmem_cache_create_usercopy()
+		 * instead of SLAB_USERCOPY flag
+		 */
+		skc->skc_linux_cache = kmem_cache_create_usercopy(
+		    skc->skc_name, size, align, slabflags, 0, size, NULL);
+#else
+		skc->skc_linux_cache = kmem_cache_create(
+		    skc->skc_name, size, align, slabflags, NULL);
+#endif
+		if (skc->skc_linux_cache == NULL) {
+			rc = ENOMEM;
+			goto out;
+		}
+
+#if defined(HAVE_KMEM_CACHE_ALLOCFLAGS)
+		skc->skc_linux_cache->allocflags |= __GFP_COMP;
+#elif defined(HAVE_KMEM_CACHE_GFPFLAGS)
+		skc->skc_linux_cache->gfpflags |= __GFP_COMP;
+#endif
+		skc->skc_flags |= KMC_NOMAGAZINE;
+	}
+
+	if (spl_kmem_cache_expire & KMC_EXPIRE_AGE)
+		skc->skc_taskqid = taskq_dispatch_delay(spl_kmem_cache_taskq,
+		    spl_cache_age, skc, TQ_SLEEP,
+		    ddi_get_lbolt() + skc->skc_delay / 3 * HZ);
+
+	down_write(&spl_kmem_cache_sem);
+	list_add_tail(&skc->skc_list, &spl_kmem_cache_list);
+	up_write(&spl_kmem_cache_sem);
+
+	return (skc);
+out:
+	kfree(skc->skc_name);
+	kfree(skc);
+	return (NULL);
+}
+EXPORT_SYMBOL(spl_kmem_cache_create);
+
+/*
+ * Register a move callback for cache defragmentation.
+ * XXX: Unimplemented but harmless to stub out for now.
+ */
+void
+spl_kmem_cache_set_move(spl_kmem_cache_t *skc,
+    kmem_cbrc_t (move)(void *, void *, size_t, void *))
+{
+	ASSERT(move != NULL);
+}
+EXPORT_SYMBOL(spl_kmem_cache_set_move);
+
+/*
+ * Destroy a cache and all objects associated with the cache.
+ */
+void
+spl_kmem_cache_destroy(spl_kmem_cache_t *skc)
+{
+	DECLARE_WAIT_QUEUE_HEAD(wq);
+	taskqid_t id;
+
+	ASSERT(skc->skc_magic == SKC_MAGIC);
+	ASSERT(skc->skc_flags & (KMC_KMEM | KMC_VMEM | KMC_SLAB));
+
+	down_write(&spl_kmem_cache_sem);
+	list_del_init(&skc->skc_list);
+	up_write(&spl_kmem_cache_sem);
+
+	/* Cancel and wait for any pending delayed tasks */
+	VERIFY(!test_and_set_bit(KMC_BIT_DESTROY, &skc->skc_flags));
+
+	spin_lock(&skc->skc_lock);
+	id = skc->skc_taskqid;
+	spin_unlock(&skc->skc_lock);
+
+	taskq_cancel_id(spl_kmem_cache_taskq, id);
+
+	/*
+	 * Wait until all current callers complete; this is mainly
+	 * to catch the case where a low memory situation triggers a
+	 * cache reaping action which races with this destroy.
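+	 * Callers such as the ageing, grow, and reap paths hold a
+	 * reference in skc->skc_ref while they run, which is what this
+	 * wait observes.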
+ */ + wait_event(wq, atomic_read(&skc->skc_ref) == 0); + + if (skc->skc_flags & (KMC_KMEM | KMC_VMEM)) { + spl_magazine_destroy(skc); + spl_slab_reclaim(skc); + } else { + ASSERT(skc->skc_flags & KMC_SLAB); + kmem_cache_destroy(skc->skc_linux_cache); + } + + spin_lock(&skc->skc_lock); + + /* + * Validate there are no objects in use and free all the + * spl_kmem_slab_t, spl_kmem_obj_t, and object buffers. + */ + ASSERT3U(skc->skc_slab_alloc, ==, 0); + ASSERT3U(skc->skc_obj_alloc, ==, 0); + ASSERT3U(skc->skc_slab_total, ==, 0); + ASSERT3U(skc->skc_obj_total, ==, 0); + ASSERT3U(skc->skc_obj_emergency, ==, 0); + ASSERT(list_empty(&skc->skc_complete_list)); + + spin_unlock(&skc->skc_lock); + + kfree(skc->skc_name); + kfree(skc); +} +EXPORT_SYMBOL(spl_kmem_cache_destroy); + +/* + * Allocate an object from a slab attached to the cache. This is used to + * repopulate the per-cpu magazine caches in batches when they run low. + */ +static void * +spl_cache_obj(spl_kmem_cache_t *skc, spl_kmem_slab_t *sks) +{ + spl_kmem_obj_t *sko; + + ASSERT(skc->skc_magic == SKC_MAGIC); + ASSERT(sks->sks_magic == SKS_MAGIC); + + sko = list_entry(sks->sks_free_list.next, spl_kmem_obj_t, sko_list); + ASSERT(sko->sko_magic == SKO_MAGIC); + ASSERT(sko->sko_addr != NULL); + + /* Remove from sks_free_list */ + list_del_init(&sko->sko_list); + + sks->sks_age = jiffies; + sks->sks_ref++; + skc->skc_obj_alloc++; + + /* Track max obj usage statistics */ + if (skc->skc_obj_alloc > skc->skc_obj_max) + skc->skc_obj_max = skc->skc_obj_alloc; + + /* Track max slab usage statistics */ + if (sks->sks_ref == 1) { + skc->skc_slab_alloc++; + + if (skc->skc_slab_alloc > skc->skc_slab_max) + skc->skc_slab_max = skc->skc_slab_alloc; + } + + return (sko->sko_addr); +} + +/* + * Generic slab allocation function to run by the global work queues. + * It is responsible for allocating a new slab, linking it in to the list + * of partial slabs, and then waking any waiters. + */ +static int +__spl_cache_grow(spl_kmem_cache_t *skc, int flags) +{ + spl_kmem_slab_t *sks; + + fstrans_cookie_t cookie = spl_fstrans_mark(); + sks = spl_slab_alloc(skc, flags); + spl_fstrans_unmark(cookie); + + spin_lock(&skc->skc_lock); + if (sks) { + skc->skc_slab_total++; + skc->skc_obj_total += sks->sks_objs; + list_add_tail(&sks->sks_list, &skc->skc_partial_list); + + smp_mb__before_atomic(); + clear_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags); + smp_mb__after_atomic(); + wake_up_all(&skc->skc_waitq); + } + spin_unlock(&skc->skc_lock); + + return (sks == NULL ? -ENOMEM : 0); +} + +static void +spl_cache_grow_work(void *data) +{ + spl_kmem_alloc_t *ska = (spl_kmem_alloc_t *)data; + spl_kmem_cache_t *skc = ska->ska_cache; + + (void) __spl_cache_grow(skc, ska->ska_flags); + + atomic_dec(&skc->skc_ref); + smp_mb__before_atomic(); + clear_bit(KMC_BIT_GROWING, &skc->skc_flags); + smp_mb__after_atomic(); + + kfree(ska); +} + +/* + * Returns non-zero when a new slab should be available. + */ +static int +spl_cache_grow_wait(spl_kmem_cache_t *skc) +{ + return (!test_bit(KMC_BIT_GROWING, &skc->skc_flags)); +} + +/* + * No available objects on any slabs, create a new slab. Note that this + * functionality is disabled for KMC_SLAB caches which are backed by the + * Linux slab. 
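+ *
+ * The slow path, in order: wait out any in-flight reap, attempt a
+ * KM_NOSLEEP allocation in the current context (non-vmem caches only),
+ * dispatch an asynchronous grow to the taskq, and wait up to HZ / 10
+ * (100 ms at HZ=1000) before considering emergency objects.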
+ */
+static int
+spl_cache_grow(spl_kmem_cache_t *skc, int flags, void **obj)
+{
+	int remaining, rc = 0;
+
+	ASSERT0(flags & ~KM_PUBLIC_MASK);
+	ASSERT(skc->skc_magic == SKC_MAGIC);
+	ASSERT((skc->skc_flags & KMC_SLAB) == 0);
+	might_sleep();
+	*obj = NULL;
+
+	/*
+	 * Before allocating a new slab wait for any reaping to complete and
+	 * then return so the local magazine can be rechecked for new objects.
+	 */
+	if (test_bit(KMC_BIT_REAPING, &skc->skc_flags)) {
+		rc = spl_wait_on_bit(&skc->skc_flags, KMC_BIT_REAPING,
+		    TASK_UNINTERRUPTIBLE);
+		return (rc ? rc : -EAGAIN);
+	}
+
+	/*
+	 * To reduce the overhead of context switching and improve NUMA
+	 * locality, we first try to allocate a new slab in the current
+	 * process context with the KM_NOSLEEP flag. If that fails, the
+	 * allocation is dispatched to a taskq.
+	 *
+	 * However, this can't be applied to KMC_VMEM caches due to a bug
+	 * where __vmalloc() doesn't honor gfp flags in page table
+	 * allocation.
+	 */
+	if (!(skc->skc_flags & KMC_VMEM)) {
+		rc = __spl_cache_grow(skc, flags | KM_NOSLEEP);
+		if (rc == 0)
+			return (0);
+	}
+
+	/*
+	 * This is handled by dispatching a work request to the global work
+	 * queue. This allows us to asynchronously allocate a new slab while
+	 * retaining the ability to safely fall back to smaller synchronous
+	 * allocations to ensure forward progress is always maintained.
+	 */
+	if (test_and_set_bit(KMC_BIT_GROWING, &skc->skc_flags) == 0) {
+		spl_kmem_alloc_t *ska;
+
+		ska = kmalloc(sizeof (*ska), kmem_flags_convert(flags));
+		if (ska == NULL) {
+			clear_bit_unlock(KMC_BIT_GROWING, &skc->skc_flags);
+			smp_mb__after_atomic();
+			wake_up_all(&skc->skc_waitq);
+			return (-ENOMEM);
+		}
+
+		atomic_inc(&skc->skc_ref);
+		ska->ska_cache = skc;
+		ska->ska_flags = flags;
+		taskq_init_ent(&ska->ska_tqe);
+		taskq_dispatch_ent(spl_kmem_cache_taskq,
+		    spl_cache_grow_work, ska, 0, &ska->ska_tqe);
+	}
+
+	/*
+	 * The goal here is to only detect the rare case where a virtual slab
+	 * allocation has deadlocked. We must be careful to minimize the use
+	 * of emergency objects which are more expensive to track. Therefore,
+	 * we set a very long timeout for the asynchronous allocation and if
+	 * the timeout is reached the cache is flagged as deadlocked. From
+	 * this point only new emergency objects will be allocated until the
+	 * asynchronous allocation completes and clears the deadlocked flag.
+	 */
+	if (test_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags)) {
+		rc = spl_emergency_alloc(skc, flags, obj);
+	} else {
+		remaining = wait_event_timeout(skc->skc_waitq,
+		    spl_cache_grow_wait(skc), HZ / 10);
+
+		if (!remaining) {
+			spin_lock(&skc->skc_lock);
+			if (test_bit(KMC_BIT_GROWING, &skc->skc_flags)) {
+				set_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags);
+				skc->skc_obj_deadlock++;
+			}
+			spin_unlock(&skc->skc_lock);
+		}
+
+		rc = -ENOMEM;
+	}
+
+	return (rc);
+}
+
+/*
+ * Refill a per-cpu magazine with objects from the slabs for this cache.
+ * Ideally the magazine can be repopulated using existing objects which have
+ * been released, however if we are unable to locate enough free objects new
+ * slabs of objects will be created. On success NULL is returned, otherwise
+ * the address of a single emergency object is returned for use by the caller.
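+ *
+ * Note the caller enters with local interrupts disabled; they are
+ * briefly re-enabled around spl_cache_grow() because growing the
+ * cache may sleep.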
+ */
+static void *
+spl_cache_refill(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flags)
+{
+	spl_kmem_slab_t *sks;
+	int count = 0, rc, refill;
+	void *obj = NULL;
+
+	ASSERT(skc->skc_magic == SKC_MAGIC);
+	ASSERT(skm->skm_magic == SKM_MAGIC);
+
+	refill = MIN(skm->skm_refill, skm->skm_size - skm->skm_avail);
+	spin_lock(&skc->skc_lock);
+
+	while (refill > 0) {
+		/* No slabs available, we may need to grow the cache */
+		if (list_empty(&skc->skc_partial_list)) {
+			spin_unlock(&skc->skc_lock);
+
+			local_irq_enable();
+			rc = spl_cache_grow(skc, flags, &obj);
+			local_irq_disable();
+
+			/* Emergency object for immediate use by caller */
+			if (rc == 0 && obj != NULL)
+				return (obj);
+
+			if (rc)
+				goto out;
+
+			/* Rescheduled to a different CPU, skm is not local */
+			if (skm != skc->skc_mag[smp_processor_id()])
+				goto out;
+
+			/*
+			 * Potentially rescheduled to the same CPU but
+			 * allocations may have occurred from this CPU while
+			 * we were sleeping so recalculate max refill.
+			 */
+			refill = MIN(refill, skm->skm_size - skm->skm_avail);
+
+			spin_lock(&skc->skc_lock);
+			continue;
+		}
+
+		/* Grab the next available slab */
+		sks = list_entry((&skc->skc_partial_list)->next,
+		    spl_kmem_slab_t, sks_list);
+		ASSERT(sks->sks_magic == SKS_MAGIC);
+		ASSERT(sks->sks_ref < sks->sks_objs);
+		ASSERT(!list_empty(&sks->sks_free_list));
+
+		/*
+		 * Consume as many objects as needed to refill the requested
+		 * cache. We must also be careful not to overfill it.
+		 */
+		while (sks->sks_ref < sks->sks_objs && refill-- > 0 &&
+		    ++count) {
+			ASSERT(skm->skm_avail < skm->skm_size);
+			ASSERT(count < skm->skm_size);
+			skm->skm_objs[skm->skm_avail++] =
+			    spl_cache_obj(skc, sks);
+		}
+
+		/* Move slab to skc_complete_list when full */
+		if (sks->sks_ref == sks->sks_objs) {
+			list_del(&sks->sks_list);
+			list_add(&sks->sks_list, &skc->skc_complete_list);
+		}
+	}
+
+	spin_unlock(&skc->skc_lock);
+out:
+	return (NULL);
+}
+
+/*
+ * Release an object back to the slab from which it came.
+ */
+static void
+spl_cache_shrink(spl_kmem_cache_t *skc, void *obj)
+{
+	spl_kmem_slab_t *sks = NULL;
+	spl_kmem_obj_t *sko = NULL;
+
+	ASSERT(skc->skc_magic == SKC_MAGIC);
+
+	sko = spl_sko_from_obj(skc, obj);
+	ASSERT(sko->sko_magic == SKO_MAGIC);
+	sks = sko->sko_slab;
+	ASSERT(sks->sks_magic == SKS_MAGIC);
+	ASSERT(sks->sks_cache == skc);
+	list_add(&sko->sko_list, &sks->sks_free_list);
+
+	sks->sks_age = jiffies;
+	sks->sks_ref--;
+	skc->skc_obj_alloc--;
+
+	/*
+	 * Move slab to skc_partial_list when no longer full. Slabs
+	 * are added to the head to keep the partial list in quasi-full
+	 * sorted order. Fuller at the head, emptier at the tail.
+	 */
+	if (sks->sks_ref == (sks->sks_objs - 1)) {
+		list_del(&sks->sks_list);
+		list_add(&sks->sks_list, &skc->skc_partial_list);
+	}
+
+	/*
+	 * Move empty slabs to the end of the partial list so
+	 * they can be easily found and freed during reclamation.
+	 */
+	if (sks->sks_ref == 0) {
+		list_del(&sks->sks_list);
+		list_add_tail(&sks->sks_list, &skc->skc_partial_list);
+		skc->skc_slab_alloc--;
+	}
+}
+
+/*
+ * Allocate an object from the per-cpu magazine, or if the magazine
+ * is empty directly allocate from a slab and repopulate the magazine.
+ */
+void *
+spl_kmem_cache_alloc(spl_kmem_cache_t *skc, int flags)
+{
+	spl_kmem_magazine_t *skm;
+	void *obj = NULL;
+
+	ASSERT0(flags & ~KM_PUBLIC_MASK);
+	ASSERT(skc->skc_magic == SKC_MAGIC);
+	ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags));
+
+	/*
+	 * Allocate directly from a Linux slab.
All optimizations are left
+	 * to the underlying cache; we only need to guarantee that KM_SLEEP
+	 * callers will never fail.
+	 */
+	if (skc->skc_flags & KMC_SLAB) {
+		struct kmem_cache *slc = skc->skc_linux_cache;
+		do {
+			obj = kmem_cache_alloc(slc, kmem_flags_convert(flags));
+		} while ((obj == NULL) && !(flags & KM_NOSLEEP));
+
+		goto ret;
+	}
+
+	local_irq_disable();
+
+restart:
+	/*
+	 * Safe to update per-cpu structure without lock, but
+	 * in the restart case we must be careful to reacquire
+	 * the local magazine since this may have changed
+	 * when we need to grow the cache.
+	 */
+	skm = skc->skc_mag[smp_processor_id()];
+	ASSERT(skm->skm_magic == SKM_MAGIC);
+
+	if (likely(skm->skm_avail)) {
+		/* Object available in CPU cache, use it */
+		obj = skm->skm_objs[--skm->skm_avail];
+		skm->skm_age = jiffies;
+	} else {
+		obj = spl_cache_refill(skc, skm, flags);
+		if ((obj == NULL) && !(flags & KM_NOSLEEP))
+			goto restart;
+
+		local_irq_enable();
+		goto ret;
+	}
+
+	local_irq_enable();
+	ASSERT(obj);
+	ASSERT(IS_P2ALIGNED(obj, skc->skc_obj_align));
+
+ret:
+	/* Pre-emptively migrate object to CPU L1 cache */
+	if (obj) {
+		if (skc->skc_ctor)
+			skc->skc_ctor(obj, skc->skc_private, flags);
+		else
+			prefetchw(obj);
+	}
+
+	return (obj);
+}
+EXPORT_SYMBOL(spl_kmem_cache_alloc);
+
+/*
+ * Free an object back to the local per-cpu magazine; there is no
+ * guarantee that this is the same magazine the object was originally
+ * allocated from. We may need to flush entire magazines back to the
+ * slabs to make space.
+ */
+void
+spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj)
+{
+	spl_kmem_magazine_t *skm;
+	unsigned long flags;
+	int do_reclaim = 0;
+	int do_emergency = 0;
+
+	ASSERT(skc->skc_magic == SKC_MAGIC);
+	ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags));
+
+	/*
+	 * Run the destructor
+	 */
+	if (skc->skc_dtor)
+		skc->skc_dtor(obj, skc->skc_private);
+
+	/*
+	 * Free the object back to the underlying Linux slab.
+	 */
+	if (skc->skc_flags & KMC_SLAB) {
+		kmem_cache_free(skc->skc_linux_cache, obj);
+		return;
+	}
+
+	/*
+	 * While a cache has outstanding emergency objects all freed objects
+	 * must be checked. However, since emergency objects will never use
+	 * a virtual address these objects can be safely excluded as an
+	 * optimization.
+	 */
+	if (!is_vmalloc_addr(obj)) {
+		spin_lock(&skc->skc_lock);
+		do_emergency = (skc->skc_obj_emergency > 0);
+		spin_unlock(&skc->skc_lock);
+
+		if (do_emergency && (spl_emergency_free(skc, obj) == 0))
+			return;
+	}
+
+	local_irq_save(flags);
+
+	/*
+	 * Safe to update the per-cpu structure without a lock, but
+	 * because no remote memory allocation tracking is performed
+	 * it is entirely possible to allocate an object from one
+	 * CPU cache and return it to another.
+	 */
+	skm = skc->skc_mag[smp_processor_id()];
+	ASSERT(skm->skm_magic == SKM_MAGIC);
+
+	/*
+	 * Per-CPU cache full, flush it to make space for this object;
+	 * this may result in an empty slab which can be reclaimed once
+	 * interrupts are re-enabled.
+	 */
+	if (unlikely(skm->skm_avail >= skm->skm_size)) {
+		spl_cache_flush(skc, skm, skm->skm_refill);
+		do_reclaim = 1;
+	}
+
+	/* Available space in cache, use it */
+	skm->skm_objs[skm->skm_avail++] = obj;
+
+	local_irq_restore(flags);
+
+	if (do_reclaim)
+		spl_slab_reclaim(skc);
+}
+EXPORT_SYMBOL(spl_kmem_cache_free);
+
+/*
+ * The generic shrinker function for all caches. Under Linux a shrinker
+ * may not be tightly coupled with a slab cache.
In fact Linux always
+ * systematically tries calling all registered shrinker callbacks which
+ * report that they contain unused objects. Because of this we only
+ * register one shrinker function in the shim layer for all slab caches.
+ * We always attempt to shrink all caches when this generic shrinker
+ * is called.
+ *
+ * If sc->nr_to_scan is zero, the caller is requesting a query of the
+ * number of objects which can potentially be freed. If it is nonzero,
+ * the request is to free that many objects.
+ *
+ * Linux kernels >= 3.12 have the count_objects and scan_objects callbacks
+ * in struct shrinker and also require the shrinker to return the number
+ * of objects freed.
+ *
+ * Older kernels require the shrinker to return the number of freeable
+ * objects following the freeing of nr_to_scan objects.
+ *
+ * Linux semantics differ from those under Solaris, which are to
+ * free all available objects which may (and probably will) be more
+ * objects than the requested nr_to_scan.
+ */
+static spl_shrinker_t
+__spl_kmem_cache_generic_shrinker(struct shrinker *shrink,
+    struct shrink_control *sc)
+{
+	spl_kmem_cache_t *skc;
+	int alloc = 0;
+
+	/*
+	 * No shrinking in a transaction context. Can cause deadlocks.
+	 */
+	if (sc->nr_to_scan && spl_fstrans_check())
+		return (SHRINK_STOP);
+
+	down_read(&spl_kmem_cache_sem);
+	list_for_each_entry(skc, &spl_kmem_cache_list, skc_list) {
+		if (sc->nr_to_scan) {
+#ifdef HAVE_SPLIT_SHRINKER_CALLBACK
+			uint64_t oldalloc = skc->skc_obj_alloc;
+			spl_kmem_cache_reap_now(skc,
+			    MAX(sc->nr_to_scan>>fls64(skc->skc_slab_objs), 1));
+			if (oldalloc > skc->skc_obj_alloc)
+				alloc += oldalloc - skc->skc_obj_alloc;
+#else
+			spl_kmem_cache_reap_now(skc,
+			    MAX(sc->nr_to_scan>>fls64(skc->skc_slab_objs), 1));
+			alloc += skc->skc_obj_alloc;
+#endif /* HAVE_SPLIT_SHRINKER_CALLBACK */
+		} else {
+			/* Request to query number of freeable objects */
+			alloc += skc->skc_obj_alloc;
+		}
+	}
+	up_read(&spl_kmem_cache_sem);
+
+	/*
+	 * When KMC_RECLAIM_ONCE is set allow only a single reclaim pass.
+	 * This functionality only exists to work around a rare issue where
+	 * shrink_slabs() is repeatedly invoked by many cores causing the
+	 * system to thrash.
+	 */
+	if ((spl_kmem_cache_reclaim & KMC_RECLAIM_ONCE) && sc->nr_to_scan)
+		return (SHRINK_STOP);
+
+	return (MAX(alloc, 0));
+}
+
+SPL_SHRINKER_CALLBACK_WRAPPER(spl_kmem_cache_generic_shrinker);
+
+/*
+ * Call the registered reclaim function for a cache. Depending on how
+ * many and which objects are released it may simply repopulate the
+ * local magazine which will then need to age-out. Objects which cannot
+ * fit in the magazine will be released back to their slabs, which will
+ * also need to age out before being released. This is all just best
+ * effort and we do not want to thrash creating and destroying slabs.
+ */
+void
+spl_kmem_cache_reap_now(spl_kmem_cache_t *skc, int count)
+{
+	ASSERT(skc->skc_magic == SKC_MAGIC);
+	ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags));
+
+	atomic_inc(&skc->skc_ref);
+
+	/*
+	 * Execute the registered reclaim callback if it exists.
+	 */
+	if (skc->skc_flags & KMC_SLAB) {
+		if (skc->skc_reclaim)
+			skc->skc_reclaim(skc->skc_private);
+		goto out;
+	}
+
+	/*
+	 * Prevent concurrent cache reaping when contended.
+	 */
+	if (test_and_set_bit(KMC_BIT_REAPING, &skc->skc_flags))
+		goto out;
+
+	/*
+	 * When a reclaim function is available it may be invoked repeatedly
+	 * until at least a single slab can be freed. This ensures that we
+	 * do free memory back to the system.
This helps minimize the chance + * of an OOM event when the bulk of memory is used by the slab. + * + * When free slabs are already available the reclaim callback will be + * skipped. Additionally, if no forward progress is detected despite + * a reclaim function the cache will be skipped to avoid deadlock. + * + * Longer term this would be the correct place to add the code which + * repacks the slabs in order minimize fragmentation. + */ + if (skc->skc_reclaim) { + uint64_t objects = UINT64_MAX; + int do_reclaim; + + do { + spin_lock(&skc->skc_lock); + do_reclaim = + (skc->skc_slab_total > 0) && + ((skc->skc_slab_total-skc->skc_slab_alloc) == 0) && + (skc->skc_obj_alloc < objects); + + objects = skc->skc_obj_alloc; + spin_unlock(&skc->skc_lock); + + if (do_reclaim) + skc->skc_reclaim(skc->skc_private); + + } while (do_reclaim); + } + + /* Reclaim from the magazine and free all now empty slabs. */ + if (spl_kmem_cache_expire & KMC_EXPIRE_MEM) { + spl_kmem_magazine_t *skm; + unsigned long irq_flags; + + local_irq_save(irq_flags); + skm = skc->skc_mag[smp_processor_id()]; + spl_cache_flush(skc, skm, skm->skm_avail); + local_irq_restore(irq_flags); + } + + spl_slab_reclaim(skc); + clear_bit_unlock(KMC_BIT_REAPING, &skc->skc_flags); + smp_mb__after_atomic(); + wake_up_bit(&skc->skc_flags, KMC_BIT_REAPING); +out: + atomic_dec(&skc->skc_ref); +} +EXPORT_SYMBOL(spl_kmem_cache_reap_now); + +/* + * Reap all free slabs from all registered caches. + */ +void +spl_kmem_reap(void) +{ + struct shrink_control sc; + + sc.nr_to_scan = KMC_REAP_CHUNK; + sc.gfp_mask = GFP_KERNEL; + + (void) __spl_kmem_cache_generic_shrinker(NULL, &sc); +} +EXPORT_SYMBOL(spl_kmem_reap); + +int +spl_kmem_cache_init(void) +{ + init_rwsem(&spl_kmem_cache_sem); + INIT_LIST_HEAD(&spl_kmem_cache_list); + spl_kmem_cache_taskq = taskq_create("spl_kmem_cache", + spl_kmem_cache_kmem_threads, maxclsyspri, + spl_kmem_cache_kmem_threads * 8, INT_MAX, + TASKQ_PREPOPULATE | TASKQ_DYNAMIC); + spl_register_shrinker(&spl_kmem_cache_shrinker); + + return (0); +} + +void +spl_kmem_cache_fini(void) +{ + spl_unregister_shrinker(&spl_kmem_cache_shrinker); + taskq_destroy(spl_kmem_cache_taskq); +} diff --git a/module/spl/spl-kmem.c b/module/spl/spl-kmem.c new file mode 100644 index 000000000..e0d551041 --- /dev/null +++ b/module/spl/spl-kmem.c @@ -0,0 +1,567 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf <[email protected]>. + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see <http://zfsonlinux.org/>. + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#include <sys/debug.h> +#include <sys/sysmacros.h> +#include <sys/kmem.h> +#include <sys/vmem.h> +#include <linux/mm.h> +#include <linux/ratelimit.h> + +/* + * As a general rule kmem_alloc() allocations should be small, preferably + * just a few pages since they must by physically contiguous. Therefore, a + * rate limited warning will be printed to the console for any kmem_alloc() + * which exceeds a reasonable threshold. + * + * The default warning threshold is set to sixteen pages but capped at 64K to + * accommodate systems using large pages. This value was selected to be small + * enough to ensure the largest allocations are quickly noticed and fixed. + * But large enough to avoid logging any warnings when a allocation size is + * larger than optimal but not a serious concern. Since this value is tunable, + * developers are encouraged to set it lower when testing so any new largish + * allocations are quickly caught. These warnings may be disabled by setting + * the threshold to zero. + */ +/* BEGIN CSTYLED */ +unsigned int spl_kmem_alloc_warn = MIN(16 * PAGE_SIZE, 64 * 1024); +module_param(spl_kmem_alloc_warn, uint, 0644); +MODULE_PARM_DESC(spl_kmem_alloc_warn, + "Warning threshold in bytes for a kmem_alloc()"); +EXPORT_SYMBOL(spl_kmem_alloc_warn); + +/* + * Large kmem_alloc() allocations will fail if they exceed KMALLOC_MAX_SIZE. + * Allocations which are marginally smaller than this limit may succeed but + * should still be avoided due to the expense of locating a contiguous range + * of free pages. Therefore, a maximum kmem size with reasonable safely + * margin of 4x is set. Kmem_alloc() allocations larger than this maximum + * will quickly fail. Vmem_alloc() allocations less than or equal to this + * value will use kmalloc(), but shift to vmalloc() when exceeding this value. + */ +unsigned int spl_kmem_alloc_max = (KMALLOC_MAX_SIZE >> 2); +module_param(spl_kmem_alloc_max, uint, 0644); +MODULE_PARM_DESC(spl_kmem_alloc_max, + "Maximum size in bytes for a kmem_alloc()"); +EXPORT_SYMBOL(spl_kmem_alloc_max); +/* END CSTYLED */ + +int +kmem_debugging(void) +{ + return (0); +} +EXPORT_SYMBOL(kmem_debugging); + +char * +kmem_vasprintf(const char *fmt, va_list ap) +{ + va_list aq; + char *ptr; + + do { + va_copy(aq, ap); + ptr = kvasprintf(kmem_flags_convert(KM_SLEEP), fmt, aq); + va_end(aq); + } while (ptr == NULL); + + return (ptr); +} +EXPORT_SYMBOL(kmem_vasprintf); + +char * +kmem_asprintf(const char *fmt, ...) +{ + va_list ap; + char *ptr; + + do { + va_start(ap, fmt); + ptr = kvasprintf(kmem_flags_convert(KM_SLEEP), fmt, ap); + va_end(ap); + } while (ptr == NULL); + + return (ptr); +} +EXPORT_SYMBOL(kmem_asprintf); + +static char * +__strdup(const char *str, int flags) +{ + char *ptr; + int n; + + n = strlen(str); + ptr = kmalloc(n + 1, kmem_flags_convert(flags)); + if (ptr) + memcpy(ptr, str, n + 1); + + return (ptr); +} + +char * +strdup(const char *str) +{ + return (__strdup(str, KM_SLEEP)); +} +EXPORT_SYMBOL(strdup); + +void +strfree(char *str) +{ + kfree(str); +} +EXPORT_SYMBOL(strfree); + +/* + * Limit the number of large allocation stack traces dumped to not more than + * 5 every 60 seconds to prevent denial-of-service attacks from debug code. + */ +DEFINE_RATELIMIT_STATE(kmem_alloc_ratelimit_state, 60 * HZ, 5); + +/* + * General purpose unified implementation of kmem_alloc(). It is an + * amalgamation of Linux and Illumos allocator design. It should never be + * exported to ensure that code using kmem_alloc()/kmem_zalloc() remains + * relatively portable. 
Consumers may only access this function through
+ * wrappers that enforce the common flags to ensure portability.
+ */
+inline void *
+spl_kmem_alloc_impl(size_t size, int flags, int node)
+{
+	gfp_t lflags = kmem_flags_convert(flags);
+	int use_vmem = 0;
+	void *ptr;
+
+	/*
+	 * Log abnormally large allocations and rate limit the console output.
+	 * Allocations larger than spl_kmem_alloc_warn should be performed
+	 * through the vmem_alloc()/vmem_zalloc() interfaces.
+	 */
+	if ((spl_kmem_alloc_warn > 0) && (size > spl_kmem_alloc_warn) &&
+	    !(flags & KM_VMEM) && __ratelimit(&kmem_alloc_ratelimit_state)) {
+		printk(KERN_WARNING
+		    "Large kmem_alloc(%lu, 0x%x), please file an issue at:\n"
+		    "https://github.com/zfsonlinux/zfs/issues/new\n",
+		    (unsigned long)size, flags);
+		dump_stack();
+	}
+
+	/*
+	 * Use a loop because kmalloc_node() can fail when GFP_KERNEL is used
+	 * unlike kmem_alloc() with KM_SLEEP on Illumos.
+	 */
+	do {
+		/*
+		 * Calling kmalloc_node() when the size >= spl_kmem_alloc_max
+		 * is unsafe. This must fail for all kmem_alloc() and
+		 * kmem_zalloc() callers.
+		 *
+		 * For vmem_alloc() and vmem_zalloc() callers it is permissible
+		 * to use __vmalloc(). However, in general use of __vmalloc()
+		 * is strongly discouraged because a global lock must be
+		 * acquired. Contention on this lock can significantly
+		 * impact performance so frequently manipulating the virtual
+		 * address space is strongly discouraged.
+		 */
+		if ((size > spl_kmem_alloc_max) || use_vmem) {
+			if (flags & KM_VMEM) {
+				ptr = __vmalloc(size, lflags, PAGE_KERNEL);
+			} else {
+				return (NULL);
+			}
+		} else {
+			ptr = kmalloc_node(size, lflags, node);
+		}
+
+		if (likely(ptr) || (flags & KM_NOSLEEP))
+			return (ptr);
+
+		/*
+		 * For vmem_alloc() and vmem_zalloc() callers retry immediately
+		 * using __vmalloc() which is unlikely to fail.
+		 */
+		if ((flags & KM_VMEM) && (use_vmem == 0)) {
+			use_vmem = 1;
+			continue;
+		}
+
+		if (unlikely(__ratelimit(&kmem_alloc_ratelimit_state))) {
+			printk(KERN_WARNING
+			    "Possible memory allocation deadlock: "
+			    "size=%lu lflags=0x%x\n",
+			    (unsigned long)size, lflags);
+			dump_stack();
+		}
+
+		/*
+		 * Use cond_resched() instead of congestion_wait() to avoid
+		 * deadlocking systems where there are no block devices.
+		 */
+		cond_resched();
+	} while (1);
+
+	return (NULL);
+}
+
+inline void
+spl_kmem_free_impl(const void *buf, size_t size)
+{
+	if (is_vmalloc_addr(buf))
+		vfree(buf);
+	else
+		kfree(buf);
+}
+
+/*
+ * Memory allocation and accounting for kmem_* style allocations. When
+ * DEBUG_KMEM is enabled the total memory allocated will be tracked and
+ * any memory leaked will be reported during module unload.
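+ * Enable this accounting at build time with: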
+ * + * ./configure --enable-debug-kmem + */ +#ifdef DEBUG_KMEM + +/* Shim layer memory accounting */ +#ifdef HAVE_ATOMIC64_T +atomic64_t kmem_alloc_used = ATOMIC64_INIT(0); +unsigned long long kmem_alloc_max = 0; +#else /* HAVE_ATOMIC64_T */ +atomic_t kmem_alloc_used = ATOMIC_INIT(0); +unsigned long long kmem_alloc_max = 0; +#endif /* HAVE_ATOMIC64_T */ + +EXPORT_SYMBOL(kmem_alloc_used); +EXPORT_SYMBOL(kmem_alloc_max); + +inline void * +spl_kmem_alloc_debug(size_t size, int flags, int node) +{ + void *ptr; + + ptr = spl_kmem_alloc_impl(size, flags, node); + if (ptr) { + kmem_alloc_used_add(size); + if (unlikely(kmem_alloc_used_read() > kmem_alloc_max)) + kmem_alloc_max = kmem_alloc_used_read(); + } + + return (ptr); +} + +inline void +spl_kmem_free_debug(const void *ptr, size_t size) +{ + kmem_alloc_used_sub(size); + spl_kmem_free_impl(ptr, size); +} + +/* + * When DEBUG_KMEM_TRACKING is enabled not only will total bytes be tracked + * but also the location of every alloc and free. When the SPL module is + * unloaded a list of all leaked addresses and where they were allocated + * will be dumped to the console. Enabling this feature has a significant + * impact on performance but it makes finding memory leaks straight forward. + * + * Not surprisingly with debugging enabled the xmem_locks are very highly + * contended particularly on xfree(). If we want to run with this detailed + * debugging enabled for anything other than debugging we need to minimize + * the contention by moving to a lock per xmem_table entry model. + * + * ./configure --enable-debug-kmem-tracking + */ +#ifdef DEBUG_KMEM_TRACKING + +#include <linux/hash.h> +#include <linux/ctype.h> + +#define KMEM_HASH_BITS 10 +#define KMEM_TABLE_SIZE (1 << KMEM_HASH_BITS) + +typedef struct kmem_debug { + struct hlist_node kd_hlist; /* Hash node linkage */ + struct list_head kd_list; /* List of all allocations */ + void *kd_addr; /* Allocation pointer */ + size_t kd_size; /* Allocation size */ + const char *kd_func; /* Allocation function */ + int kd_line; /* Allocation line */ +} kmem_debug_t; + +static spinlock_t kmem_lock; +static struct hlist_head kmem_table[KMEM_TABLE_SIZE]; +static struct list_head kmem_list; + +static kmem_debug_t * +kmem_del_init(spinlock_t *lock, struct hlist_head *table, + int bits, const void *addr) +{ + struct hlist_head *head; + struct hlist_node *node; + struct kmem_debug *p; + unsigned long flags; + + spin_lock_irqsave(lock, flags); + + head = &table[hash_ptr((void *)addr, bits)]; + hlist_for_each(node, head) { + p = list_entry(node, struct kmem_debug, kd_hlist); + if (p->kd_addr == addr) { + hlist_del_init(&p->kd_hlist); + list_del_init(&p->kd_list); + spin_unlock_irqrestore(lock, flags); + return (p); + } + } + + spin_unlock_irqrestore(lock, flags); + + return (NULL); +} + +inline void * +spl_kmem_alloc_track(size_t size, int flags, + const char *func, int line, int node) +{ + void *ptr = NULL; + kmem_debug_t *dptr; + unsigned long irq_flags; + + dptr = kmalloc(sizeof (kmem_debug_t), kmem_flags_convert(flags)); + if (dptr == NULL) + return (NULL); + + dptr->kd_func = __strdup(func, flags); + if (dptr->kd_func == NULL) { + kfree(dptr); + return (NULL); + } + + ptr = spl_kmem_alloc_debug(size, flags, node); + if (ptr == NULL) { + kfree(dptr->kd_func); + kfree(dptr); + return (NULL); + } + + INIT_HLIST_NODE(&dptr->kd_hlist); + INIT_LIST_HEAD(&dptr->kd_list); + + dptr->kd_addr = ptr; + dptr->kd_size = size; + dptr->kd_line = line; + + spin_lock_irqsave(&kmem_lock, irq_flags); + 
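+	/*
+	 * Publish the record while holding kmem_lock: the hash table gives
+	 * O(1) lookup by address in kmem_del_init(), and kmem_list keeps
+	 * every outstanding allocation for the leak report at unload.
+	 */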
hlist_add_head(&dptr->kd_hlist, + &kmem_table[hash_ptr(ptr, KMEM_HASH_BITS)]); + list_add_tail(&dptr->kd_list, &kmem_list); + spin_unlock_irqrestore(&kmem_lock, irq_flags); + + return (ptr); +} + +inline void +spl_kmem_free_track(const void *ptr, size_t size) +{ + kmem_debug_t *dptr; + + /* Ignore NULL pointer since we haven't tracked it at all */ + if (ptr == NULL) + return; + + /* Must exist in hash due to kmem_alloc() */ + dptr = kmem_del_init(&kmem_lock, kmem_table, KMEM_HASH_BITS, ptr); + ASSERT3P(dptr, !=, NULL); + ASSERT3S(dptr->kd_size, ==, size); + + kfree(dptr->kd_func); + kfree(dptr); + + spl_kmem_free_debug(ptr, size); +} +#endif /* DEBUG_KMEM_TRACKING */ +#endif /* DEBUG_KMEM */ + +/* + * Public kmem_alloc(), kmem_zalloc() and kmem_free() interfaces. + */ +void * +spl_kmem_alloc(size_t size, int flags, const char *func, int line) +{ + ASSERT0(flags & ~KM_PUBLIC_MASK); + +#if !defined(DEBUG_KMEM) + return (spl_kmem_alloc_impl(size, flags, NUMA_NO_NODE)); +#elif !defined(DEBUG_KMEM_TRACKING) + return (spl_kmem_alloc_debug(size, flags, NUMA_NO_NODE)); +#else + return (spl_kmem_alloc_track(size, flags, func, line, NUMA_NO_NODE)); +#endif +} +EXPORT_SYMBOL(spl_kmem_alloc); + +void * +spl_kmem_zalloc(size_t size, int flags, const char *func, int line) +{ + ASSERT0(flags & ~KM_PUBLIC_MASK); + + flags |= KM_ZERO; + +#if !defined(DEBUG_KMEM) + return (spl_kmem_alloc_impl(size, flags, NUMA_NO_NODE)); +#elif !defined(DEBUG_KMEM_TRACKING) + return (spl_kmem_alloc_debug(size, flags, NUMA_NO_NODE)); +#else + return (spl_kmem_alloc_track(size, flags, func, line, NUMA_NO_NODE)); +#endif +} +EXPORT_SYMBOL(spl_kmem_zalloc); + +void +spl_kmem_free(const void *buf, size_t size) +{ +#if !defined(DEBUG_KMEM) + return (spl_kmem_free_impl(buf, size)); +#elif !defined(DEBUG_KMEM_TRACKING) + return (spl_kmem_free_debug(buf, size)); +#else + return (spl_kmem_free_track(buf, size)); +#endif +} +EXPORT_SYMBOL(spl_kmem_free); + +#if defined(DEBUG_KMEM) && defined(DEBUG_KMEM_TRACKING) +static char * +spl_sprintf_addr(kmem_debug_t *kd, char *str, int len, int min) +{ + int size = ((len - 1) < kd->kd_size) ? (len - 1) : kd->kd_size; + int i, flag = 1; + + ASSERT(str != NULL && len >= 17); + memset(str, 0, len); + + /* + * Check for a fully printable string, and while we are at + * it place the printable characters in the passed buffer. + */ + for (i = 0; i < size; i++) { + str[i] = ((char *)(kd->kd_addr))[i]; + if (isprint(str[i])) { + continue; + } else { + /* + * Minimum number of printable characters found + * to make it worthwhile to print this as ascii. 
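+ * In practice a buffer is shown as text only when it is entirely
+ * printable or begins with more than 'min' printable characters;
+ * anything else is hex dumped (sampling every second byte of the
+ * first sixteen).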
+ */ + if (i > min) + break; + + flag = 0; + break; + } + } + + if (!flag) { + sprintf(str, "%02x%02x%02x%02x%02x%02x%02x%02x", + *((uint8_t *)kd->kd_addr), + *((uint8_t *)kd->kd_addr + 2), + *((uint8_t *)kd->kd_addr + 4), + *((uint8_t *)kd->kd_addr + 6), + *((uint8_t *)kd->kd_addr + 8), + *((uint8_t *)kd->kd_addr + 10), + *((uint8_t *)kd->kd_addr + 12), + *((uint8_t *)kd->kd_addr + 14)); + } + + return (str); +} + +static int +spl_kmem_init_tracking(struct list_head *list, spinlock_t *lock, int size) +{ + int i; + + spin_lock_init(lock); + INIT_LIST_HEAD(list); + + for (i = 0; i < size; i++) + INIT_HLIST_HEAD(&kmem_table[i]); + + return (0); +} + +static void +spl_kmem_fini_tracking(struct list_head *list, spinlock_t *lock) +{ + unsigned long flags; + kmem_debug_t *kd; + char str[17]; + + spin_lock_irqsave(lock, flags); + if (!list_empty(list)) + printk(KERN_WARNING "%-16s %-5s %-16s %s:%s\n", "address", + "size", "data", "func", "line"); + + list_for_each_entry(kd, list, kd_list) { + printk(KERN_WARNING "%p %-5d %-16s %s:%d\n", kd->kd_addr, + (int)kd->kd_size, spl_sprintf_addr(kd, str, 17, 8), + kd->kd_func, kd->kd_line); + } + + spin_unlock_irqrestore(lock, flags); +} +#endif /* DEBUG_KMEM && DEBUG_KMEM_TRACKING */ + +int +spl_kmem_init(void) +{ +#ifdef DEBUG_KMEM + kmem_alloc_used_set(0); + +#ifdef DEBUG_KMEM_TRACKING + spl_kmem_init_tracking(&kmem_list, &kmem_lock, KMEM_TABLE_SIZE); +#endif /* DEBUG_KMEM_TRACKING */ +#endif /* DEBUG_KMEM */ + + return (0); +} + +void +spl_kmem_fini(void) +{ +#ifdef DEBUG_KMEM + /* + * Display all unreclaimed memory addresses, including the + * allocation size and the first few bytes of what's located + * at that address to aid in debugging. Performance is not + * a serious concern here since it is module unload time. + */ + if (kmem_alloc_used_read() != 0) + printk(KERN_WARNING "kmem leaked %ld/%llu bytes\n", + (unsigned long)kmem_alloc_used_read(), kmem_alloc_max); + +#ifdef DEBUG_KMEM_TRACKING + spl_kmem_fini_tracking(&kmem_list, &kmem_lock); +#endif /* DEBUG_KMEM_TRACKING */ +#endif /* DEBUG_KMEM */ +} diff --git a/module/spl/spl-kobj.c b/module/spl/spl-kobj.c new file mode 100644 index 000000000..7019369bd --- /dev/null +++ b/module/spl/spl-kobj.c @@ -0,0 +1,86 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf <[email protected]>. + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see <http://zfsonlinux.org/>. + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see <http://www.gnu.org/licenses/>. + * + * Solaris Porting Layer (SPL) Kobj Implementation. 
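+ *
+ * These wrappers emulate the Solaris kobj file interface on top of the
+ * SPL vnode layer.  Note that kobj_open_file() returns (_buf_t *)-1
+ * rather than NULL on failure, matching the convention callers check.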
+ */ + +#include <sys/kobj.h> + +struct _buf * +kobj_open_file(const char *name) +{ + struct _buf *file; + vnode_t *vp; + int rc; + + file = kmalloc(sizeof (_buf_t), kmem_flags_convert(KM_SLEEP)); + if (file == NULL) + return ((_buf_t *)-1UL); + + if ((rc = vn_open(name, UIO_SYSSPACE, FREAD, 0644, &vp, 0, 0))) { + kfree(file); + return ((_buf_t *)-1UL); + } + + file->vp = vp; + + return (file); +} /* kobj_open_file() */ +EXPORT_SYMBOL(kobj_open_file); + +void +kobj_close_file(struct _buf *file) +{ + VOP_CLOSE(file->vp, 0, 0, 0, 0, 0); + kfree(file); +} /* kobj_close_file() */ +EXPORT_SYMBOL(kobj_close_file); + +int +kobj_read_file(struct _buf *file, char *buf, unsigned size, unsigned off) +{ + ssize_t resid; + + if (vn_rdwr(UIO_READ, file->vp, buf, size, (offset_t)off, + UIO_SYSSPACE, 0, 0, 0, &resid) != 0) + return (-1); + + return (size - resid); +} /* kobj_read_file() */ +EXPORT_SYMBOL(kobj_read_file); + +int +kobj_get_filesize(struct _buf *file, uint64_t *size) +{ + vattr_t vap; + int rc; + + rc = VOP_GETATTR(file->vp, &vap, 0, 0, NULL); + if (rc) + return (rc); + + *size = vap.va_size; + + return (rc); +} /* kobj_get_filesize() */ +EXPORT_SYMBOL(kobj_get_filesize); diff --git a/module/spl/spl-kstat.c b/module/spl/spl-kstat.c new file mode 100644 index 000000000..bcbff94a6 --- /dev/null +++ b/module/spl/spl-kstat.c @@ -0,0 +1,733 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf <[email protected]>. + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see <http://zfsonlinux.org/>. + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see <http://www.gnu.org/licenses/>. + * + * Solaris Porting Layer (SPL) Kstat Implementation. 
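+ *
+ * Each installed kstat is exposed as a seq_file backed entry under
+ * /proc/spl/kstat/<module>/<name>.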
+ */ + +#include <linux/seq_file.h> +#include <sys/kstat.h> +#include <sys/vmem.h> +#include <sys/cmn_err.h> +#include <sys/sysmacros.h> + +#ifndef HAVE_PDE_DATA +#define PDE_DATA(x) (PDE(x)->data) +#endif + +static kmutex_t kstat_module_lock; +static struct list_head kstat_module_list; +static kid_t kstat_id; + +static int +kstat_resize_raw(kstat_t *ksp) +{ + if (ksp->ks_raw_bufsize == KSTAT_RAW_MAX) + return (ENOMEM); + + vmem_free(ksp->ks_raw_buf, ksp->ks_raw_bufsize); + ksp->ks_raw_bufsize = MIN(ksp->ks_raw_bufsize * 2, KSTAT_RAW_MAX); + ksp->ks_raw_buf = vmem_alloc(ksp->ks_raw_bufsize, KM_SLEEP); + + return (0); +} + +void +kstat_waitq_enter(kstat_io_t *kiop) +{ + hrtime_t new, delta; + ulong_t wcnt; + + new = gethrtime(); + delta = new - kiop->wlastupdate; + kiop->wlastupdate = new; + wcnt = kiop->wcnt++; + if (wcnt != 0) { + kiop->wlentime += delta * wcnt; + kiop->wtime += delta; + } +} +EXPORT_SYMBOL(kstat_waitq_enter); + +void +kstat_waitq_exit(kstat_io_t *kiop) +{ + hrtime_t new, delta; + ulong_t wcnt; + + new = gethrtime(); + delta = new - kiop->wlastupdate; + kiop->wlastupdate = new; + wcnt = kiop->wcnt--; + ASSERT((int)wcnt > 0); + kiop->wlentime += delta * wcnt; + kiop->wtime += delta; +} +EXPORT_SYMBOL(kstat_waitq_exit); + +void +kstat_runq_enter(kstat_io_t *kiop) +{ + hrtime_t new, delta; + ulong_t rcnt; + + new = gethrtime(); + delta = new - kiop->rlastupdate; + kiop->rlastupdate = new; + rcnt = kiop->rcnt++; + if (rcnt != 0) { + kiop->rlentime += delta * rcnt; + kiop->rtime += delta; + } +} +EXPORT_SYMBOL(kstat_runq_enter); + +void +kstat_runq_exit(kstat_io_t *kiop) +{ + hrtime_t new, delta; + ulong_t rcnt; + + new = gethrtime(); + delta = new - kiop->rlastupdate; + kiop->rlastupdate = new; + rcnt = kiop->rcnt--; + ASSERT((int)rcnt > 0); + kiop->rlentime += delta * rcnt; + kiop->rtime += delta; +} +EXPORT_SYMBOL(kstat_runq_exit); + +static int +kstat_seq_show_headers(struct seq_file *f) +{ + kstat_t *ksp = (kstat_t *)f->private; + int rc = 0; + + ASSERT(ksp->ks_magic == KS_MAGIC); + + seq_printf(f, "%d %d 0x%02x %d %d %lld %lld\n", + ksp->ks_kid, ksp->ks_type, ksp->ks_flags, + ksp->ks_ndata, (int)ksp->ks_data_size, + ksp->ks_crtime, ksp->ks_snaptime); + + switch (ksp->ks_type) { + case KSTAT_TYPE_RAW: +restart: + if (ksp->ks_raw_ops.headers) { + rc = ksp->ks_raw_ops.headers( + ksp->ks_raw_buf, ksp->ks_raw_bufsize); + if (rc == ENOMEM && !kstat_resize_raw(ksp)) + goto restart; + if (!rc) + seq_puts(f, ksp->ks_raw_buf); + } else { + seq_printf(f, "raw data\n"); + } + break; + case KSTAT_TYPE_NAMED: + seq_printf(f, "%-31s %-4s %s\n", + "name", "type", "data"); + break; + case KSTAT_TYPE_INTR: + seq_printf(f, "%-8s %-8s %-8s %-8s %-8s\n", + "hard", "soft", "watchdog", + "spurious", "multsvc"); + break; + case KSTAT_TYPE_IO: + seq_printf(f, + "%-8s %-8s %-8s %-8s %-8s %-8s " + "%-8s %-8s %-8s %-8s %-8s %-8s\n", + "nread", "nwritten", "reads", "writes", + "wtime", "wlentime", "wupdate", + "rtime", "rlentime", "rupdate", + "wcnt", "rcnt"); + break; + case KSTAT_TYPE_TIMER: + seq_printf(f, + "%-31s %-8s " + "%-8s %-8s %-8s %-8s %-8s\n", + "name", "events", "elapsed", + "min", "max", "start", "stop"); + break; + default: + PANIC("Undefined kstat type %d\n", ksp->ks_type); + } + + return (-rc); +} + +static int +kstat_seq_show_raw(struct seq_file *f, unsigned char *p, int l) +{ + int i, j; + + for (i = 0; ; i++) { + seq_printf(f, "%03x:", i); + + for (j = 0; j < 16; j++) { + if (i * 16 + j >= l) { + seq_printf(f, "\n"); + goto out; + } + + seq_printf(f, " %02x", (unsigned char)p[i 
* 16 + j]); + } + seq_printf(f, "\n"); + } +out: + return (0); +} + +static int +kstat_seq_show_named(struct seq_file *f, kstat_named_t *knp) +{ + seq_printf(f, "%-31s %-4d ", knp->name, knp->data_type); + + switch (knp->data_type) { + case KSTAT_DATA_CHAR: + knp->value.c[15] = '\0'; /* NULL terminate */ + seq_printf(f, "%-16s", knp->value.c); + break; + /* + * NOTE - We need to be more careful able what tokens are + * used for each arch, for now this is correct for x86_64. + */ + case KSTAT_DATA_INT32: + seq_printf(f, "%d", knp->value.i32); + break; + case KSTAT_DATA_UINT32: + seq_printf(f, "%u", knp->value.ui32); + break; + case KSTAT_DATA_INT64: + seq_printf(f, "%lld", (signed long long)knp->value.i64); + break; + case KSTAT_DATA_UINT64: + seq_printf(f, "%llu", + (unsigned long long)knp->value.ui64); + break; + case KSTAT_DATA_LONG: + seq_printf(f, "%ld", knp->value.l); + break; + case KSTAT_DATA_ULONG: + seq_printf(f, "%lu", knp->value.ul); + break; + case KSTAT_DATA_STRING: + KSTAT_NAMED_STR_PTR(knp) + [KSTAT_NAMED_STR_BUFLEN(knp)-1] = '\0'; + seq_printf(f, "%s", KSTAT_NAMED_STR_PTR(knp)); + break; + default: + PANIC("Undefined kstat data type %d\n", knp->data_type); + } + + seq_printf(f, "\n"); + + return (0); +} + +static int +kstat_seq_show_intr(struct seq_file *f, kstat_intr_t *kip) +{ + seq_printf(f, "%-8u %-8u %-8u %-8u %-8u\n", + kip->intrs[KSTAT_INTR_HARD], + kip->intrs[KSTAT_INTR_SOFT], + kip->intrs[KSTAT_INTR_WATCHDOG], + kip->intrs[KSTAT_INTR_SPURIOUS], + kip->intrs[KSTAT_INTR_MULTSVC]); + + return (0); +} + +static int +kstat_seq_show_io(struct seq_file *f, kstat_io_t *kip) +{ + seq_printf(f, + "%-8llu %-8llu %-8u %-8u %-8lld %-8lld " + "%-8lld %-8lld %-8lld %-8lld %-8u %-8u\n", + kip->nread, kip->nwritten, + kip->reads, kip->writes, + kip->wtime, kip->wlentime, kip->wlastupdate, + kip->rtime, kip->rlentime, kip->rlastupdate, + kip->wcnt, kip->rcnt); + + return (0); +} + +static int +kstat_seq_show_timer(struct seq_file *f, kstat_timer_t *ktp) +{ + seq_printf(f, + "%-31s %-8llu %-8lld %-8lld %-8lld %-8lld %-8lld\n", + ktp->name, ktp->num_events, ktp->elapsed_time, + ktp->min_time, ktp->max_time, + ktp->start_time, ktp->stop_time); + + return (0); +} + +static int +kstat_seq_show(struct seq_file *f, void *p) +{ + kstat_t *ksp = (kstat_t *)f->private; + int rc = 0; + + ASSERT(ksp->ks_magic == KS_MAGIC); + + switch (ksp->ks_type) { + case KSTAT_TYPE_RAW: +restart: + if (ksp->ks_raw_ops.data) { + rc = ksp->ks_raw_ops.data( + ksp->ks_raw_buf, ksp->ks_raw_bufsize, p); + if (rc == ENOMEM && !kstat_resize_raw(ksp)) + goto restart; + if (!rc) + seq_puts(f, ksp->ks_raw_buf); + } else { + ASSERT(ksp->ks_ndata == 1); + rc = kstat_seq_show_raw(f, ksp->ks_data, + ksp->ks_data_size); + } + break; + case KSTAT_TYPE_NAMED: + rc = kstat_seq_show_named(f, (kstat_named_t *)p); + break; + case KSTAT_TYPE_INTR: + rc = kstat_seq_show_intr(f, (kstat_intr_t *)p); + break; + case KSTAT_TYPE_IO: + rc = kstat_seq_show_io(f, (kstat_io_t *)p); + break; + case KSTAT_TYPE_TIMER: + rc = kstat_seq_show_timer(f, (kstat_timer_t *)p); + break; + default: + PANIC("Undefined kstat type %d\n", ksp->ks_type); + } + + return (-rc); +} + +static int +kstat_default_update(kstat_t *ksp, int rw) +{ + ASSERT(ksp != NULL); + + if (rw == KSTAT_WRITE) + return (EACCES); + + return (0); +} + +static void * +kstat_seq_data_addr(kstat_t *ksp, loff_t n) +{ + void *rc = NULL; + + switch (ksp->ks_type) { + case KSTAT_TYPE_RAW: + if (ksp->ks_raw_ops.addr) + rc = ksp->ks_raw_ops.addr(ksp, n); + else + rc = ksp->ks_data; + break; 
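+	/*
+	 * The remaining types are fixed-size records stored as an array
+	 * in ks_data, so entry n sits at a constant offset from the base.
+	 */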
+ case KSTAT_TYPE_NAMED: + rc = ksp->ks_data + n * sizeof (kstat_named_t); + break; + case KSTAT_TYPE_INTR: + rc = ksp->ks_data + n * sizeof (kstat_intr_t); + break; + case KSTAT_TYPE_IO: + rc = ksp->ks_data + n * sizeof (kstat_io_t); + break; + case KSTAT_TYPE_TIMER: + rc = ksp->ks_data + n * sizeof (kstat_timer_t); + break; + default: + PANIC("Undefined kstat type %d\n", ksp->ks_type); + } + + return (rc); +} + +static void * +kstat_seq_start(struct seq_file *f, loff_t *pos) +{ + loff_t n = *pos; + kstat_t *ksp = (kstat_t *)f->private; + ASSERT(ksp->ks_magic == KS_MAGIC); + + mutex_enter(ksp->ks_lock); + + if (ksp->ks_type == KSTAT_TYPE_RAW) { + ksp->ks_raw_bufsize = PAGE_SIZE; + ksp->ks_raw_buf = vmem_alloc(ksp->ks_raw_bufsize, KM_SLEEP); + } + + /* Dynamically update kstat, on error existing kstats are used */ + (void) ksp->ks_update(ksp, KSTAT_READ); + + ksp->ks_snaptime = gethrtime(); + + if (!n && kstat_seq_show_headers(f)) + return (NULL); + + if (n >= ksp->ks_ndata) + return (NULL); + + return (kstat_seq_data_addr(ksp, n)); +} + +static void * +kstat_seq_next(struct seq_file *f, void *p, loff_t *pos) +{ + kstat_t *ksp = (kstat_t *)f->private; + ASSERT(ksp->ks_magic == KS_MAGIC); + + ++*pos; + if (*pos >= ksp->ks_ndata) + return (NULL); + + return (kstat_seq_data_addr(ksp, *pos)); +} + +static void +kstat_seq_stop(struct seq_file *f, void *v) +{ + kstat_t *ksp = (kstat_t *)f->private; + ASSERT(ksp->ks_magic == KS_MAGIC); + + if (ksp->ks_type == KSTAT_TYPE_RAW) + vmem_free(ksp->ks_raw_buf, ksp->ks_raw_bufsize); + + mutex_exit(ksp->ks_lock); +} + +static struct seq_operations kstat_seq_ops = { + .show = kstat_seq_show, + .start = kstat_seq_start, + .next = kstat_seq_next, + .stop = kstat_seq_stop, +}; + +static kstat_module_t * +kstat_find_module(char *name) +{ + kstat_module_t *module; + + list_for_each_entry(module, &kstat_module_list, ksm_module_list) { + if (strncmp(name, module->ksm_name, KSTAT_STRLEN) == 0) + return (module); + } + + return (NULL); +} + +static kstat_module_t * +kstat_create_module(char *name) +{ + kstat_module_t *module; + struct proc_dir_entry *pde; + + pde = proc_mkdir(name, proc_spl_kstat); + if (pde == NULL) + return (NULL); + + module = kmem_alloc(sizeof (kstat_module_t), KM_SLEEP); + module->ksm_proc = pde; + strlcpy(module->ksm_name, name, KSTAT_STRLEN+1); + INIT_LIST_HEAD(&module->ksm_kstat_list); + list_add_tail(&module->ksm_module_list, &kstat_module_list); + + return (module); + +} + +static void +kstat_delete_module(kstat_module_t *module) +{ + ASSERT(list_empty(&module->ksm_kstat_list)); + remove_proc_entry(module->ksm_name, proc_spl_kstat); + list_del(&module->ksm_module_list); + kmem_free(module, sizeof (kstat_module_t)); +} + +static int +proc_kstat_open(struct inode *inode, struct file *filp) +{ + struct seq_file *f; + int rc; + + rc = seq_open(filp, &kstat_seq_ops); + if (rc) + return (rc); + + f = filp->private_data; + f->private = PDE_DATA(inode); + + return (rc); +} + +static ssize_t +proc_kstat_write(struct file *filp, const char __user *buf, size_t len, + loff_t *ppos) +{ + struct seq_file *f = filp->private_data; + kstat_t *ksp = f->private; + int rc; + + ASSERT(ksp->ks_magic == KS_MAGIC); + + mutex_enter(ksp->ks_lock); + rc = ksp->ks_update(ksp, KSTAT_WRITE); + mutex_exit(ksp->ks_lock); + + if (rc) + return (-rc); + + *ppos += len; + return (len); +} + +static struct file_operations proc_kstat_operations = { + .open = proc_kstat_open, + .write = proc_kstat_write, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; 
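+
+/*
+ * Example (an illustrative sketch, not part of the SPL API itself): a
+ * typical consumer creates a named kstat, initializes its entries,
+ * installs it, and deletes it on teardown.  The "example"/"stats" names
+ * and the "ops" statistic below are hypothetical.
+ *
+ *	kstat_t *ksp = __kstat_create("example", 0, "stats", "misc",
+ *	    KSTAT_TYPE_NAMED, 1, 0);
+ *	if (ksp != NULL) {
+ *		kstat_named_t *kn = ksp->ks_data;
+ *		strncpy(kn->name, "ops", KSTAT_STRLEN);
+ *		kn->data_type = KSTAT_DATA_UINT64;
+ *		kn->value.ui64 = 0;
+ *		__kstat_install(ksp);
+ *	}
+ *	...
+ *	__kstat_delete(ksp);
+ */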
+ +void +__kstat_set_raw_ops(kstat_t *ksp, + int (*headers)(char *buf, size_t size), + int (*data)(char *buf, size_t size, void *data), + void *(*addr)(kstat_t *ksp, loff_t index)) +{ + ksp->ks_raw_ops.headers = headers; + ksp->ks_raw_ops.data = data; + ksp->ks_raw_ops.addr = addr; +} +EXPORT_SYMBOL(__kstat_set_raw_ops); + +kstat_t * +__kstat_create(const char *ks_module, int ks_instance, const char *ks_name, + const char *ks_class, uchar_t ks_type, uint_t ks_ndata, + uchar_t ks_flags) +{ + kstat_t *ksp; + + ASSERT(ks_module); + ASSERT(ks_instance == 0); + ASSERT(ks_name); + ASSERT(!(ks_flags & KSTAT_FLAG_UNSUPPORTED)); + + if ((ks_type == KSTAT_TYPE_INTR) || (ks_type == KSTAT_TYPE_IO)) + ASSERT(ks_ndata == 1); + + ksp = kmem_zalloc(sizeof (*ksp), KM_SLEEP); + if (ksp == NULL) + return (ksp); + + mutex_enter(&kstat_module_lock); + ksp->ks_kid = kstat_id; + kstat_id++; + mutex_exit(&kstat_module_lock); + + ksp->ks_magic = KS_MAGIC; + mutex_init(&ksp->ks_private_lock, NULL, MUTEX_DEFAULT, NULL); + ksp->ks_lock = &ksp->ks_private_lock; + INIT_LIST_HEAD(&ksp->ks_list); + + ksp->ks_crtime = gethrtime(); + ksp->ks_snaptime = ksp->ks_crtime; + strncpy(ksp->ks_module, ks_module, KSTAT_STRLEN); + ksp->ks_instance = ks_instance; + strncpy(ksp->ks_name, ks_name, KSTAT_STRLEN); + strncpy(ksp->ks_class, ks_class, KSTAT_STRLEN); + ksp->ks_type = ks_type; + ksp->ks_flags = ks_flags; + ksp->ks_update = kstat_default_update; + ksp->ks_private = NULL; + ksp->ks_raw_ops.headers = NULL; + ksp->ks_raw_ops.data = NULL; + ksp->ks_raw_ops.addr = NULL; + ksp->ks_raw_buf = NULL; + ksp->ks_raw_bufsize = 0; + + switch (ksp->ks_type) { + case KSTAT_TYPE_RAW: + ksp->ks_ndata = 1; + ksp->ks_data_size = ks_ndata; + break; + case KSTAT_TYPE_NAMED: + ksp->ks_ndata = ks_ndata; + ksp->ks_data_size = ks_ndata * sizeof (kstat_named_t); + break; + case KSTAT_TYPE_INTR: + ksp->ks_ndata = ks_ndata; + ksp->ks_data_size = ks_ndata * sizeof (kstat_intr_t); + break; + case KSTAT_TYPE_IO: + ksp->ks_ndata = ks_ndata; + ksp->ks_data_size = ks_ndata * sizeof (kstat_io_t); + break; + case KSTAT_TYPE_TIMER: + ksp->ks_ndata = ks_ndata; + ksp->ks_data_size = ks_ndata * sizeof (kstat_timer_t); + break; + default: + PANIC("Undefined kstat type %d\n", ksp->ks_type); + } + + if (ksp->ks_flags & KSTAT_FLAG_VIRTUAL) { + ksp->ks_data = NULL; + } else { + ksp->ks_data = kmem_zalloc(ksp->ks_data_size, KM_SLEEP); + if (ksp->ks_data == NULL) { + kmem_free(ksp, sizeof (*ksp)); + ksp = NULL; + } + } + + return (ksp); +} +EXPORT_SYMBOL(__kstat_create); + +static int +kstat_detect_collision(kstat_t *ksp) +{ + kstat_module_t *module; + kstat_t *tmp; + char *parent; + char *cp; + + parent = kmem_asprintf("%s", ksp->ks_module); + + if ((cp = strrchr(parent, '/')) == NULL) { + strfree(parent); + return (0); + } + + cp[0] = '\0'; + if ((module = kstat_find_module(parent)) != NULL) { + list_for_each_entry(tmp, &module->ksm_kstat_list, ks_list) { + if (strncmp(tmp->ks_name, cp+1, KSTAT_STRLEN) == 0) { + strfree(parent); + return (EEXIST); + } + } + } + + strfree(parent); + return (0); +} + +void +__kstat_install(kstat_t *ksp) +{ + kstat_module_t *module; + kstat_t *tmp; + + ASSERT(ksp); + + mutex_enter(&kstat_module_lock); + + module = kstat_find_module(ksp->ks_module); + if (module == NULL) { + if (kstat_detect_collision(ksp) != 0) { + cmn_err(CE_WARN, "kstat_create('%s', '%s'): namespace" \ + " collision", ksp->ks_module, ksp->ks_name); + goto out; + } + module = kstat_create_module(ksp->ks_module); + if (module == NULL) + goto out; + } + + /* + * Only one 
entry by this name per-module, on failure the module + * shouldn't be deleted because we know it has at least one entry. + */ + list_for_each_entry(tmp, &module->ksm_kstat_list, ks_list) { + if (strncmp(tmp->ks_name, ksp->ks_name, KSTAT_STRLEN) == 0) + goto out; + } + + list_add_tail(&ksp->ks_list, &module->ksm_kstat_list); + + mutex_enter(ksp->ks_lock); + ksp->ks_owner = module; + ksp->ks_proc = proc_create_data(ksp->ks_name, 0644, + module->ksm_proc, &proc_kstat_operations, (void *)ksp); + if (ksp->ks_proc == NULL) { + list_del_init(&ksp->ks_list); + if (list_empty(&module->ksm_kstat_list)) + kstat_delete_module(module); + } + mutex_exit(ksp->ks_lock); +out: + mutex_exit(&kstat_module_lock); +} +EXPORT_SYMBOL(__kstat_install); + +void +__kstat_delete(kstat_t *ksp) +{ + kstat_module_t *module = ksp->ks_owner; + + mutex_enter(&kstat_module_lock); + list_del_init(&ksp->ks_list); + mutex_exit(&kstat_module_lock); + + if (ksp->ks_proc) { + remove_proc_entry(ksp->ks_name, module->ksm_proc); + + /* Remove top level module directory if it's empty */ + if (list_empty(&module->ksm_kstat_list)) + kstat_delete_module(module); + } + + if (!(ksp->ks_flags & KSTAT_FLAG_VIRTUAL)) + kmem_free(ksp->ks_data, ksp->ks_data_size); + + ksp->ks_lock = NULL; + mutex_destroy(&ksp->ks_private_lock); + kmem_free(ksp, sizeof (*ksp)); +} +EXPORT_SYMBOL(__kstat_delete); + +int +spl_kstat_init(void) +{ + mutex_init(&kstat_module_lock, NULL, MUTEX_DEFAULT, NULL); + INIT_LIST_HEAD(&kstat_module_list); + kstat_id = 0; + return (0); +} + +void +spl_kstat_fini(void) +{ + ASSERT(list_empty(&kstat_module_list)); + mutex_destroy(&kstat_module_lock); +} diff --git a/module/spl/spl-mutex.c b/module/spl/spl-mutex.c new file mode 100644 index 000000000..ba818862b --- /dev/null +++ b/module/spl/spl-mutex.c @@ -0,0 +1,30 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf <[email protected]>. + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see <http://zfsonlinux.org/>. + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see <http://www.gnu.org/licenses/>. + * + * Solaris Porting Layer (SPL) Mutex Implementation. + */ + +#include <sys/mutex.h> + +int spl_mutex_init(void) { return 0; } +void spl_mutex_fini(void) { } diff --git a/module/spl/spl-proc.c b/module/spl/spl-proc.c new file mode 100644 index 000000000..9c52924a4 --- /dev/null +++ b/module/spl/spl-proc.c @@ -0,0 +1,782 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf <[email protected]>. + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. 
+ * For details, see <http://zfsonlinux.org/>. + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see <http://www.gnu.org/licenses/>. + * + * Solaris Porting Layer (SPL) Proc Implementation. + */ + +#include <sys/systeminfo.h> +#include <sys/kstat.h> +#include <sys/kmem.h> +#include <sys/kmem_cache.h> +#include <sys/vmem.h> +#include <sys/taskq.h> +#include <sys/proc.h> +#include <linux/ctype.h> +#include <linux/kmod.h> +#include <linux/seq_file.h> +#include <linux/uaccess.h> +#include <linux/version.h> + +#if defined(CONSTIFY_PLUGIN) && LINUX_VERSION_CODE >= KERNEL_VERSION(3, 8, 0) +typedef struct ctl_table __no_const spl_ctl_table; +#else +typedef struct ctl_table spl_ctl_table; +#endif + +static unsigned long table_min = 0; +static unsigned long table_max = ~0; + +static struct ctl_table_header *spl_header = NULL; +static struct proc_dir_entry *proc_spl = NULL; +static struct proc_dir_entry *proc_spl_kmem = NULL; +static struct proc_dir_entry *proc_spl_kmem_slab = NULL; +static struct proc_dir_entry *proc_spl_taskq_all = NULL; +static struct proc_dir_entry *proc_spl_taskq = NULL; +struct proc_dir_entry *proc_spl_kstat = NULL; + +static int +proc_copyin_string(char *kbuffer, int kbuffer_size, const char *ubuffer, + int ubuffer_size) +{ + int size; + + if (ubuffer_size > kbuffer_size) + return (-EOVERFLOW); + + if (copy_from_user((void *)kbuffer, (void *)ubuffer, ubuffer_size)) + return (-EFAULT); + + /* strip trailing whitespace */ + size = strnlen(kbuffer, ubuffer_size); + while (size-- >= 0) + if (!isspace(kbuffer[size])) + break; + + /* empty string */ + if (size < 0) + return (-EINVAL); + + /* no space to terminate */ + if (size == kbuffer_size) + return (-EOVERFLOW); + + kbuffer[size + 1] = 0; + return (0); +} + +static int +proc_copyout_string(char *ubuffer, int ubuffer_size, const char *kbuffer, + char *append) +{ + /* + * NB if 'append' != NULL, it's a single character to append to the + * copied out string - usually "\n", for /proc entries and + * (i.e. 
a terminating zero byte) for sysctl entries + */ + int size = MIN(strlen(kbuffer), ubuffer_size); + + if (copy_to_user(ubuffer, kbuffer, size)) + return (-EFAULT); + + if (append != NULL && size < ubuffer_size) { + if (copy_to_user(ubuffer + size, append, 1)) + return (-EFAULT); + + size++; + } + + return (size); +} + +#ifdef DEBUG_KMEM +static int +proc_domemused(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int rc = 0; + unsigned long min = 0, max = ~0, val; + spl_ctl_table dummy = *table; + + dummy.data = &val; + dummy.proc_handler = &proc_dointvec; + dummy.extra1 = &min; + dummy.extra2 = &max; + + if (write) { + *ppos += *lenp; + } else { +#ifdef HAVE_ATOMIC64_T + val = atomic64_read((atomic64_t *)table->data); +#else + val = atomic_read((atomic_t *)table->data); +#endif /* HAVE_ATOMIC64_T */ + rc = proc_doulongvec_minmax(&dummy, write, buffer, lenp, ppos); + } + + return (rc); +} +#endif /* DEBUG_KMEM */ + +static int +proc_doslab(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int rc = 0; + unsigned long min = 0, max = ~0, val = 0, mask; + spl_ctl_table dummy = *table; + spl_kmem_cache_t *skc; + + dummy.data = &val; + dummy.proc_handler = &proc_dointvec; + dummy.extra1 = &min; + dummy.extra2 = &max; + + if (write) { + *ppos += *lenp; + } else { + down_read(&spl_kmem_cache_sem); + mask = (unsigned long)table->data; + + list_for_each_entry(skc, &spl_kmem_cache_list, skc_list) { + + /* Only use slabs of the correct kmem/vmem type */ + if (!(skc->skc_flags & mask)) + continue; + + /* Sum the specified field for selected slabs */ + switch (mask & (KMC_TOTAL | KMC_ALLOC | KMC_MAX)) { + case KMC_TOTAL: + val += skc->skc_slab_size * skc->skc_slab_total; + break; + case KMC_ALLOC: + val += skc->skc_obj_size * skc->skc_obj_alloc; + break; + case KMC_MAX: + val += skc->skc_obj_size * skc->skc_obj_max; + break; + } + } + + up_read(&spl_kmem_cache_sem); + rc = proc_doulongvec_minmax(&dummy, write, buffer, lenp, ppos); + } + + return (rc); +} + +static int +proc_dohostid(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int len, rc = 0; + char *end, str[32]; + + if (write) { + /* + * We can't use proc_doulongvec_minmax() in the write + * case here because hostid while a hex value has no + * leading 0x which confuses the helper function. 
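+ * Instead the value is copied in as a string and parsed with
+ * simple_strtoul() in base 16.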
+ */ + rc = proc_copyin_string(str, sizeof (str), buffer, *lenp); + if (rc < 0) + return (rc); + + spl_hostid = simple_strtoul(str, &end, 16); + if (str == end) + return (-EINVAL); + + } else { + len = snprintf(str, sizeof (str), "%lx", + (unsigned long) zone_get_hostid(NULL)); + if (*ppos >= len) + rc = 0; + else + rc = proc_copyout_string(buffer, + *lenp, str + *ppos, "\n"); + + if (rc >= 0) { + *lenp = rc; + *ppos += rc; + } + } + + return (rc); +} + +static void +taskq_seq_show_headers(struct seq_file *f) +{ + seq_printf(f, "%-25s %5s %5s %5s %5s %5s %5s %12s %5s %10s\n", + "taskq", "act", "nthr", "spwn", "maxt", "pri", + "mina", "maxa", "cura", "flags"); +} + +/* indices into the lheads array below */ +#define LHEAD_PEND 0 +#define LHEAD_PRIO 1 +#define LHEAD_DELAY 2 +#define LHEAD_WAIT 3 +#define LHEAD_ACTIVE 4 +#define LHEAD_SIZE 5 + +/* BEGIN CSTYLED */ +static unsigned int spl_max_show_tasks = 512; +module_param(spl_max_show_tasks, uint, 0644); +MODULE_PARM_DESC(spl_max_show_tasks, "Max number of tasks shown in taskq proc"); +/* END CSTYLED */ + +static int +taskq_seq_show_impl(struct seq_file *f, void *p, boolean_t allflag) +{ + taskq_t *tq = p; + taskq_thread_t *tqt; + spl_wait_queue_entry_t *wq; + struct task_struct *tsk; + taskq_ent_t *tqe; + char name[100]; + struct list_head *lheads[LHEAD_SIZE], *lh; + static char *list_names[LHEAD_SIZE] = + {"pend", "prio", "delay", "wait", "active" }; + int i, j, have_lheads = 0; + unsigned long wflags, flags; + + spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class); + spin_lock_irqsave(&tq->tq_wait_waitq.lock, wflags); + + /* get the various lists and check whether they're empty */ + lheads[LHEAD_PEND] = &tq->tq_pend_list; + lheads[LHEAD_PRIO] = &tq->tq_prio_list; + lheads[LHEAD_DELAY] = &tq->tq_delay_list; +#ifdef HAVE_WAIT_QUEUE_HEAD_ENTRY + lheads[LHEAD_WAIT] = &tq->tq_wait_waitq.head; +#else + lheads[LHEAD_WAIT] = &tq->tq_wait_waitq.task_list; +#endif + lheads[LHEAD_ACTIVE] = &tq->tq_active_list; + + for (i = 0; i < LHEAD_SIZE; ++i) { + if (list_empty(lheads[i])) + lheads[i] = NULL; + else + ++have_lheads; + } + + /* early return in non-"all" mode if lists are all empty */ + if (!allflag && !have_lheads) { + spin_unlock_irqrestore(&tq->tq_wait_waitq.lock, wflags); + spin_unlock_irqrestore(&tq->tq_lock, flags); + return (0); + } + + /* unlock the waitq quickly */ + if (!lheads[LHEAD_WAIT]) + spin_unlock_irqrestore(&tq->tq_wait_waitq.lock, wflags); + + /* show the base taskq contents */ + snprintf(name, sizeof (name), "%s/%d", tq->tq_name, tq->tq_instance); + seq_printf(f, "%-25s ", name); + seq_printf(f, "%5d %5d %5d %5d %5d %5d %12d %5d %10x\n", + tq->tq_nactive, tq->tq_nthreads, tq->tq_nspawn, + tq->tq_maxthreads, tq->tq_pri, tq->tq_minalloc, tq->tq_maxalloc, + tq->tq_nalloc, tq->tq_flags); + + /* show the active list */ + if (lheads[LHEAD_ACTIVE]) { + j = 0; + list_for_each_entry(tqt, &tq->tq_active_list, tqt_active_list) { + if (j == 0) + seq_printf(f, "\t%s:", + list_names[LHEAD_ACTIVE]); + else if (j == 2) { + seq_printf(f, "\n\t "); + j = 0; + } + seq_printf(f, " [%d]%pf(%ps)", + tqt->tqt_thread->pid, + tqt->tqt_task->tqent_func, + tqt->tqt_task->tqent_arg); + ++j; + } + seq_printf(f, "\n"); + } + + for (i = LHEAD_PEND; i <= LHEAD_WAIT; ++i) + if (lheads[i]) { + j = 0; + list_for_each(lh, lheads[i]) { + if (spl_max_show_tasks != 0 && + j >= spl_max_show_tasks) { + seq_printf(f, "\n\t(truncated)"); + break; + } + /* show the wait waitq list */ + if (i == LHEAD_WAIT) { +#ifdef HAVE_WAIT_QUEUE_HEAD_ENTRY + wq = 
list_entry(lh, + spl_wait_queue_entry_t, entry); +#else + wq = list_entry(lh, + spl_wait_queue_entry_t, task_list); +#endif + if (j == 0) + seq_printf(f, "\t%s:", + list_names[i]); + else if (j % 8 == 0) + seq_printf(f, "\n\t "); + + tsk = wq->private; + seq_printf(f, " %d", tsk->pid); + /* pend, prio and delay lists */ + } else { + tqe = list_entry(lh, taskq_ent_t, + tqent_list); + if (j == 0) + seq_printf(f, "\t%s:", + list_names[i]); + else if (j % 2 == 0) + seq_printf(f, "\n\t "); + + seq_printf(f, " %pf(%ps)", + tqe->tqent_func, + tqe->tqent_arg); + } + ++j; + } + seq_printf(f, "\n"); + } + if (lheads[LHEAD_WAIT]) + spin_unlock_irqrestore(&tq->tq_wait_waitq.lock, wflags); + spin_unlock_irqrestore(&tq->tq_lock, flags); + + return (0); +} + +static int +taskq_all_seq_show(struct seq_file *f, void *p) +{ + return (taskq_seq_show_impl(f, p, B_TRUE)); +} + +static int +taskq_seq_show(struct seq_file *f, void *p) +{ + return (taskq_seq_show_impl(f, p, B_FALSE)); +} + +static void * +taskq_seq_start(struct seq_file *f, loff_t *pos) +{ + struct list_head *p; + loff_t n = *pos; + + down_read(&tq_list_sem); + if (!n) + taskq_seq_show_headers(f); + + p = tq_list.next; + while (n--) { + p = p->next; + if (p == &tq_list) + return (NULL); + } + + return (list_entry(p, taskq_t, tq_taskqs)); +} + +static void * +taskq_seq_next(struct seq_file *f, void *p, loff_t *pos) +{ + taskq_t *tq = p; + + ++*pos; + return ((tq->tq_taskqs.next == &tq_list) ? + NULL : list_entry(tq->tq_taskqs.next, taskq_t, tq_taskqs)); +} + +static void +slab_seq_show_headers(struct seq_file *f) +{ + seq_printf(f, + "--------------------- cache ----------" + "--------------------------------------------- " + "----- slab ------ " + "---- object ----- " + "--- emergency ---\n"); + seq_printf(f, + "name " + " flags size alloc slabsize objsize " + "total alloc max " + "total alloc max " + "dlock alloc max\n"); +} + +static int +slab_seq_show(struct seq_file *f, void *p) +{ + spl_kmem_cache_t *skc = p; + + ASSERT(skc->skc_magic == SKC_MAGIC); + + /* + * Backed by Linux slab see /proc/slabinfo. + */ + if (skc->skc_flags & KMC_SLAB) + return (0); + + spin_lock(&skc->skc_lock); + seq_printf(f, "%-36s ", skc->skc_name); + seq_printf(f, "0x%05lx %9lu %9lu %8u %8u " + "%5lu %5lu %5lu %5lu %5lu %5lu %5lu %5lu %5lu\n", + (long unsigned)skc->skc_flags, + (long unsigned)(skc->skc_slab_size * skc->skc_slab_total), + (long unsigned)(skc->skc_obj_size * skc->skc_obj_alloc), + (unsigned)skc->skc_slab_size, + (unsigned)skc->skc_obj_size, + (long unsigned)skc->skc_slab_total, + (long unsigned)skc->skc_slab_alloc, + (long unsigned)skc->skc_slab_max, + (long unsigned)skc->skc_obj_total, + (long unsigned)skc->skc_obj_alloc, + (long unsigned)skc->skc_obj_max, + (long unsigned)skc->skc_obj_deadlock, + (long unsigned)skc->skc_obj_emergency, + (long unsigned)skc->skc_obj_emergency_max); + + spin_unlock(&skc->skc_lock); + + return (0); +} + +static void * +slab_seq_start(struct seq_file *f, loff_t *pos) +{ + struct list_head *p; + loff_t n = *pos; + + down_read(&spl_kmem_cache_sem); + if (!n) + slab_seq_show_headers(f); + + p = spl_kmem_cache_list.next; + while (n--) { + p = p->next; + if (p == &spl_kmem_cache_list) + return (NULL); + } + + return (list_entry(p, spl_kmem_cache_t, skc_list)); +} + +static void * +slab_seq_next(struct seq_file *f, void *p, loff_t *pos) +{ + spl_kmem_cache_t *skc = p; + + ++*pos; + return ((skc->skc_list.next == &spl_kmem_cache_list) ? 
+ NULL : list_entry(skc->skc_list.next, spl_kmem_cache_t, skc_list)); +} + +static void +slab_seq_stop(struct seq_file *f, void *v) +{ + up_read(&spl_kmem_cache_sem); +} + +static struct seq_operations slab_seq_ops = { + .show = slab_seq_show, + .start = slab_seq_start, + .next = slab_seq_next, + .stop = slab_seq_stop, +}; + +static int +proc_slab_open(struct inode *inode, struct file *filp) +{ + return (seq_open(filp, &slab_seq_ops)); +} + +static struct file_operations proc_slab_operations = { + .open = proc_slab_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static void +taskq_seq_stop(struct seq_file *f, void *v) +{ + up_read(&tq_list_sem); +} + +static struct seq_operations taskq_all_seq_ops = { + .show = taskq_all_seq_show, + .start = taskq_seq_start, + .next = taskq_seq_next, + .stop = taskq_seq_stop, +}; + +static struct seq_operations taskq_seq_ops = { + .show = taskq_seq_show, + .start = taskq_seq_start, + .next = taskq_seq_next, + .stop = taskq_seq_stop, +}; + +static int +proc_taskq_all_open(struct inode *inode, struct file *filp) +{ + return (seq_open(filp, &taskq_all_seq_ops)); +} + +static int +proc_taskq_open(struct inode *inode, struct file *filp) +{ + return (seq_open(filp, &taskq_seq_ops)); +} + +static struct file_operations proc_taskq_all_operations = { + .open = proc_taskq_all_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static struct file_operations proc_taskq_operations = { + .open = proc_taskq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static struct ctl_table spl_kmem_table[] = { +#ifdef DEBUG_KMEM + { + .procname = "kmem_used", + .data = &kmem_alloc_used, +#ifdef HAVE_ATOMIC64_T + .maxlen = sizeof (atomic64_t), +#else + .maxlen = sizeof (atomic_t), +#endif /* HAVE_ATOMIC64_T */ + .mode = 0444, + .proc_handler = &proc_domemused, + }, + { + .procname = "kmem_max", + .data = &kmem_alloc_max, + .maxlen = sizeof (unsigned long), + .extra1 = &table_min, + .extra2 = &table_max, + .mode = 0444, + .proc_handler = &proc_doulongvec_minmax, + }, +#endif /* DEBUG_KMEM */ + { + .procname = "slab_kmem_total", + .data = (void *)(KMC_KMEM | KMC_TOTAL), + .maxlen = sizeof (unsigned long), + .extra1 = &table_min, + .extra2 = &table_max, + .mode = 0444, + .proc_handler = &proc_doslab, + }, + { + .procname = "slab_kmem_alloc", + .data = (void *)(KMC_KMEM | KMC_ALLOC), + .maxlen = sizeof (unsigned long), + .extra1 = &table_min, + .extra2 = &table_max, + .mode = 0444, + .proc_handler = &proc_doslab, + }, + { + .procname = "slab_kmem_max", + .data = (void *)(KMC_KMEM | KMC_MAX), + .maxlen = sizeof (unsigned long), + .extra1 = &table_min, + .extra2 = &table_max, + .mode = 0444, + .proc_handler = &proc_doslab, + }, + { + .procname = "slab_vmem_total", + .data = (void *)(KMC_VMEM | KMC_TOTAL), + .maxlen = sizeof (unsigned long), + .extra1 = &table_min, + .extra2 = &table_max, + .mode = 0444, + .proc_handler = &proc_doslab, + }, + { + .procname = "slab_vmem_alloc", + .data = (void *)(KMC_VMEM | KMC_ALLOC), + .maxlen = sizeof (unsigned long), + .extra1 = &table_min, + .extra2 = &table_max, + .mode = 0444, + .proc_handler = &proc_doslab, + }, + { + .procname = "slab_vmem_max", + .data = (void *)(KMC_VMEM | KMC_MAX), + .maxlen = sizeof (unsigned long), + .extra1 = &table_min, + .extra2 = &table_max, + .mode = 0444, + .proc_handler = &proc_doslab, + }, + {}, +}; + +static struct ctl_table spl_kstat_table[] = { + {}, +}; + +static struct ctl_table spl_table[] = { + /* + * NB No .strategy 
entries have been provided since + * sysctl(8) prefers to go via /proc for portability. + */ + { + .procname = "version", + .data = spl_version, + .maxlen = sizeof (spl_version), + .mode = 0444, + .proc_handler = &proc_dostring, + }, + { + .procname = "hostid", + .data = &spl_hostid, + .maxlen = sizeof (unsigned long), + .mode = 0644, + .proc_handler = &proc_dohostid, + }, + { + .procname = "kmem", + .mode = 0555, + .child = spl_kmem_table, + }, + { + .procname = "kstat", + .mode = 0555, + .child = spl_kstat_table, + }, + {}, +}; + +static struct ctl_table spl_dir[] = { + { + .procname = "spl", + .mode = 0555, + .child = spl_table, + }, + {} +}; + +static struct ctl_table spl_root[] = { + { +#ifdef HAVE_CTL_NAME + .ctl_name = CTL_KERN, +#endif + .procname = "kernel", + .mode = 0555, + .child = spl_dir, + }, + {} +}; + +int +spl_proc_init(void) +{ + int rc = 0; + + spl_header = register_sysctl_table(spl_root); + if (spl_header == NULL) + return (-EUNATCH); + + proc_spl = proc_mkdir("spl", NULL); + if (proc_spl == NULL) { + rc = -EUNATCH; + goto out; + } + + proc_spl_taskq_all = proc_create_data("taskq-all", 0444, proc_spl, + &proc_taskq_all_operations, NULL); + if (proc_spl_taskq_all == NULL) { + rc = -EUNATCH; + goto out; + } + + proc_spl_taskq = proc_create_data("taskq", 0444, proc_spl, + &proc_taskq_operations, NULL); + if (proc_spl_taskq == NULL) { + rc = -EUNATCH; + goto out; + } + + proc_spl_kmem = proc_mkdir("kmem", proc_spl); + if (proc_spl_kmem == NULL) { + rc = -EUNATCH; + goto out; + } + + proc_spl_kmem_slab = proc_create_data("slab", 0444, proc_spl_kmem, + &proc_slab_operations, NULL); + if (proc_spl_kmem_slab == NULL) { + rc = -EUNATCH; + goto out; + } + + proc_spl_kstat = proc_mkdir("kstat", proc_spl); + if (proc_spl_kstat == NULL) { + rc = -EUNATCH; + goto out; + } +out: + if (rc) { + remove_proc_entry("kstat", proc_spl); + remove_proc_entry("slab", proc_spl_kmem); + remove_proc_entry("kmem", proc_spl); + remove_proc_entry("taskq-all", proc_spl); + remove_proc_entry("taskq", proc_spl); + remove_proc_entry("spl", NULL); + unregister_sysctl_table(spl_header); + } + + return (rc); +} + +void +spl_proc_fini(void) +{ + remove_proc_entry("kstat", proc_spl); + remove_proc_entry("slab", proc_spl_kmem); + remove_proc_entry("kmem", proc_spl); + remove_proc_entry("taskq-all", proc_spl); + remove_proc_entry("taskq", proc_spl); + remove_proc_entry("spl", NULL); + + ASSERT(spl_header != NULL); + unregister_sysctl_table(spl_header); +} diff --git a/module/spl/spl-rwlock.c b/module/spl/spl-rwlock.c new file mode 100644 index 000000000..9a992cc3a --- /dev/null +++ b/module/spl/spl-rwlock.c @@ -0,0 +1,114 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf <[email protected]>. + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see <http://zfsonlinux.org/>. + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. 
+ * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see <http://www.gnu.org/licenses/>. + * + * Solaris Porting Layer (SPL) Reader/Writer Lock Implementation. + */ + +#include <sys/rwlock.h> + +#if defined(CONFIG_PREEMPT_RT_FULL) + +#include <linux/rtmutex.h> +#define RT_MUTEX_OWNER_MASKALL 1UL + +static int +__rwsem_tryupgrade(struct rw_semaphore *rwsem) +{ + + ASSERT((struct task_struct *) + ((unsigned long)rwsem->lock.owner & ~RT_MUTEX_OWNER_MASKALL) == + current); + + /* + * Under the realtime patch series, rwsem is implemented as a + * single mutex held by readers and writers alike. However, + * this implementation would prevent a thread from taking a + * read lock twice, as the mutex would already be locked on + * the second attempt. Therefore the implementation allows a + * single thread to take a rwsem as read lock multiple times + * tracking that nesting as read_depth counter. + */ + if (rwsem->read_depth <= 1) { + /* + * In case, the current thread has not taken the lock + * more than once as read lock, we can allow an + * upgrade to a write lock. rwsem_rt.h implements + * write locks as read_depth == 0. + */ + rwsem->read_depth = 0; + return (1); + } + return (0); +} +#elif defined(CONFIG_RWSEM_GENERIC_SPINLOCK) +static int +__rwsem_tryupgrade(struct rw_semaphore *rwsem) +{ + int ret = 0; + unsigned long flags; + spl_rwsem_lock_irqsave(&rwsem->wait_lock, flags); + if (RWSEM_COUNT(rwsem) == SPL_RWSEM_SINGLE_READER_VALUE && + list_empty(&rwsem->wait_list)) { + ret = 1; + RWSEM_COUNT(rwsem) = SPL_RWSEM_SINGLE_WRITER_VALUE; + } + spl_rwsem_unlock_irqrestore(&rwsem->wait_lock, flags); + return (ret); +} +#elif defined(HAVE_RWSEM_ATOMIC_LONG_COUNT) +static int +__rwsem_tryupgrade(struct rw_semaphore *rwsem) +{ + long val; + val = atomic_long_cmpxchg(&rwsem->count, SPL_RWSEM_SINGLE_READER_VALUE, + SPL_RWSEM_SINGLE_WRITER_VALUE); + return (val == SPL_RWSEM_SINGLE_READER_VALUE); +} +#else +static int +__rwsem_tryupgrade(struct rw_semaphore *rwsem) +{ + typeof(rwsem->count) val; + val = cmpxchg(&rwsem->count, SPL_RWSEM_SINGLE_READER_VALUE, + SPL_RWSEM_SINGLE_WRITER_VALUE); + return (val == SPL_RWSEM_SINGLE_READER_VALUE); +} +#endif + +int +rwsem_tryupgrade(struct rw_semaphore *rwsem) +{ + if (__rwsem_tryupgrade(rwsem)) { + rwsem_release(&rwsem->dep_map, 1, _RET_IP_); + rwsem_acquire(&rwsem->dep_map, 0, 1, _RET_IP_); +#ifdef CONFIG_RWSEM_SPIN_ON_OWNER + rwsem->owner = current; +#endif + return (1); + } + return (0); +} +EXPORT_SYMBOL(rwsem_tryupgrade); + +int spl_rw_init(void) { return 0; } +void spl_rw_fini(void) { } diff --git a/module/spl/spl-taskq.c b/module/spl/spl-taskq.c new file mode 100644 index 000000000..2919a942a --- /dev/null +++ b/module/spl/spl-taskq.c @@ -0,0 +1,1305 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf <[email protected]>. + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see <http://zfsonlinux.org/>. + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. 
+ * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see <http://www.gnu.org/licenses/>. + * + * Solaris Porting Layer (SPL) Task Queue Implementation. + */ + +#include <sys/taskq.h> +#include <sys/kmem.h> +#include <sys/tsd.h> + +int spl_taskq_thread_bind = 0; +module_param(spl_taskq_thread_bind, int, 0644); +MODULE_PARM_DESC(spl_taskq_thread_bind, "Bind taskq thread to CPU by default"); + + +int spl_taskq_thread_dynamic = 1; +module_param(spl_taskq_thread_dynamic, int, 0644); +MODULE_PARM_DESC(spl_taskq_thread_dynamic, "Allow dynamic taskq threads"); + +int spl_taskq_thread_priority = 1; +module_param(spl_taskq_thread_priority, int, 0644); +MODULE_PARM_DESC(spl_taskq_thread_priority, + "Allow non-default priority for taskq threads"); + +int spl_taskq_thread_sequential = 4; +module_param(spl_taskq_thread_sequential, int, 0644); +MODULE_PARM_DESC(spl_taskq_thread_sequential, + "Create new taskq threads after N sequential tasks"); + +/* Global system-wide dynamic task queue available for all consumers */ +taskq_t *system_taskq; +EXPORT_SYMBOL(system_taskq); +/* Global dynamic task queue for long delay */ +taskq_t *system_delay_taskq; +EXPORT_SYMBOL(system_delay_taskq); + +/* Private dedicated taskq for creating new taskq threads on demand. */ +static taskq_t *dynamic_taskq; +static taskq_thread_t *taskq_thread_create(taskq_t *); + +/* List of all taskqs */ +LIST_HEAD(tq_list); +DECLARE_RWSEM(tq_list_sem); +static uint_t taskq_tsd; + +static int +task_km_flags(uint_t flags) +{ + if (flags & TQ_NOSLEEP) + return (KM_NOSLEEP); + + if (flags & TQ_PUSHPAGE) + return (KM_PUSHPAGE); + + return (KM_SLEEP); +} + +/* + * taskq_find_by_name - Find the largest instance number of a named taskq. + */ +static int +taskq_find_by_name(const char *name) +{ + struct list_head *tql; + taskq_t *tq; + + list_for_each_prev(tql, &tq_list) { + tq = list_entry(tql, taskq_t, tq_taskqs); + if (strcmp(name, tq->tq_name) == 0) + return (tq->tq_instance); + } + return (-1); +} + +/* + * NOTE: Must be called with tq->tq_lock held, returns a list_t which + * is not attached to the free, work, or pending taskq lists. + */ +static taskq_ent_t * +task_alloc(taskq_t *tq, uint_t flags, unsigned long *irqflags) +{ + taskq_ent_t *t; + int count = 0; + + ASSERT(tq); +retry: + /* Acquire taskq_ent_t's from free list if available */ + if (!list_empty(&tq->tq_free_list) && !(flags & TQ_NEW)) { + t = list_entry(tq->tq_free_list.next, taskq_ent_t, tqent_list); + + ASSERT(!(t->tqent_flags & TQENT_FLAG_PREALLOC)); + ASSERT(!(t->tqent_flags & TQENT_FLAG_CANCEL)); + ASSERT(!timer_pending(&t->tqent_timer)); + + list_del_init(&t->tqent_list); + return (t); + } + + /* Free list is empty and memory allocations are prohibited */ + if (flags & TQ_NOALLOC) + return (NULL); + + /* Hit maximum taskq_ent_t pool size */ + if (tq->tq_nalloc >= tq->tq_maxalloc) { + if (flags & TQ_NOSLEEP) + return (NULL); + + /* + * Sleep periodically polling the free list for an available + * taskq_ent_t. Dispatching with TQ_SLEEP should always succeed + * but we cannot block forever waiting for an taskq_ent_t to + * show up in the free list, otherwise a deadlock can happen. 
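+ * (One such deadlock: a task already running on the taskq dispatches new
+ * work with TQ_SLEEP while the free list can only be replenished by tasks
+ * which have not yet been able to run.)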
+ * + * Therefore, we need to allocate a new task even if the number + * of allocated tasks is above tq->tq_maxalloc, but we still + * end up delaying the task allocation by one second, thereby + * throttling the task dispatch rate. + */ + spin_unlock_irqrestore(&tq->tq_lock, *irqflags); + schedule_timeout(HZ / 100); + spin_lock_irqsave_nested(&tq->tq_lock, *irqflags, + tq->tq_lock_class); + if (count < 100) { + count++; + goto retry; + } + } + + spin_unlock_irqrestore(&tq->tq_lock, *irqflags); + t = kmem_alloc(sizeof (taskq_ent_t), task_km_flags(flags)); + spin_lock_irqsave_nested(&tq->tq_lock, *irqflags, tq->tq_lock_class); + + if (t) { + taskq_init_ent(t); + tq->tq_nalloc++; + } + + return (t); +} + +/* + * NOTE: Must be called with tq->tq_lock held, expects the taskq_ent_t + * to already be removed from the free, work, or pending taskq lists. + */ +static void +task_free(taskq_t *tq, taskq_ent_t *t) +{ + ASSERT(tq); + ASSERT(t); + ASSERT(list_empty(&t->tqent_list)); + ASSERT(!timer_pending(&t->tqent_timer)); + + kmem_free(t, sizeof (taskq_ent_t)); + tq->tq_nalloc--; +} + +/* + * NOTE: Must be called with tq->tq_lock held, either destroys the + * taskq_ent_t if too many exist or moves it to the free list for later use. + */ +static void +task_done(taskq_t *tq, taskq_ent_t *t) +{ + ASSERT(tq); + ASSERT(t); + + /* Wake tasks blocked in taskq_wait_id() */ + wake_up_all(&t->tqent_waitq); + + list_del_init(&t->tqent_list); + + if (tq->tq_nalloc <= tq->tq_minalloc) { + t->tqent_id = TASKQID_INVALID; + t->tqent_func = NULL; + t->tqent_arg = NULL; + t->tqent_flags = 0; + + list_add_tail(&t->tqent_list, &tq->tq_free_list); + } else { + task_free(tq, t); + } +} + +/* + * When a delayed task timer expires remove it from the delay list and + * add it to the priority list in order for immediate processing. + */ +static void +task_expire_impl(taskq_ent_t *t) +{ + taskq_ent_t *w; + taskq_t *tq = t->tqent_taskq; + struct list_head *l; + unsigned long flags; + + spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class); + + if (t->tqent_flags & TQENT_FLAG_CANCEL) { + ASSERT(list_empty(&t->tqent_list)); + spin_unlock_irqrestore(&tq->tq_lock, flags); + return; + } + + t->tqent_birth = jiffies; + /* + * The priority list must be maintained in strict task id order + * from lowest to highest for lowest_id to be easily calculable. + */ + list_del(&t->tqent_list); + list_for_each_prev(l, &tq->tq_prio_list) { + w = list_entry(l, taskq_ent_t, tqent_list); + if (w->tqent_id < t->tqent_id) { + list_add(&t->tqent_list, l); + break; + } + } + if (l == &tq->tq_prio_list) + list_add(&t->tqent_list, &tq->tq_prio_list); + + spin_unlock_irqrestore(&tq->tq_lock, flags); + + wake_up(&tq->tq_work_waitq); +} + +#ifdef HAVE_KERNEL_TIMER_FUNCTION_TIMER_LIST +static void +task_expire(struct timer_list *tl) +{ + taskq_ent_t *t = from_timer(t, tl, tqent_timer); + task_expire_impl(t); +} +#else +static void +task_expire(unsigned long data) +{ + task_expire_impl((taskq_ent_t *)data); +} +#endif + +/* + * Returns the lowest incomplete taskqid_t. The taskqid_t may + * be queued on the pending list, on the priority list, on the + * delay list, or on the work list currently being handled, but + * it is not 100% complete yet. 
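+ * In other words this computes the oldest task id the taskq still knows
+ * about; the result is cached in tq_lowest_id for the taskq_wait*()
+ * checks below.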
+ */ +static taskqid_t +taskq_lowest_id(taskq_t *tq) +{ + taskqid_t lowest_id = tq->tq_next_id; + taskq_ent_t *t; + taskq_thread_t *tqt; + + ASSERT(tq); + + if (!list_empty(&tq->tq_pend_list)) { + t = list_entry(tq->tq_pend_list.next, taskq_ent_t, tqent_list); + lowest_id = MIN(lowest_id, t->tqent_id); + } + + if (!list_empty(&tq->tq_prio_list)) { + t = list_entry(tq->tq_prio_list.next, taskq_ent_t, tqent_list); + lowest_id = MIN(lowest_id, t->tqent_id); + } + + if (!list_empty(&tq->tq_delay_list)) { + t = list_entry(tq->tq_delay_list.next, taskq_ent_t, tqent_list); + lowest_id = MIN(lowest_id, t->tqent_id); + } + + if (!list_empty(&tq->tq_active_list)) { + tqt = list_entry(tq->tq_active_list.next, taskq_thread_t, + tqt_active_list); + ASSERT(tqt->tqt_id != TASKQID_INVALID); + lowest_id = MIN(lowest_id, tqt->tqt_id); + } + + return (lowest_id); +} + +/* + * Insert a task into a list keeping the list sorted by increasing taskqid. + */ +static void +taskq_insert_in_order(taskq_t *tq, taskq_thread_t *tqt) +{ + taskq_thread_t *w; + struct list_head *l; + + ASSERT(tq); + ASSERT(tqt); + + list_for_each_prev(l, &tq->tq_active_list) { + w = list_entry(l, taskq_thread_t, tqt_active_list); + if (w->tqt_id < tqt->tqt_id) { + list_add(&tqt->tqt_active_list, l); + break; + } + } + if (l == &tq->tq_active_list) + list_add(&tqt->tqt_active_list, &tq->tq_active_list); +} + +/* + * Find and return a task from the given list if it exists. The list + * must be in lowest to highest task id order. + */ +static taskq_ent_t * +taskq_find_list(taskq_t *tq, struct list_head *lh, taskqid_t id) +{ + struct list_head *l; + taskq_ent_t *t; + + list_for_each(l, lh) { + t = list_entry(l, taskq_ent_t, tqent_list); + + if (t->tqent_id == id) + return (t); + + if (t->tqent_id > id) + break; + } + + return (NULL); +} + +/* + * Find an already dispatched task given the task id regardless of what + * state it is in. If a task is still pending it will be returned. + * If a task is executing, then -EBUSY will be returned instead. + * If the task has already been run then NULL is returned. + */ +static taskq_ent_t * +taskq_find(taskq_t *tq, taskqid_t id) +{ + taskq_thread_t *tqt; + struct list_head *l; + taskq_ent_t *t; + + t = taskq_find_list(tq, &tq->tq_delay_list, id); + if (t) + return (t); + + t = taskq_find_list(tq, &tq->tq_prio_list, id); + if (t) + return (t); + + t = taskq_find_list(tq, &tq->tq_pend_list, id); + if (t) + return (t); + + list_for_each(l, &tq->tq_active_list) { + tqt = list_entry(l, taskq_thread_t, tqt_active_list); + if (tqt->tqt_id == id) { + /* + * Instead of returning tqt_task, we just return a non + * NULL value to prevent misuse, since tqt_task only + * has two valid fields. + */ + return (ERR_PTR(-EBUSY)); + } + } + + return (NULL); +} + +/* + * Theory for the taskq_wait_id(), taskq_wait_outstanding(), and + * taskq_wait() functions below. + * + * Taskq waiting is accomplished by tracking the lowest outstanding task + * id and the next available task id. As tasks are dispatched they are + * added to the tail of the pending, priority, or delay lists. As worker + * threads become available the tasks are removed from the heads of these + * lists and linked to the worker threads. This ensures the lists are + * kept sorted by lowest to highest task id. + * + * Therefore the lowest outstanding task id can be quickly determined by + * checking the head item from all of these lists. This value is stored + * with the taskq as the lowest id. 
It only needs to be recalculated when
+ * either the task with the current lowest id completes or is canceled.
+ *
+ * By blocking until the lowest task id exceeds the passed task id the
+ * taskq_wait_outstanding() function can be easily implemented. Similarly,
+ * by blocking until the lowest task id matches the next task id taskq_wait()
+ * can be implemented.
+ *
+ * Callers should be aware that when there are multiple worker threads it
+ * is possible for larger task ids to complete before smaller ones. Also
+ * when the taskq contains delay tasks with small task ids callers may
+ * block for a considerable length of time waiting for them to expire and
+ * execute.
+ */
+static int
+taskq_wait_id_check(taskq_t *tq, taskqid_t id)
+{
+ int rc;
+ unsigned long flags;
+
+ spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class);
+ rc = (taskq_find(tq, id) == NULL);
+ spin_unlock_irqrestore(&tq->tq_lock, flags);
+
+ return (rc);
+}
+
+/*
+ * The taskq_wait_id() function blocks until the passed task id completes.
+ * This does not guarantee that all lower task ids have completed.
+ */
+void
+taskq_wait_id(taskq_t *tq, taskqid_t id)
+{
+ wait_event(tq->tq_wait_waitq, taskq_wait_id_check(tq, id));
+}
+EXPORT_SYMBOL(taskq_wait_id);
+
+static int
+taskq_wait_outstanding_check(taskq_t *tq, taskqid_t id)
+{
+ int rc;
+ unsigned long flags;
+
+ spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class);
+ rc = (id < tq->tq_lowest_id);
+ spin_unlock_irqrestore(&tq->tq_lock, flags);
+
+ return (rc);
+}
+
+/*
+ * The taskq_wait_outstanding() function will block until all tasks with a
+ * lower taskqid than the passed 'id' have been completed. Note that all
+ * task ids are assigned monotonically at dispatch time. Zero may be
+ * passed for the id to indicate that all tasks dispatched up to this point,
+ * but not after, should be waited for.
+ */
+void
+taskq_wait_outstanding(taskq_t *tq, taskqid_t id)
+{
+ id = id ? id : tq->tq_next_id - 1;
+ wait_event(tq->tq_wait_waitq, taskq_wait_outstanding_check(tq, id));
+}
+EXPORT_SYMBOL(taskq_wait_outstanding);
+
+static int
+taskq_wait_check(taskq_t *tq)
+{
+ int rc;
+ unsigned long flags;
+
+ spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class);
+ rc = (tq->tq_lowest_id == tq->tq_next_id);
+ spin_unlock_irqrestore(&tq->tq_lock, flags);
+
+ return (rc);
+}
+
+/*
+ * The taskq_wait() function will block until the taskq is empty.
+ * This means that if a taskq re-dispatches work to itself taskq_wait()
+ * callers will block indefinitely.
+ */
+void
+taskq_wait(taskq_t *tq)
+{
+ wait_event(tq->tq_wait_waitq, taskq_wait_check(tq));
+}
+EXPORT_SYMBOL(taskq_wait);
+
+int
+taskq_member(taskq_t *tq, kthread_t *t)
+{
+ return (tq == (taskq_t *)tsd_get_by_thread(taskq_tsd, t));
+}
+EXPORT_SYMBOL(taskq_member);
+
+/*
+ * Cancel an already dispatched task given the task id. Still pending tasks
+ * will be immediately canceled, and if the task is active the function will
+ * block until it completes. Preallocated tasks which are canceled must be
+ * freed by the caller.
+ */
+int
+taskq_cancel_id(taskq_t *tq, taskqid_t id)
+{
+ taskq_ent_t *t;
+ int rc = ENOENT;
+ unsigned long flags;
+
+ ASSERT(tq);
+
+ spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class);
+ t = taskq_find(tq, id);
+ if (t && t != ERR_PTR(-EBUSY)) {
+ list_del_init(&t->tqent_list);
+ t->tqent_flags |= TQENT_FLAG_CANCEL;
+
+ /*
+ * When canceling the lowest outstanding task id we
+ * must recalculate the new lowest outstanding id.
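+ * Note that taskq_lowest_id() above only needs to inspect
+ * the head of each task list and the oldest active thread,
+ * so this recalculation is cheap.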
+ */
+ if (tq->tq_lowest_id == t->tqent_id) {
+ tq->tq_lowest_id = taskq_lowest_id(tq);
+ ASSERT3S(tq->tq_lowest_id, >, t->tqent_id);
+ }
+
+ /*
+ * The task_expire() function takes the tq->tq_lock so drop
+ * the lock before synchronously cancelling the timer.
+ */
+ if (timer_pending(&t->tqent_timer)) {
+ spin_unlock_irqrestore(&tq->tq_lock, flags);
+ del_timer_sync(&t->tqent_timer);
+ spin_lock_irqsave_nested(&tq->tq_lock, flags,
+ tq->tq_lock_class);
+ }
+
+ if (!(t->tqent_flags & TQENT_FLAG_PREALLOC))
+ task_done(tq, t);
+
+ rc = 0;
+ }
+ spin_unlock_irqrestore(&tq->tq_lock, flags);
+
+ if (t == ERR_PTR(-EBUSY)) {
+ taskq_wait_id(tq, id);
+ rc = EBUSY;
+ }
+
+ return (rc);
+}
+EXPORT_SYMBOL(taskq_cancel_id);
+
+static int taskq_thread_spawn(taskq_t *tq);
+
+taskqid_t
+taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t flags)
+{
+ taskq_ent_t *t;
+ taskqid_t rc = TASKQID_INVALID;
+ unsigned long irqflags;
+
+ ASSERT(tq);
+ ASSERT(func);
+
+ spin_lock_irqsave_nested(&tq->tq_lock, irqflags, tq->tq_lock_class);
+
+ /* Taskq being destroyed and all tasks drained */
+ if (!(tq->tq_flags & TASKQ_ACTIVE))
+ goto out;
+
+ /* Do not queue the task unless there is an idle thread for it */
+ ASSERT(tq->tq_nactive <= tq->tq_nthreads);
+ if ((flags & TQ_NOQUEUE) && (tq->tq_nactive == tq->tq_nthreads)) {
+ /* Dynamic taskq may be able to spawn another thread */
+ if (!(tq->tq_flags & TASKQ_DYNAMIC) ||
+ taskq_thread_spawn(tq) == 0)
+ goto out;
+ }
+
+ if ((t = task_alloc(tq, flags, &irqflags)) == NULL)
+ goto out;
+
+ spin_lock(&t->tqent_lock);
+
+ /* Queue to the front of the list to enforce TQ_NOQUEUE semantics */
+ if (flags & TQ_NOQUEUE)
+ list_add(&t->tqent_list, &tq->tq_prio_list);
+ /* Queue to the priority list instead of the pending list */
+ else if (flags & TQ_FRONT)
+ list_add_tail(&t->tqent_list, &tq->tq_prio_list);
+ else
+ list_add_tail(&t->tqent_list, &tq->tq_pend_list);
+
+ t->tqent_id = rc = tq->tq_next_id;
+ tq->tq_next_id++;
+ t->tqent_func = func;
+ t->tqent_arg = arg;
+ t->tqent_taskq = tq;
+#ifndef HAVE_KERNEL_TIMER_FUNCTION_TIMER_LIST
+ t->tqent_timer.data = 0;
+#endif
+ t->tqent_timer.function = NULL;
+ t->tqent_timer.expires = 0;
+ t->tqent_birth = jiffies;
+
+ ASSERT(!(t->tqent_flags & TQENT_FLAG_PREALLOC));
+
+ spin_unlock(&t->tqent_lock);
+
+ wake_up(&tq->tq_work_waitq);
+out:
+ /* Spawn additional taskq threads if required. 
*/ + if (!(flags & TQ_NOQUEUE) && tq->tq_nactive == tq->tq_nthreads) + (void) taskq_thread_spawn(tq); + + spin_unlock_irqrestore(&tq->tq_lock, irqflags); + return (rc); +} +EXPORT_SYMBOL(taskq_dispatch); + +taskqid_t +taskq_dispatch_delay(taskq_t *tq, task_func_t func, void *arg, + uint_t flags, clock_t expire_time) +{ + taskqid_t rc = TASKQID_INVALID; + taskq_ent_t *t; + unsigned long irqflags; + + ASSERT(tq); + ASSERT(func); + + spin_lock_irqsave_nested(&tq->tq_lock, irqflags, tq->tq_lock_class); + + /* Taskq being destroyed and all tasks drained */ + if (!(tq->tq_flags & TASKQ_ACTIVE)) + goto out; + + if ((t = task_alloc(tq, flags, &irqflags)) == NULL) + goto out; + + spin_lock(&t->tqent_lock); + + /* Queue to the delay list for subsequent execution */ + list_add_tail(&t->tqent_list, &tq->tq_delay_list); + + t->tqent_id = rc = tq->tq_next_id; + tq->tq_next_id++; + t->tqent_func = func; + t->tqent_arg = arg; + t->tqent_taskq = tq; +#ifndef HAVE_KERNEL_TIMER_FUNCTION_TIMER_LIST + t->tqent_timer.data = (unsigned long)t; +#endif + t->tqent_timer.function = task_expire; + t->tqent_timer.expires = (unsigned long)expire_time; + add_timer(&t->tqent_timer); + + ASSERT(!(t->tqent_flags & TQENT_FLAG_PREALLOC)); + + spin_unlock(&t->tqent_lock); +out: + /* Spawn additional taskq threads if required. */ + if (tq->tq_nactive == tq->tq_nthreads) + (void) taskq_thread_spawn(tq); + spin_unlock_irqrestore(&tq->tq_lock, irqflags); + return (rc); +} +EXPORT_SYMBOL(taskq_dispatch_delay); + +void +taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint_t flags, + taskq_ent_t *t) +{ + unsigned long irqflags; + ASSERT(tq); + ASSERT(func); + + spin_lock_irqsave_nested(&tq->tq_lock, irqflags, + tq->tq_lock_class); + + /* Taskq being destroyed and all tasks drained */ + if (!(tq->tq_flags & TASKQ_ACTIVE)) { + t->tqent_id = TASKQID_INVALID; + goto out; + } + + if ((flags & TQ_NOQUEUE) && (tq->tq_nactive == tq->tq_nthreads)) { + /* Dynamic taskq may be able to spawn another thread */ + if (!(tq->tq_flags & TASKQ_DYNAMIC) || + taskq_thread_spawn(tq) == 0) + goto out2; + flags |= TQ_FRONT; + } + + spin_lock(&t->tqent_lock); + + /* + * Make sure the entry is not on some other taskq; it is important to + * ASSERT() under lock + */ + ASSERT(taskq_empty_ent(t)); + + /* + * Mark it as a prealloc'd task. This is important + * to ensure that we don't free it later. + */ + t->tqent_flags |= TQENT_FLAG_PREALLOC; + + /* Queue to the priority list instead of the pending list */ + if (flags & TQ_FRONT) + list_add_tail(&t->tqent_list, &tq->tq_prio_list); + else + list_add_tail(&t->tqent_list, &tq->tq_pend_list); + + t->tqent_id = tq->tq_next_id; + tq->tq_next_id++; + t->tqent_func = func; + t->tqent_arg = arg; + t->tqent_taskq = tq; + t->tqent_birth = jiffies; + + spin_unlock(&t->tqent_lock); + + wake_up(&tq->tq_work_waitq); +out: + /* Spawn additional taskq threads if required. 
 */
+ if (tq->tq_nactive == tq->tq_nthreads)
+ (void) taskq_thread_spawn(tq);
+out2:
+ spin_unlock_irqrestore(&tq->tq_lock, irqflags);
+}
+EXPORT_SYMBOL(taskq_dispatch_ent);
+
+int
+taskq_empty_ent(taskq_ent_t *t)
+{
+ return (list_empty(&t->tqent_list));
+}
+EXPORT_SYMBOL(taskq_empty_ent);
+
+void
+taskq_init_ent(taskq_ent_t *t)
+{
+ spin_lock_init(&t->tqent_lock);
+ init_waitqueue_head(&t->tqent_waitq);
+#ifdef HAVE_KERNEL_TIMER_FUNCTION_TIMER_LIST
+ timer_setup(&t->tqent_timer, NULL, 0);
+#else
+ init_timer(&t->tqent_timer);
+#endif
+ INIT_LIST_HEAD(&t->tqent_list);
+ t->tqent_id = 0;
+ t->tqent_func = NULL;
+ t->tqent_arg = NULL;
+ t->tqent_flags = 0;
+ t->tqent_taskq = NULL;
+}
+EXPORT_SYMBOL(taskq_init_ent);
+
+/*
+ * Return the next pending task; preference is given to tasks on the
+ * priority list which were dispatched with TQ_FRONT.
+ */
+static taskq_ent_t *
+taskq_next_ent(taskq_t *tq)
+{
+ struct list_head *list;
+
+ if (!list_empty(&tq->tq_prio_list))
+ list = &tq->tq_prio_list;
+ else if (!list_empty(&tq->tq_pend_list))
+ list = &tq->tq_pend_list;
+ else
+ return (NULL);
+
+ return (list_entry(list->next, taskq_ent_t, tqent_list));
+}
+
+/*
+ * Spawns a new thread for the specified taskq.
+ */
+static void
+taskq_thread_spawn_task(void *arg)
+{
+ taskq_t *tq = (taskq_t *)arg;
+ unsigned long flags;
+
+ if (taskq_thread_create(tq) == NULL) {
+ /* restore spawning count if failed */
+ spin_lock_irqsave_nested(&tq->tq_lock, flags,
+ tq->tq_lock_class);
+ tq->tq_nspawn--;
+ spin_unlock_irqrestore(&tq->tq_lock, flags);
+ }
+}
+
+/*
+ * Spawn additional threads for dynamic taskqs (TASKQ_DYNAMIC) when the
+ * current number of threads is insufficient to handle the pending tasks.
+ * These new threads must be created by the dedicated dynamic_taskq to
+ * avoid deadlocks between thread creation and memory reclaim. The
+ * system_taskq which is also a dynamic taskq cannot be safely used
+ * for this.
+ */
+static int
+taskq_thread_spawn(taskq_t *tq)
+{
+ int spawning = 0;
+
+ if (!(tq->tq_flags & TASKQ_DYNAMIC))
+ return (0);
+
+ if ((tq->tq_nthreads + tq->tq_nspawn < tq->tq_maxthreads) &&
+ (tq->tq_flags & TASKQ_ACTIVE)) {
+ spawning = (++tq->tq_nspawn);
+ taskq_dispatch(dynamic_taskq, taskq_thread_spawn_task,
+ tq, TQ_NOSLEEP);
+ }
+
+ return (spawning);
+}
+
+/*
+ * Threads in a dynamic taskq should only exit once the taskq has been
+ * completely drained and no other threads are actively servicing tasks.
+ * This prevents threads from being created and destroyed more than is
+ * required.
+ *
+ * The first thread in the thread list is treated as the primary thread.
+ * There is nothing special about the primary thread, but in order to
+ * keep the taskq pids from changing we opt to make it long running.
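+ *
+ * For example (an illustrative sketch, the taskq name is hypothetical),
+ * a taskq created with
+ *
+ *	tq = taskq_create("my_taskq", 8, maxclsyspri, 2, INT_MAX,
+ *	    TASKQ_DYNAMIC);
+ *
+ * starts out with only the primary thread and grows toward eight
+ * threads under sustained load, then shrinks back down to the
+ * primary thread once the queue has drained.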
+ */ +static int +taskq_thread_should_stop(taskq_t *tq, taskq_thread_t *tqt) +{ + if (!(tq->tq_flags & TASKQ_DYNAMIC)) + return (0); + + if (list_first_entry(&(tq->tq_thread_list), taskq_thread_t, + tqt_thread_list) == tqt) + return (0); + + return + ((tq->tq_nspawn == 0) && /* No threads are being spawned */ + (tq->tq_nactive == 0) && /* No threads are handling tasks */ + (tq->tq_nthreads > 1) && /* More than 1 thread is running */ + (!taskq_next_ent(tq)) && /* There are no pending tasks */ + (spl_taskq_thread_dynamic)); /* Dynamic taskqs are allowed */ +} + +static int +taskq_thread(void *args) +{ + DECLARE_WAITQUEUE(wait, current); + sigset_t blocked; + taskq_thread_t *tqt = args; + taskq_t *tq; + taskq_ent_t *t; + int seq_tasks = 0; + unsigned long flags; + taskq_ent_t dup_task = {}; + + ASSERT(tqt); + ASSERT(tqt->tqt_tq); + tq = tqt->tqt_tq; + current->flags |= PF_NOFREEZE; + + (void) spl_fstrans_mark(); + + sigfillset(&blocked); + sigprocmask(SIG_BLOCK, &blocked, NULL); + flush_signals(current); + + tsd_set(taskq_tsd, tq); + spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class); + /* + * If we are dynamically spawned, decrease spawning count. Note that + * we could be created during taskq_create, in which case we shouldn't + * do the decrement. But it's fine because taskq_create will reset + * tq_nspawn later. + */ + if (tq->tq_flags & TASKQ_DYNAMIC) + tq->tq_nspawn--; + + /* Immediately exit if more threads than allowed were created. */ + if (tq->tq_nthreads >= tq->tq_maxthreads) + goto error; + + tq->tq_nthreads++; + list_add_tail(&tqt->tqt_thread_list, &tq->tq_thread_list); + wake_up(&tq->tq_wait_waitq); + set_current_state(TASK_INTERRUPTIBLE); + + while (!kthread_should_stop()) { + + if (list_empty(&tq->tq_pend_list) && + list_empty(&tq->tq_prio_list)) { + + if (taskq_thread_should_stop(tq, tqt)) { + wake_up_all(&tq->tq_wait_waitq); + break; + } + + add_wait_queue_exclusive(&tq->tq_work_waitq, &wait); + spin_unlock_irqrestore(&tq->tq_lock, flags); + + schedule(); + seq_tasks = 0; + + spin_lock_irqsave_nested(&tq->tq_lock, flags, + tq->tq_lock_class); + remove_wait_queue(&tq->tq_work_waitq, &wait); + } else { + __set_current_state(TASK_RUNNING); + } + + if ((t = taskq_next_ent(tq)) != NULL) { + list_del_init(&t->tqent_list); + + /* + * A TQENT_FLAG_PREALLOC task may be reused or freed + * during the task function call. Store tqent_id and + * tqent_flags here. + * + * Also use an on stack taskq_ent_t for tqt_task + * assignment in this case. We only populate the two + * fields used by the only user in taskq proc file. + */ + tqt->tqt_id = t->tqent_id; + tqt->tqt_flags = t->tqent_flags; + + if (t->tqent_flags & TQENT_FLAG_PREALLOC) { + dup_task.tqent_func = t->tqent_func; + dup_task.tqent_arg = t->tqent_arg; + t = &dup_task; + } + tqt->tqt_task = t; + + taskq_insert_in_order(tq, tqt); + tq->tq_nactive++; + spin_unlock_irqrestore(&tq->tq_lock, flags); + + /* Perform the requested task */ + t->tqent_func(t->tqent_arg); + + spin_lock_irqsave_nested(&tq->tq_lock, flags, + tq->tq_lock_class); + tq->tq_nactive--; + list_del_init(&tqt->tqt_active_list); + tqt->tqt_task = NULL; + + /* For prealloc'd tasks, we don't free anything. 
*/ + if (!(tqt->tqt_flags & TQENT_FLAG_PREALLOC)) + task_done(tq, t); + + /* + * When the current lowest outstanding taskqid is + * done calculate the new lowest outstanding id + */ + if (tq->tq_lowest_id == tqt->tqt_id) { + tq->tq_lowest_id = taskq_lowest_id(tq); + ASSERT3S(tq->tq_lowest_id, >, tqt->tqt_id); + } + + /* Spawn additional taskq threads if required. */ + if ((++seq_tasks) > spl_taskq_thread_sequential && + taskq_thread_spawn(tq)) + seq_tasks = 0; + + tqt->tqt_id = TASKQID_INVALID; + tqt->tqt_flags = 0; + wake_up_all(&tq->tq_wait_waitq); + } else { + if (taskq_thread_should_stop(tq, tqt)) + break; + } + + set_current_state(TASK_INTERRUPTIBLE); + + } + + __set_current_state(TASK_RUNNING); + tq->tq_nthreads--; + list_del_init(&tqt->tqt_thread_list); +error: + kmem_free(tqt, sizeof (taskq_thread_t)); + spin_unlock_irqrestore(&tq->tq_lock, flags); + + tsd_set(taskq_tsd, NULL); + + return (0); +} + +static taskq_thread_t * +taskq_thread_create(taskq_t *tq) +{ + static int last_used_cpu = 0; + taskq_thread_t *tqt; + + tqt = kmem_alloc(sizeof (*tqt), KM_PUSHPAGE); + INIT_LIST_HEAD(&tqt->tqt_thread_list); + INIT_LIST_HEAD(&tqt->tqt_active_list); + tqt->tqt_tq = tq; + tqt->tqt_id = TASKQID_INVALID; + + tqt->tqt_thread = spl_kthread_create(taskq_thread, tqt, + "%s", tq->tq_name); + if (tqt->tqt_thread == NULL) { + kmem_free(tqt, sizeof (taskq_thread_t)); + return (NULL); + } + + if (spl_taskq_thread_bind) { + last_used_cpu = (last_used_cpu + 1) % num_online_cpus(); + kthread_bind(tqt->tqt_thread, last_used_cpu); + } + + if (spl_taskq_thread_priority) + set_user_nice(tqt->tqt_thread, PRIO_TO_NICE(tq->tq_pri)); + + wake_up_process(tqt->tqt_thread); + + return (tqt); +} + +taskq_t * +taskq_create(const char *name, int nthreads, pri_t pri, + int minalloc, int maxalloc, uint_t flags) +{ + taskq_t *tq; + taskq_thread_t *tqt; + int count = 0, rc = 0, i; + unsigned long irqflags; + + ASSERT(name != NULL); + ASSERT(minalloc >= 0); + ASSERT(maxalloc <= INT_MAX); + ASSERT(!(flags & (TASKQ_CPR_SAFE))); /* Unsupported */ + + /* Scale the number of threads using nthreads as a percentage */ + if (flags & TASKQ_THREADS_CPU_PCT) { + ASSERT(nthreads <= 100); + ASSERT(nthreads >= 0); + nthreads = MIN(nthreads, 100); + nthreads = MAX(nthreads, 0); + nthreads = MAX((num_online_cpus() * nthreads) / 100, 1); + } + + tq = kmem_alloc(sizeof (*tq), KM_PUSHPAGE); + if (tq == NULL) + return (NULL); + + spin_lock_init(&tq->tq_lock); + INIT_LIST_HEAD(&tq->tq_thread_list); + INIT_LIST_HEAD(&tq->tq_active_list); + tq->tq_name = strdup(name); + tq->tq_nactive = 0; + tq->tq_nthreads = 0; + tq->tq_nspawn = 0; + tq->tq_maxthreads = nthreads; + tq->tq_pri = pri; + tq->tq_minalloc = minalloc; + tq->tq_maxalloc = maxalloc; + tq->tq_nalloc = 0; + tq->tq_flags = (flags | TASKQ_ACTIVE); + tq->tq_next_id = TASKQID_INITIAL; + tq->tq_lowest_id = TASKQID_INITIAL; + INIT_LIST_HEAD(&tq->tq_free_list); + INIT_LIST_HEAD(&tq->tq_pend_list); + INIT_LIST_HEAD(&tq->tq_prio_list); + INIT_LIST_HEAD(&tq->tq_delay_list); + init_waitqueue_head(&tq->tq_work_waitq); + init_waitqueue_head(&tq->tq_wait_waitq); + tq->tq_lock_class = TQ_LOCK_GENERAL; + INIT_LIST_HEAD(&tq->tq_taskqs); + + if (flags & TASKQ_PREPOPULATE) { + spin_lock_irqsave_nested(&tq->tq_lock, irqflags, + tq->tq_lock_class); + + for (i = 0; i < minalloc; i++) + task_done(tq, task_alloc(tq, TQ_PUSHPAGE | TQ_NEW, + &irqflags)); + + spin_unlock_irqrestore(&tq->tq_lock, irqflags); + } + + if ((flags & TASKQ_DYNAMIC) && spl_taskq_thread_dynamic) + nthreads = 1; + + for (i = 0; i < 
nthreads; i++) { + tqt = taskq_thread_create(tq); + if (tqt == NULL) + rc = 1; + else + count++; + } + + /* Wait for all threads to be started before potential destroy */ + wait_event(tq->tq_wait_waitq, tq->tq_nthreads == count); + /* + * taskq_thread might have touched nspawn, but we don't want them to + * because they're not dynamically spawned. So we reset it to 0 + */ + tq->tq_nspawn = 0; + + if (rc) { + taskq_destroy(tq); + tq = NULL; + } else { + down_write(&tq_list_sem); + tq->tq_instance = taskq_find_by_name(name) + 1; + list_add_tail(&tq->tq_taskqs, &tq_list); + up_write(&tq_list_sem); + } + + return (tq); +} +EXPORT_SYMBOL(taskq_create); + +void +taskq_destroy(taskq_t *tq) +{ + struct task_struct *thread; + taskq_thread_t *tqt; + taskq_ent_t *t; + unsigned long flags; + + ASSERT(tq); + spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class); + tq->tq_flags &= ~TASKQ_ACTIVE; + spin_unlock_irqrestore(&tq->tq_lock, flags); + + /* + * When TASKQ_ACTIVE is clear new tasks may not be added nor may + * new worker threads be spawned for dynamic taskq. + */ + if (dynamic_taskq != NULL) + taskq_wait_outstanding(dynamic_taskq, 0); + + taskq_wait(tq); + + /* remove taskq from global list used by the kstats */ + down_write(&tq_list_sem); + list_del(&tq->tq_taskqs); + up_write(&tq_list_sem); + + spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class); + /* wait for spawning threads to insert themselves to the list */ + while (tq->tq_nspawn) { + spin_unlock_irqrestore(&tq->tq_lock, flags); + schedule_timeout_interruptible(1); + spin_lock_irqsave_nested(&tq->tq_lock, flags, + tq->tq_lock_class); + } + + /* + * Signal each thread to exit and block until it does. Each thread + * is responsible for removing itself from the list and freeing its + * taskq_thread_t. This allows for idle threads to opt to remove + * themselves from the taskq. They can be recreated as needed. + */ + while (!list_empty(&tq->tq_thread_list)) { + tqt = list_entry(tq->tq_thread_list.next, + taskq_thread_t, tqt_thread_list); + thread = tqt->tqt_thread; + spin_unlock_irqrestore(&tq->tq_lock, flags); + + kthread_stop(thread); + + spin_lock_irqsave_nested(&tq->tq_lock, flags, + tq->tq_lock_class); + } + + while (!list_empty(&tq->tq_free_list)) { + t = list_entry(tq->tq_free_list.next, taskq_ent_t, tqent_list); + + ASSERT(!(t->tqent_flags & TQENT_FLAG_PREALLOC)); + + list_del_init(&t->tqent_list); + task_free(tq, t); + } + + ASSERT0(tq->tq_nthreads); + ASSERT0(tq->tq_nalloc); + ASSERT0(tq->tq_nspawn); + ASSERT(list_empty(&tq->tq_thread_list)); + ASSERT(list_empty(&tq->tq_active_list)); + ASSERT(list_empty(&tq->tq_free_list)); + ASSERT(list_empty(&tq->tq_pend_list)); + ASSERT(list_empty(&tq->tq_prio_list)); + ASSERT(list_empty(&tq->tq_delay_list)); + + spin_unlock_irqrestore(&tq->tq_lock, flags); + + strfree(tq->tq_name); + kmem_free(tq, sizeof (taskq_t)); +} +EXPORT_SYMBOL(taskq_destroy); + + +static unsigned int spl_taskq_kick = 0; + +/* + * 2.6.36 API Change + * module_param_cb is introduced to take kernel_param_ops and + * module_param_call is marked as obsolete. Also set and get operations + * were changed to take a 'const struct kernel_param *'. 
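+ *
+ * Both prototypes are provided below; the correct one is selected at
+ * compile time based on whether module_param_cb is defined.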
+ */
+static int
+#ifdef module_param_cb
+param_set_taskq_kick(const char *val, const struct kernel_param *kp)
+#else
+param_set_taskq_kick(const char *val, struct kernel_param *kp)
+#endif
+{
+ int ret;
+ taskq_t *tq;
+ taskq_ent_t *t;
+ unsigned long flags;
+
+ ret = param_set_uint(val, kp);
+ if (ret < 0 || !spl_taskq_kick)
+ return (ret);
+ /* reset value */
+ spl_taskq_kick = 0;
+
+ down_read(&tq_list_sem);
+ list_for_each_entry(tq, &tq_list, tq_taskqs) {
+ spin_lock_irqsave_nested(&tq->tq_lock, flags,
+ tq->tq_lock_class);
+ /* Check if the first pending is older than 5 seconds */
+ t = taskq_next_ent(tq);
+ if (t && time_after(jiffies, t->tqent_birth + 5*HZ)) {
+ (void) taskq_thread_spawn(tq);
+ printk(KERN_INFO "spl: Kicked taskq %s/%d\n",
+ tq->tq_name, tq->tq_instance);
+ }
+ spin_unlock_irqrestore(&tq->tq_lock, flags);
+ }
+ up_read(&tq_list_sem);
+ return (ret);
+}
+
+#ifdef module_param_cb
+static const struct kernel_param_ops param_ops_taskq_kick = {
+ .set = param_set_taskq_kick,
+ .get = param_get_uint,
+};
+module_param_cb(spl_taskq_kick, &param_ops_taskq_kick, &spl_taskq_kick, 0644);
+#else
+module_param_call(spl_taskq_kick, param_set_taskq_kick, param_get_uint,
+ &spl_taskq_kick, 0644);
+#endif
+MODULE_PARM_DESC(spl_taskq_kick,
+ "Write nonzero to kick stuck taskqs to spawn more threads");
+
+int
+spl_taskq_init(void)
+{
+ tsd_create(&taskq_tsd, NULL);
+
+ system_taskq = taskq_create("spl_system_taskq", MAX(boot_ncpus, 64),
+ maxclsyspri, boot_ncpus, INT_MAX, TASKQ_PREPOPULATE|TASKQ_DYNAMIC);
+ if (system_taskq == NULL)
+ return (1);
+
+ system_delay_taskq = taskq_create("spl_delay_taskq", MAX(boot_ncpus, 4),
+ maxclsyspri, boot_ncpus, INT_MAX, TASKQ_PREPOPULATE|TASKQ_DYNAMIC);
+ if (system_delay_taskq == NULL) {
+ taskq_destroy(system_taskq);
+ return (1);
+ }
+
+ dynamic_taskq = taskq_create("spl_dynamic_taskq", 1,
+ maxclsyspri, boot_ncpus, INT_MAX, TASKQ_PREPOPULATE);
+ if (dynamic_taskq == NULL) {
+ taskq_destroy(system_taskq);
+ taskq_destroy(system_delay_taskq);
+ return (1);
+ }
+
+ /*
+ * This is used to annotate tq_lock, so
+ * taskq_dispatch -> taskq_thread_spawn -> taskq_dispatch
+ * does not trigger a lockdep warning re: possible recursive locking
+ */
+ dynamic_taskq->tq_lock_class = TQ_LOCK_DYNAMIC;
+
+ return (0);
+}
+
+void
+spl_taskq_fini(void)
+{
+ taskq_destroy(dynamic_taskq);
+ dynamic_taskq = NULL;
+
+ taskq_destroy(system_delay_taskq);
+ system_delay_taskq = NULL;
+
+ taskq_destroy(system_taskq);
+ system_taskq = NULL;
+
+ tsd_destroy(&taskq_tsd);
+}
diff --git a/module/spl/spl-thread.c b/module/spl/spl-thread.c
new file mode 100644
index 000000000..d441ad65f
--- /dev/null
+++ b/module/spl/spl-thread.c
@@ -0,0 +1,160 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <[email protected]>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see <http://www.gnu.org/licenses/>. + * + * Solaris Porting Layer (SPL) Thread Implementation. + */ + +#include <sys/thread.h> +#include <sys/kmem.h> +#include <sys/tsd.h> + +/* + * Thread interfaces + */ +typedef struct thread_priv_s { + unsigned long tp_magic; /* Magic */ + int tp_name_size; /* Name size */ + char *tp_name; /* Name (without _thread suffix) */ + void (*tp_func)(void *); /* Registered function */ + void *tp_args; /* Args to be passed to function */ + size_t tp_len; /* Len to be passed to function */ + int tp_state; /* State to start thread at */ + pri_t tp_pri; /* Priority to start threat at */ +} thread_priv_t; + +static int +thread_generic_wrapper(void *arg) +{ + thread_priv_t *tp = (thread_priv_t *)arg; + void (*func)(void *); + void *args; + + ASSERT(tp->tp_magic == TP_MAGIC); + func = tp->tp_func; + args = tp->tp_args; + set_current_state(tp->tp_state); + set_user_nice((kthread_t *)current, PRIO_TO_NICE(tp->tp_pri)); + kmem_free(tp->tp_name, tp->tp_name_size); + kmem_free(tp, sizeof (thread_priv_t)); + + if (func) + func(args); + + return (0); +} + +void +__thread_exit(void) +{ + tsd_exit(); + complete_and_exit(NULL, 0); + /* Unreachable */ +} +EXPORT_SYMBOL(__thread_exit); + +/* + * thread_create() may block forever if it cannot create a thread or + * allocate memory. This is preferable to returning a NULL which Solaris + * style callers likely never check for... since it can't fail. + */ +kthread_t * +__thread_create(caddr_t stk, size_t stksize, thread_func_t func, + const char *name, void *args, size_t len, proc_t *pp, int state, pri_t pri) +{ + thread_priv_t *tp; + struct task_struct *tsk; + char *p; + + /* Option pp is simply ignored */ + /* Variable stack size unsupported */ + ASSERT(stk == NULL); + + tp = kmem_alloc(sizeof (thread_priv_t), KM_PUSHPAGE); + if (tp == NULL) + return (NULL); + + tp->tp_magic = TP_MAGIC; + tp->tp_name_size = strlen(name) + 1; + + tp->tp_name = kmem_alloc(tp->tp_name_size, KM_PUSHPAGE); + if (tp->tp_name == NULL) { + kmem_free(tp, sizeof (thread_priv_t)); + return (NULL); + } + + strncpy(tp->tp_name, name, tp->tp_name_size); + + /* + * Strip trailing "_thread" from passed name which will be the func + * name since the exposed API has no parameter for passing a name. + */ + p = strstr(tp->tp_name, "_thread"); + if (p) + p[0] = '\0'; + + tp->tp_func = func; + tp->tp_args = args; + tp->tp_len = len; + tp->tp_state = state; + tp->tp_pri = pri; + + tsk = spl_kthread_create(thread_generic_wrapper, (void *)tp, + "%s", tp->tp_name); + if (IS_ERR(tsk)) + return (NULL); + + wake_up_process(tsk); + return ((kthread_t *)tsk); +} +EXPORT_SYMBOL(__thread_create); + +/* + * spl_kthread_create - Wrapper providing pre-3.13 semantics for + * kthread_create() in which it is not killable and less likely + * to return -ENOMEM. + */ +struct task_struct * +spl_kthread_create(int (*func)(void *), void *data, const char namefmt[], ...) 
+{
+ struct task_struct *tsk;
+ va_list args;
+ char name[TASK_COMM_LEN];
+
+ va_start(args, namefmt);
+ vsnprintf(name, sizeof (name), namefmt, args);
+ va_end(args);
+ do {
+ tsk = kthread_create(func, data, "%s", name);
+ if (IS_ERR(tsk)) {
+ if (signal_pending(current)) {
+ clear_thread_flag(TIF_SIGPENDING);
+ continue;
+ }
+ if (PTR_ERR(tsk) == -ENOMEM)
+ continue;
+ return (NULL);
+ } else
+ return (tsk);
+ } while (1);
+}
+EXPORT_SYMBOL(spl_kthread_create);
diff --git a/module/spl/spl-tsd.c b/module/spl/spl-tsd.c
new file mode 100644
index 000000000..4c800292a
--- /dev/null
+++ b/module/spl/spl-tsd.c
@@ -0,0 +1,720 @@
+/*
+ * Copyright (C) 2010 Lawrence Livermore National Security, LLC.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <[email protected]>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ *
+ *
+ * Solaris Porting Layer (SPL) Thread Specific Data Implementation.
+ *
+ * Thread specific data has been implemented using a hash table; this
+ * avoids the need to add a member to the task structure and allows maximum
+ * portability between kernels. This implementation has been optimized
+ * to keep the tsd_set() and tsd_get() times as small as possible.
+ *
+ * The majority of the entries in the hash table are for specific tsd
+ * entries. These entries are hashed by the product of their key and
+ * pid because by design the key and pid are guaranteed to be unique.
+ * Their product also has the desirable property that it will be uniformly
+ * distributed over the hash bins provided neither the pid nor key is zero.
+ * Under Linux the zero pid is reserved for the idle task and thus won't be
+ * used, and this implementation is careful never to assign a zero key.
+ * By default the hash table is sized to 512 bins which is expected to
+ * be sufficient for light to moderate usage of thread specific data.
+ *
+ * The hash table contains two additional types of entries. The first
+ * type is called a 'key' entry and it is added to the hash during
+ * tsd_create(). It is used to store the address of the destructor function
+ * and it is used as an anchor point. All tsd entries which use the same
+ * key will be linked to this entry. This is used during tsd_destroy() to
+ * quickly call the destructor function for all tsd associated with the key.
+ * The 'key' entry may be looked up with tsd_hash_search() by passing the
+ * key you wish to lookup and the DTOR_PID constant as the pid.
+ *
+ * The second type of entry is called a 'pid' entry and it is added to the
+ * hash the first time a process sets a key. The 'pid' entry is also used
+ * as an anchor and all tsd for the process will be linked to it. This
+ * list is used during tsd_exit() to ensure all registered destructors
+ * are run for the process. 
The 'pid' entry may be looked up with
+ * tsd_hash_search() by passing the PID_KEY constant as the key, and
+ * the process pid. Note that tsd_exit() is called by thread_exit()
+ * so if you're using the Solaris thread API you should not need to call
+ * tsd_exit() directly.
+ *
+ */
+
+#include <sys/kmem.h>
+#include <sys/thread.h>
+#include <sys/tsd.h>
+#include <linux/hash.h>
+
+typedef struct tsd_hash_bin {
+ spinlock_t hb_lock;
+ struct hlist_head hb_head;
+} tsd_hash_bin_t;
+
+typedef struct tsd_hash_table {
+ spinlock_t ht_lock;
+ uint_t ht_bits;
+ uint_t ht_key;
+ tsd_hash_bin_t *ht_bins;
+} tsd_hash_table_t;
+
+typedef struct tsd_hash_entry {
+ uint_t he_key;
+ pid_t he_pid;
+ dtor_func_t he_dtor;
+ void *he_value;
+ struct hlist_node he_list;
+ struct list_head he_key_list;
+ struct list_head he_pid_list;
+} tsd_hash_entry_t;
+
+static tsd_hash_table_t *tsd_hash_table = NULL;
+
+
+/*
+ * tsd_hash_search - searches hash table for tsd_hash_entry
+ * @table: hash table
+ * @key: search key
+ * @pid: search pid
+ */
+static tsd_hash_entry_t *
+tsd_hash_search(tsd_hash_table_t *table, uint_t key, pid_t pid)
+{
+ struct hlist_node *node;
+ tsd_hash_entry_t *entry;
+ tsd_hash_bin_t *bin;
+ ulong_t hash;
+
+ hash = hash_long((ulong_t)key * (ulong_t)pid, table->ht_bits);
+ bin = &table->ht_bins[hash];
+ spin_lock(&bin->hb_lock);
+ hlist_for_each(node, &bin->hb_head) {
+ entry = list_entry(node, tsd_hash_entry_t, he_list);
+ if ((entry->he_key == key) && (entry->he_pid == pid)) {
+ spin_unlock(&bin->hb_lock);
+ return (entry);
+ }
+ }
+
+ spin_unlock(&bin->hb_lock);
+ return (NULL);
+}
+
+/*
+ * tsd_hash_dtor - call the destructor and free all entries on the list
+ * @work: list of hash entries
+ *
+ * For a list of entries which have all already been removed from the
+ * hash call their registered destructor then free the associated memory.
+ */
+static void
+tsd_hash_dtor(struct hlist_head *work)
+{
+ tsd_hash_entry_t *entry;
+
+ while (!hlist_empty(work)) {
+ entry = hlist_entry(work->first, tsd_hash_entry_t, he_list);
+ hlist_del(&entry->he_list);
+
+ if (entry->he_dtor && entry->he_pid != DTOR_PID)
+ entry->he_dtor(entry->he_value);
+
+ kmem_free(entry, sizeof (tsd_hash_entry_t));
+ }
+}
+
+/*
+ * tsd_hash_add - adds an entry to the hash table
+ * @table: hash table
+ * @key: search key
+ * @pid: search pid
+ *
+ * The caller is responsible for ensuring the unique key/pid do not
+ * already exist in the hash table. This is possible because all entries
+ * are thread specific, thus a concurrent thread will never attempt to
+ * add this key/pid. Because multiple bins must be checked to add
+ * links to the dtor and pid entries the entire table is locked. 
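+ *
+ * Returns 0 on success, or ENOMEM if the new entry cannot be
+ * allocated.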
+ */
+static int
+tsd_hash_add(tsd_hash_table_t *table, uint_t key, pid_t pid, void *value)
+{
+ tsd_hash_entry_t *entry, *dtor_entry, *pid_entry;
+ tsd_hash_bin_t *bin;
+ ulong_t hash;
+ int rc = 0;
+
+ ASSERT3P(tsd_hash_search(table, key, pid), ==, NULL);
+
+ /* New entry: allocate the structure, set value, and add to hash */
+ entry = kmem_alloc(sizeof (tsd_hash_entry_t), KM_PUSHPAGE);
+ if (entry == NULL)
+ return (ENOMEM);
+
+ entry->he_key = key;
+ entry->he_pid = pid;
+ entry->he_value = value;
+ INIT_HLIST_NODE(&entry->he_list);
+ INIT_LIST_HEAD(&entry->he_key_list);
+ INIT_LIST_HEAD(&entry->he_pid_list);
+
+ spin_lock(&table->ht_lock);
+
+ /* Destructor entry must exist for all valid keys */
+ dtor_entry = tsd_hash_search(table, entry->he_key, DTOR_PID);
+ ASSERT3P(dtor_entry, !=, NULL);
+ entry->he_dtor = dtor_entry->he_dtor;
+
+ /* Process entry must exist for all valid processes */
+ pid_entry = tsd_hash_search(table, PID_KEY, entry->he_pid);
+ ASSERT3P(pid_entry, !=, NULL);
+
+ hash = hash_long((ulong_t)key * (ulong_t)pid, table->ht_bits);
+ bin = &table->ht_bins[hash];
+ spin_lock(&bin->hb_lock);
+
+ /* Add to the hash, key, and pid lists */
+ hlist_add_head(&entry->he_list, &bin->hb_head);
+ list_add(&entry->he_key_list, &dtor_entry->he_key_list);
+ list_add(&entry->he_pid_list, &pid_entry->he_pid_list);
+
+ spin_unlock(&bin->hb_lock);
+ spin_unlock(&table->ht_lock);
+
+ return (rc);
+}
+
+/*
+ * tsd_hash_add_key - adds a destructor entry to the hash table
+ * @table: hash table
+ * @keyp: search key
+ * @dtor: key destructor
+ *
+ * For every unique key there is a single entry in the hash which is used
+ * as an anchor. All other thread specific entries for this key are linked
+ * to this anchor via the 'he_key_list' list head. On return the keyp
+ * will be set to the next available key for the hash table.
+ */
+static int
+tsd_hash_add_key(tsd_hash_table_t *table, uint_t *keyp, dtor_func_t dtor)
+{
+ tsd_hash_entry_t *tmp_entry, *entry;
+ tsd_hash_bin_t *bin;
+ ulong_t hash;
+ int keys_checked = 0;
+
+ ASSERT3P(table, !=, NULL);
+
+ /* Allocate entry to be used as a destructor for this key */
+ entry = kmem_alloc(sizeof (tsd_hash_entry_t), KM_PUSHPAGE);
+ if (entry == NULL)
+ return (ENOMEM);
+
+ /* Determine next available key value */
+ spin_lock(&table->ht_lock);
+ do {
+ /* Limited to TSD_KEYS_MAX concurrent unique keys */
+ if (table->ht_key++ > TSD_KEYS_MAX)
+ table->ht_key = 1;
+
+ /* Ensure failure when all TSD_KEYS_MAX keys are in use */
+ if (keys_checked++ >= TSD_KEYS_MAX) {
+ spin_unlock(&table->ht_lock);
+ return (ENOENT);
+ }
+
+ tmp_entry = tsd_hash_search(table, table->ht_key, DTOR_PID);
+ } while (tmp_entry);
+
+ /* Add destructor entry into the hash table */
+ entry->he_key = *keyp = table->ht_key;
+ entry->he_pid = DTOR_PID;
+ entry->he_dtor = dtor;
+ entry->he_value = NULL;
+ INIT_HLIST_NODE(&entry->he_list);
+ INIT_LIST_HEAD(&entry->he_key_list);
+ INIT_LIST_HEAD(&entry->he_pid_list);
+
+ hash = hash_long((ulong_t)*keyp * (ulong_t)DTOR_PID, table->ht_bits);
+ bin = &table->ht_bins[hash];
+ spin_lock(&bin->hb_lock);
+
+ hlist_add_head(&entry->he_list, &bin->hb_head);
+
+ spin_unlock(&bin->hb_lock);
+ spin_unlock(&table->ht_lock);
+
+ return (0);
+}
+
+/*
+ * tsd_hash_add_pid - adds a process entry to the hash table
+ * @table: hash table
+ * @pid: search pid
+ *
+ * For every process there is a single entry in the hash which is used
+ * as an anchor. All other thread specific entries for this process are
+ * linked to this anchor via the 'he_pid_list' list head. 
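+ *
+ * Returns 0 on success, or ENOMEM if the entry cannot be allocated.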
+ */ +static int +tsd_hash_add_pid(tsd_hash_table_t *table, pid_t pid) +{ + tsd_hash_entry_t *entry; + tsd_hash_bin_t *bin; + ulong_t hash; + + /* Allocate entry to be used as the process reference */ + entry = kmem_alloc(sizeof (tsd_hash_entry_t), KM_PUSHPAGE); + if (entry == NULL) + return (ENOMEM); + + spin_lock(&table->ht_lock); + entry->he_key = PID_KEY; + entry->he_pid = pid; + entry->he_dtor = NULL; + entry->he_value = NULL; + INIT_HLIST_NODE(&entry->he_list); + INIT_LIST_HEAD(&entry->he_key_list); + INIT_LIST_HEAD(&entry->he_pid_list); + + hash = hash_long((ulong_t)PID_KEY * (ulong_t)pid, table->ht_bits); + bin = &table->ht_bins[hash]; + spin_lock(&bin->hb_lock); + + hlist_add_head(&entry->he_list, &bin->hb_head); + + spin_unlock(&bin->hb_lock); + spin_unlock(&table->ht_lock); + + return (0); +} + +/* + * tsd_hash_del - delete an entry from hash table, key, and pid lists + * @table: hash table + * @key: search key + * @pid: search pid + */ +static void +tsd_hash_del(tsd_hash_table_t *table, tsd_hash_entry_t *entry) +{ + hlist_del(&entry->he_list); + list_del_init(&entry->he_key_list); + list_del_init(&entry->he_pid_list); +} + +/* + * tsd_hash_table_init - allocate a hash table + * @bits: hash table size + * + * A hash table with 2^bits bins will be created, it may not be resized + * after the fact and must be free'd with tsd_hash_table_fini(). + */ +static tsd_hash_table_t * +tsd_hash_table_init(uint_t bits) +{ + tsd_hash_table_t *table; + int hash, size = (1 << bits); + + table = kmem_zalloc(sizeof (tsd_hash_table_t), KM_SLEEP); + if (table == NULL) + return (NULL); + + table->ht_bins = kmem_zalloc(sizeof (tsd_hash_bin_t) * size, KM_SLEEP); + if (table->ht_bins == NULL) { + kmem_free(table, sizeof (tsd_hash_table_t)); + return (NULL); + } + + for (hash = 0; hash < size; hash++) { + spin_lock_init(&table->ht_bins[hash].hb_lock); + INIT_HLIST_HEAD(&table->ht_bins[hash].hb_head); + } + + spin_lock_init(&table->ht_lock); + table->ht_bits = bits; + table->ht_key = 1; + + return (table); +} + +/* + * tsd_hash_table_fini - free a hash table + * @table: hash table + * + * Free a hash table allocated by tsd_hash_table_init(). If the hash + * table is not empty this function will call the proper destructor for + * all remaining entries before freeing the memory used by those entries. + */ +static void +tsd_hash_table_fini(tsd_hash_table_t *table) +{ + HLIST_HEAD(work); + tsd_hash_bin_t *bin; + tsd_hash_entry_t *entry; + int size, i; + + ASSERT3P(table, !=, NULL); + spin_lock(&table->ht_lock); + for (i = 0, size = (1 << table->ht_bits); i < size; i++) { + bin = &table->ht_bins[i]; + spin_lock(&bin->hb_lock); + while (!hlist_empty(&bin->hb_head)) { + entry = hlist_entry(bin->hb_head.first, + tsd_hash_entry_t, he_list); + tsd_hash_del(table, entry); + hlist_add_head(&entry->he_list, &work); + } + spin_unlock(&bin->hb_lock); + } + spin_unlock(&table->ht_lock); + + tsd_hash_dtor(&work); + kmem_free(table->ht_bins, sizeof (tsd_hash_bin_t)*(1<<table->ht_bits)); + kmem_free(table, sizeof (tsd_hash_table_t)); +} + +/* + * tsd_remove_entry - remove a tsd entry for this thread + * @entry: entry to remove + * + * Remove the thread specific data @entry for this thread. + * If this is the last entry for this thread, also remove the PID entry. 
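+ *
+ * The removed entries are collected on a private list and destroyed
+ * via tsd_hash_dtor() only after all locks have been dropped.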
+ */ +static void +tsd_remove_entry(tsd_hash_entry_t *entry) +{ + HLIST_HEAD(work); + tsd_hash_table_t *table; + tsd_hash_entry_t *pid_entry; + tsd_hash_bin_t *pid_entry_bin, *entry_bin; + ulong_t hash; + + table = tsd_hash_table; + ASSERT3P(table, !=, NULL); + ASSERT3P(entry, !=, NULL); + + spin_lock(&table->ht_lock); + + hash = hash_long((ulong_t)entry->he_key * + (ulong_t)entry->he_pid, table->ht_bits); + entry_bin = &table->ht_bins[hash]; + + /* save the possible pid_entry */ + pid_entry = list_entry(entry->he_pid_list.next, tsd_hash_entry_t, + he_pid_list); + + /* remove entry */ + spin_lock(&entry_bin->hb_lock); + tsd_hash_del(table, entry); + hlist_add_head(&entry->he_list, &work); + spin_unlock(&entry_bin->hb_lock); + + /* if pid_entry is indeed pid_entry, then remove it if it's empty */ + if (pid_entry->he_key == PID_KEY && + list_empty(&pid_entry->he_pid_list)) { + hash = hash_long((ulong_t)pid_entry->he_key * + (ulong_t)pid_entry->he_pid, table->ht_bits); + pid_entry_bin = &table->ht_bins[hash]; + + spin_lock(&pid_entry_bin->hb_lock); + tsd_hash_del(table, pid_entry); + hlist_add_head(&pid_entry->he_list, &work); + spin_unlock(&pid_entry_bin->hb_lock); + } + + spin_unlock(&table->ht_lock); + + tsd_hash_dtor(&work); +} + +/* + * tsd_set - set thread specific data + * @key: lookup key + * @value: value to set + * + * Caller must prevent racing tsd_create() or tsd_destroy(), protected + * from racing tsd_get() or tsd_set() because it is thread specific. + * This function has been optimized to be fast for the update case. + * When setting the tsd initially it will be slower due to additional + * required locking and potential memory allocations. + */ +int +tsd_set(uint_t key, void *value) +{ + tsd_hash_table_t *table; + tsd_hash_entry_t *entry; + pid_t pid; + int rc; + /* mark remove if value is NULL */ + boolean_t remove = (value == NULL); + + table = tsd_hash_table; + pid = curthread->pid; + ASSERT3P(table, !=, NULL); + + if ((key == 0) || (key > TSD_KEYS_MAX)) + return (EINVAL); + + /* Entry already exists in hash table update value */ + entry = tsd_hash_search(table, key, pid); + if (entry) { + entry->he_value = value; + /* remove the entry */ + if (remove) + tsd_remove_entry(entry); + return (0); + } + + /* don't create entry if value is NULL */ + if (remove) + return (0); + + /* Add a process entry to the hash if not yet exists */ + entry = tsd_hash_search(table, PID_KEY, pid); + if (entry == NULL) { + rc = tsd_hash_add_pid(table, pid); + if (rc) + return (rc); + } + + rc = tsd_hash_add(table, key, pid, value); + return (rc); +} +EXPORT_SYMBOL(tsd_set); + +/* + * tsd_get - get thread specific data + * @key: lookup key + * + * Caller must prevent racing tsd_create() or tsd_destroy(). This + * implementation is designed to be fast and scalable, it does not + * lock the entire table only a single hash bin. + */ +void * +tsd_get(uint_t key) +{ + tsd_hash_entry_t *entry; + + ASSERT3P(tsd_hash_table, !=, NULL); + + if ((key == 0) || (key > TSD_KEYS_MAX)) + return (NULL); + + entry = tsd_hash_search(tsd_hash_table, key, curthread->pid); + if (entry == NULL) + return (NULL); + + return (entry->he_value); +} +EXPORT_SYMBOL(tsd_get); + +/* + * tsd_get_by_thread - get thread specific data for specified thread + * @key: lookup key + * @thread: thread to lookup + * + * Caller must prevent racing tsd_create() or tsd_destroy(). This + * implementation is designed to be fast and scalable, it does not + * lock the entire table only a single hash bin. 
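+ *
+ * For example, taskq_member() in spl-taskq.c is implemented with this
+ * interface:
+ *
+ *	return (tq == (taskq_t *)tsd_get_by_thread(taskq_tsd, t));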
+ */ +void * +tsd_get_by_thread(uint_t key, kthread_t *thread) +{ + tsd_hash_entry_t *entry; + + ASSERT3P(tsd_hash_table, !=, NULL); + + if ((key == 0) || (key > TSD_KEYS_MAX)) + return (NULL); + + entry = tsd_hash_search(tsd_hash_table, key, thread->pid); + if (entry == NULL) + return (NULL); + + return (entry->he_value); +} +EXPORT_SYMBOL(tsd_get_by_thread); + +/* + * tsd_create - create thread specific data key + * @keyp: lookup key address + * @dtor: destructor called during tsd_destroy() or tsd_exit() + * + * Provided key must be set to 0 or it assumed to be already in use. + * The dtor is allowed to be NULL in which case no additional cleanup + * for the data is performed during tsd_destroy() or tsd_exit(). + * + * Caller must prevent racing tsd_set() or tsd_get(), this function is + * safe from racing tsd_create(), tsd_destroy(), and tsd_exit(). + */ +void +tsd_create(uint_t *keyp, dtor_func_t dtor) +{ + ASSERT3P(keyp, !=, NULL); + if (*keyp) + return; + + (void) tsd_hash_add_key(tsd_hash_table, keyp, dtor); +} +EXPORT_SYMBOL(tsd_create); + +/* + * tsd_destroy - destroy thread specific data + * @keyp: lookup key address + * + * Destroys the thread specific data on all threads which use this key. + * + * Caller must prevent racing tsd_set() or tsd_get(), this function is + * safe from racing tsd_create(), tsd_destroy(), and tsd_exit(). + */ +void +tsd_destroy(uint_t *keyp) +{ + HLIST_HEAD(work); + tsd_hash_table_t *table; + tsd_hash_entry_t *dtor_entry, *entry; + tsd_hash_bin_t *dtor_entry_bin, *entry_bin; + ulong_t hash; + + table = tsd_hash_table; + ASSERT3P(table, !=, NULL); + + spin_lock(&table->ht_lock); + dtor_entry = tsd_hash_search(table, *keyp, DTOR_PID); + if (dtor_entry == NULL) { + spin_unlock(&table->ht_lock); + return; + } + + /* + * All threads which use this key must be linked off of the + * DTOR_PID entry. They are removed from the hash table and + * linked in to a private working list to be destroyed. + */ + while (!list_empty(&dtor_entry->he_key_list)) { + entry = list_entry(dtor_entry->he_key_list.next, + tsd_hash_entry_t, he_key_list); + ASSERT3U(dtor_entry->he_key, ==, entry->he_key); + ASSERT3P(dtor_entry->he_dtor, ==, entry->he_dtor); + + hash = hash_long((ulong_t)entry->he_key * + (ulong_t)entry->he_pid, table->ht_bits); + entry_bin = &table->ht_bins[hash]; + + spin_lock(&entry_bin->hb_lock); + tsd_hash_del(table, entry); + hlist_add_head(&entry->he_list, &work); + spin_unlock(&entry_bin->hb_lock); + } + + hash = hash_long((ulong_t)dtor_entry->he_key * + (ulong_t)dtor_entry->he_pid, table->ht_bits); + dtor_entry_bin = &table->ht_bins[hash]; + + spin_lock(&dtor_entry_bin->hb_lock); + tsd_hash_del(table, dtor_entry); + hlist_add_head(&dtor_entry->he_list, &work); + spin_unlock(&dtor_entry_bin->hb_lock); + spin_unlock(&table->ht_lock); + + tsd_hash_dtor(&work); + *keyp = 0; +} +EXPORT_SYMBOL(tsd_destroy); + +/* + * tsd_exit - destroys all thread specific data for this thread + * + * Destroys all the thread specific data for this thread. + * + * Caller must prevent racing tsd_set() or tsd_get(), this function is + * safe from racing tsd_create(), tsd_destroy(), and tsd_exit(). 
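+ *
+ * Note that tsd_exit() is invoked from thread_exit(), so code using
+ * the Solaris thread API normally never needs to call it directly.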
+ */ +void +tsd_exit(void) +{ + HLIST_HEAD(work); + tsd_hash_table_t *table; + tsd_hash_entry_t *pid_entry, *entry; + tsd_hash_bin_t *pid_entry_bin, *entry_bin; + ulong_t hash; + + table = tsd_hash_table; + ASSERT3P(table, !=, NULL); + + spin_lock(&table->ht_lock); + pid_entry = tsd_hash_search(table, PID_KEY, curthread->pid); + if (pid_entry == NULL) { + spin_unlock(&table->ht_lock); + return; + } + + /* + * All keys associated with this pid must be linked off of the + * PID_KEY entry. They are removed from the hash table and + * linked in to a private working list to be destroyed. + */ + + while (!list_empty(&pid_entry->he_pid_list)) { + entry = list_entry(pid_entry->he_pid_list.next, + tsd_hash_entry_t, he_pid_list); + ASSERT3U(pid_entry->he_pid, ==, entry->he_pid); + + hash = hash_long((ulong_t)entry->he_key * + (ulong_t)entry->he_pid, table->ht_bits); + entry_bin = &table->ht_bins[hash]; + + spin_lock(&entry_bin->hb_lock); + tsd_hash_del(table, entry); + hlist_add_head(&entry->he_list, &work); + spin_unlock(&entry_bin->hb_lock); + } + + hash = hash_long((ulong_t)pid_entry->he_key * + (ulong_t)pid_entry->he_pid, table->ht_bits); + pid_entry_bin = &table->ht_bins[hash]; + + spin_lock(&pid_entry_bin->hb_lock); + tsd_hash_del(table, pid_entry); + hlist_add_head(&pid_entry->he_list, &work); + spin_unlock(&pid_entry_bin->hb_lock); + spin_unlock(&table->ht_lock); + + tsd_hash_dtor(&work); +} +EXPORT_SYMBOL(tsd_exit); + +int +spl_tsd_init(void) +{ + tsd_hash_table = tsd_hash_table_init(TSD_HASH_TABLE_BITS_DEFAULT); + if (tsd_hash_table == NULL) + return (1); + + return (0); +} + +void +spl_tsd_fini(void) +{ + tsd_hash_table_fini(tsd_hash_table); + tsd_hash_table = NULL; +} diff --git a/module/spl/spl-vmem.c b/module/spl/spl-vmem.c new file mode 100644 index 000000000..e1a84a911 --- /dev/null +++ b/module/spl/spl-vmem.c @@ -0,0 +1,135 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf <[email protected]>. + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see <http://zfsonlinux.org/>. + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <sys/debug.h> +#include <sys/vmem.h> +#include <sys/kmem_cache.h> +#include <sys/shrinker.h> +#include <linux/module.h> + +vmem_t *heap_arena = NULL; +EXPORT_SYMBOL(heap_arena); + +vmem_t *zio_alloc_arena = NULL; +EXPORT_SYMBOL(zio_alloc_arena); + +vmem_t *zio_arena = NULL; +EXPORT_SYMBOL(zio_arena); + +#define VMEM_FLOOR_SIZE (4 * 1024 * 1024) /* 4MB floor */ + +/* + * Return approximate virtual memory usage based on these assumptions: + * + * 1) The major SPL consumer of virtual memory is the kmem cache. + * 2) Memory allocated with vmem_alloc() is short lived and can be ignored. 
+ * 3) Allow a 4MB floor as a generous pad given normal consumption. + * 4) The spl_kmem_cache_sem only contends with cache create/destroy. + */ +size_t +vmem_size(vmem_t *vmp, int typemask) +{ + spl_kmem_cache_t *skc; + size_t alloc = VMEM_FLOOR_SIZE; + + if ((typemask & VMEM_ALLOC) && (typemask & VMEM_FREE)) + return (VMALLOC_TOTAL); + + + down_read(&spl_kmem_cache_sem); + list_for_each_entry(skc, &spl_kmem_cache_list, skc_list) { + if (skc->skc_flags & KMC_VMEM) + alloc += skc->skc_slab_size * skc->skc_slab_total; + } + up_read(&spl_kmem_cache_sem); + + if (typemask & VMEM_ALLOC) + return (MIN(alloc, VMALLOC_TOTAL)); + else if (typemask & VMEM_FREE) + return (MAX(VMALLOC_TOTAL - alloc, 0)); + else + return (0); +} +EXPORT_SYMBOL(vmem_size); + +/* + * Public vmem_alloc(), vmem_zalloc() and vmem_free() interfaces. + */ +void * +spl_vmem_alloc(size_t size, int flags, const char *func, int line) +{ + ASSERT0(flags & ~KM_PUBLIC_MASK); + + flags |= KM_VMEM; + +#if !defined(DEBUG_KMEM) + return (spl_kmem_alloc_impl(size, flags, NUMA_NO_NODE)); +#elif !defined(DEBUG_KMEM_TRACKING) + return (spl_kmem_alloc_debug(size, flags, NUMA_NO_NODE)); +#else + return (spl_kmem_alloc_track(size, flags, func, line, NUMA_NO_NODE)); +#endif +} +EXPORT_SYMBOL(spl_vmem_alloc); + +void * +spl_vmem_zalloc(size_t size, int flags, const char *func, int line) +{ + ASSERT0(flags & ~KM_PUBLIC_MASK); + + flags |= (KM_VMEM | KM_ZERO); + +#if !defined(DEBUG_KMEM) + return (spl_kmem_alloc_impl(size, flags, NUMA_NO_NODE)); +#elif !defined(DEBUG_KMEM_TRACKING) + return (spl_kmem_alloc_debug(size, flags, NUMA_NO_NODE)); +#else + return (spl_kmem_alloc_track(size, flags, func, line, NUMA_NO_NODE)); +#endif +} +EXPORT_SYMBOL(spl_vmem_zalloc); + +void +spl_vmem_free(const void *buf, size_t size) +{ +#if !defined(DEBUG_KMEM) + return (spl_kmem_free_impl(buf, size)); +#elif !defined(DEBUG_KMEM_TRACKING) + return (spl_kmem_free_debug(buf, size)); +#else + return (spl_kmem_free_track(buf, size)); +#endif +} +EXPORT_SYMBOL(spl_vmem_free); + +int +spl_vmem_init(void) +{ + return (0); +} + +void +spl_vmem_fini(void) +{ +} diff --git a/module/spl/spl-vnode.c b/module/spl/spl-vnode.c new file mode 100644 index 000000000..28ce21276 --- /dev/null +++ b/module/spl/spl-vnode.c @@ -0,0 +1,779 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf <[email protected]>. + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see <http://zfsonlinux.org/>. + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see <http://www.gnu.org/licenses/>. + * + * Solaris Porting Layer (SPL) Vnode Implementation. 
+ */ + +#include <sys/cred.h> +#include <sys/vnode.h> +#include <sys/kmem_cache.h> +#include <linux/falloc.h> +#include <linux/fs.h> +#include <linux/uaccess.h> +#ifdef HAVE_FDTABLE_HEADER +#include <linux/fdtable.h> +#endif + +vnode_t *rootdir = (vnode_t *)0xabcd1234; +EXPORT_SYMBOL(rootdir); + +static spl_kmem_cache_t *vn_cache; +static spl_kmem_cache_t *vn_file_cache; + +static DEFINE_SPINLOCK(vn_file_lock); +static LIST_HEAD(vn_file_list); + +static int +spl_filp_fallocate(struct file *fp, int mode, loff_t offset, loff_t len) +{ + int error = -EOPNOTSUPP; + +#ifdef HAVE_FILE_FALLOCATE + if (fp->f_op->fallocate) + error = fp->f_op->fallocate(fp, mode, offset, len); +#else +#ifdef HAVE_INODE_FALLOCATE + if (fp->f_dentry && fp->f_dentry->d_inode && + fp->f_dentry->d_inode->i_op->fallocate) + error = fp->f_dentry->d_inode->i_op->fallocate( + fp->f_dentry->d_inode, mode, offset, len); +#endif /* HAVE_INODE_FALLOCATE */ +#endif /* HAVE_FILE_FALLOCATE */ + + return (error); +} + +static int +spl_filp_fsync(struct file *fp, int sync) +{ +#ifdef HAVE_2ARGS_VFS_FSYNC + return (vfs_fsync(fp, sync)); +#else + return (vfs_fsync(fp, (fp)->f_dentry, sync)); +#endif /* HAVE_2ARGS_VFS_FSYNC */ +} + +static ssize_t +spl_kernel_write(struct file *file, const void *buf, size_t count, loff_t *pos) +{ +#if defined(HAVE_KERNEL_WRITE_PPOS) + return (kernel_write(file, buf, count, pos)); +#else + mm_segment_t saved_fs; + ssize_t ret; + + saved_fs = get_fs(); + set_fs(get_ds()); + + ret = vfs_write(file, (__force const char __user *)buf, count, pos); + + set_fs(saved_fs); + + return (ret); +#endif +} + +static ssize_t +spl_kernel_read(struct file *file, void *buf, size_t count, loff_t *pos) +{ +#if defined(HAVE_KERNEL_READ_PPOS) + return (kernel_read(file, buf, count, pos)); +#else + mm_segment_t saved_fs; + ssize_t ret; + + saved_fs = get_fs(); + set_fs(get_ds()); + + ret = vfs_read(file, (void __user *)buf, count, pos); + + set_fs(saved_fs); + + return (ret); +#endif +} + +vtype_t +vn_mode_to_vtype(mode_t mode) +{ + if (S_ISREG(mode)) + return (VREG); + + if (S_ISDIR(mode)) + return (VDIR); + + if (S_ISCHR(mode)) + return (VCHR); + + if (S_ISBLK(mode)) + return (VBLK); + + if (S_ISFIFO(mode)) + return (VFIFO); + + if (S_ISLNK(mode)) + return (VLNK); + + if (S_ISSOCK(mode)) + return (VSOCK); + + return (VNON); +} /* vn_mode_to_vtype() */ +EXPORT_SYMBOL(vn_mode_to_vtype); + +mode_t +vn_vtype_to_mode(vtype_t vtype) +{ + if (vtype == VREG) + return (S_IFREG); + + if (vtype == VDIR) + return (S_IFDIR); + + if (vtype == VCHR) + return (S_IFCHR); + + if (vtype == VBLK) + return (S_IFBLK); + + if (vtype == VFIFO) + return (S_IFIFO); + + if (vtype == VLNK) + return (S_IFLNK); + + if (vtype == VSOCK) + return (S_IFSOCK); + + return (VNON); +} /* vn_vtype_to_mode() */ +EXPORT_SYMBOL(vn_vtype_to_mode); + +vnode_t * +vn_alloc(int flag) +{ + vnode_t *vp; + + vp = kmem_cache_alloc(vn_cache, flag); + if (vp != NULL) { + vp->v_file = NULL; + vp->v_type = 0; + } + + return (vp); +} /* vn_alloc() */ +EXPORT_SYMBOL(vn_alloc); + +void +vn_free(vnode_t *vp) +{ + kmem_cache_free(vn_cache, vp); +} /* vn_free() */ +EXPORT_SYMBOL(vn_free); + +int +vn_open(const char *path, uio_seg_t seg, int flags, int mode, vnode_t **vpp, + int x1, void *x2) +{ + struct file *fp; + struct kstat stat; + int rc, saved_umask = 0; + gfp_t saved_gfp; + vnode_t *vp; + + ASSERT(flags & (FWRITE | FREAD)); + ASSERT(seg == UIO_SYSSPACE); + ASSERT(vpp); + *vpp = NULL; + + if (!(flags & FCREAT) && (flags & FWRITE)) + flags |= FEXCL; + + /* + * Note for 
filp_open(), the two low bits must be remapped to mean:
+	 * 01 - read-only  -> 00 read-only
+	 * 10 - write-only -> 01 write-only
+	 * 11 - read-write -> 10 read-write
+	 */
+	flags--;
+
+	if (flags & FCREAT)
+		saved_umask = xchg(&current->fs->umask, 0);
+
+	fp = filp_open(path, flags, mode);
+
+	if (flags & FCREAT)
+		(void) xchg(&current->fs->umask, saved_umask);
+
+	if (IS_ERR(fp))
+		return (-PTR_ERR(fp));
+
+#if defined(HAVE_4ARGS_VFS_GETATTR)
+	rc = vfs_getattr(&fp->f_path, &stat, STATX_TYPE,
+	    AT_STATX_SYNC_AS_STAT);
+#elif defined(HAVE_2ARGS_VFS_GETATTR)
+	rc = vfs_getattr(&fp->f_path, &stat);
+#else
+	rc = vfs_getattr(fp->f_path.mnt, fp->f_dentry, &stat);
+#endif
+	if (rc) {
+		filp_close(fp, 0);
+		return (-rc);
+	}
+
+	vp = vn_alloc(KM_SLEEP);
+	if (!vp) {
+		filp_close(fp, 0);
+		return (ENOMEM);
+	}
+
+	saved_gfp = mapping_gfp_mask(fp->f_mapping);
+	mapping_set_gfp_mask(fp->f_mapping, saved_gfp & ~(__GFP_IO|__GFP_FS));
+
+	mutex_enter(&vp->v_lock);
+	vp->v_type = vn_mode_to_vtype(stat.mode);
+	vp->v_file = fp;
+	vp->v_gfp_mask = saved_gfp;
+	*vpp = vp;
+	mutex_exit(&vp->v_lock);
+
+	return (0);
+} /* vn_open() */
+EXPORT_SYMBOL(vn_open);
+
+int
+vn_openat(const char *path, uio_seg_t seg, int flags, int mode,
+    vnode_t **vpp, int x1, void *x2, vnode_t *vp, int fd)
+{
+	char *realpath;
+	int len, rc;
+
+	ASSERT(vp == rootdir);
+
+	len = strlen(path) + 2;
+	realpath = kmalloc(len, kmem_flags_convert(KM_SLEEP));
+	if (!realpath)
+		return (ENOMEM);
+
+	(void) snprintf(realpath, len, "/%s", path);
+	rc = vn_open(realpath, seg, flags, mode, vpp, x1, x2);
+	kfree(realpath);
+
+	return (rc);
+} /* vn_openat() */
+EXPORT_SYMBOL(vn_openat);
+
+int
+vn_rdwr(uio_rw_t uio, vnode_t *vp, void *addr, ssize_t len, offset_t off,
+    uio_seg_t seg, int ioflag, rlim64_t x2, void *x3, ssize_t *residp)
+{
+	struct file *fp = vp->v_file;
+	loff_t offset = off;
+	int rc;
+
+	ASSERT(uio == UIO_WRITE || uio == UIO_READ);
+	ASSERT(seg == UIO_SYSSPACE);
+	ASSERT((ioflag & ~FAPPEND) == 0);
+
+	if (ioflag & FAPPEND)
+		offset = fp->f_pos;
+
+	if (uio & UIO_WRITE)
+		rc = spl_kernel_write(fp, addr, len, &offset);
+	else
+		rc = spl_kernel_read(fp, addr, len, &offset);
+
+	fp->f_pos = offset;
+
+	if (rc < 0)
+		return (-rc);
+
+	if (residp) {
+		*residp = len - rc;
+	} else {
+		if (rc != len)
+			return (EIO);
+	}
+
+	return (0);
+} /* vn_rdwr() */
+EXPORT_SYMBOL(vn_rdwr);
+
+int
+vn_close(vnode_t *vp, int flags, int x1, int x2, void *x3, void *x4)
+{
+	int rc;
+
+	ASSERT(vp);
+	ASSERT(vp->v_file);
+
+	mapping_set_gfp_mask(vp->v_file->f_mapping, vp->v_gfp_mask);
+	rc = filp_close(vp->v_file, 0);
+	vn_free(vp);
+
+	return (-rc);
+} /* vn_close() */
+EXPORT_SYMBOL(vn_close);
+
+/*
+ * vn_seek() does not actually seek; it only performs bounds checking on the
+ * proposed seek.  We perform minimal checking and allow vn_rdwr() to catch
+ * anything more serious.
+ */
+int
+vn_seek(vnode_t *vp, offset_t ooff, offset_t *noffp, void *ct)
+{
+	return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0);
+}
+EXPORT_SYMBOL(vn_seek);
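For illustration, a hedged sketch (not part of the diff) exercising the Solaris-style file API above from kernel context; the path, payload, and wrapper name are illustrative only:

static int
vn_example(void)
{
	vnode_t *vp;
	char buf[16] = "grok";
	ssize_t resid = 0;
	int rc;

	/* Create (or truncate) the file read-write. */
	rc = vn_open("/tmp/spl-example", UIO_SYSSPACE,
	    FWRITE | FREAD | FCREAT, 0644, &vp, 0, NULL);
	if (rc)
		return (rc);

	/* Write 4 bytes at offset 0; resid reports any shortfall. */
	rc = vn_rdwr(UIO_WRITE, vp, buf, 4, 0, UIO_SYSSPACE, 0,
	    RLIM64_INFINITY, NULL, &resid);
	if (rc == 0)
		rc = vn_rdwr(UIO_READ, vp, buf, 4, 0, UIO_SYSSPACE, 0,
		    RLIM64_INFINITY, NULL, &resid);

	(void) vn_close(vp, 0, 0, 0, NULL, NULL);
	return (rc);
}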
+
+int
+vn_getattr(vnode_t *vp, vattr_t *vap, int flags, void *x3, void *x4)
+{
+	struct file *fp;
+	struct kstat stat;
+	int rc;
+
+	ASSERT(vp);
+	ASSERT(vp->v_file);
+	ASSERT(vap);
+
+	fp = vp->v_file;
+
+#if defined(HAVE_4ARGS_VFS_GETATTR)
+	rc = vfs_getattr(&fp->f_path, &stat, STATX_BASIC_STATS,
+	    AT_STATX_SYNC_AS_STAT);
+#elif defined(HAVE_2ARGS_VFS_GETATTR)
+	rc = vfs_getattr(&fp->f_path, &stat);
+#else
+	rc = vfs_getattr(fp->f_path.mnt, fp->f_dentry, &stat);
+#endif
+	if (rc)
+		return (-rc);
+
+	vap->va_type = vn_mode_to_vtype(stat.mode);
+	vap->va_mode = stat.mode;
+	vap->va_uid = KUID_TO_SUID(stat.uid);
+	vap->va_gid = KGID_TO_SGID(stat.gid);
+	vap->va_fsid = 0;
+	vap->va_nodeid = stat.ino;
+	vap->va_nlink = stat.nlink;
+	vap->va_size = stat.size;
+	vap->va_blksize = stat.blksize;
+	vap->va_atime = stat.atime;
+	vap->va_mtime = stat.mtime;
+	vap->va_ctime = stat.ctime;
+	vap->va_rdev = stat.rdev;
+	vap->va_nblocks = stat.blocks;
+
+	return (0);
+}
+EXPORT_SYMBOL(vn_getattr);
+
+int
+vn_fsync(vnode_t *vp, int flags, void *x3, void *x4)
+{
+	int datasync = 0;
+	int error;
+	int fstrans;
+
+	ASSERT(vp);
+	ASSERT(vp->v_file);
+
+	if (flags & FDSYNC)
+		datasync = 1;
+
+	/*
+	 * May enter XFS which generates a warning when PF_FSTRANS is set.
+	 * To avoid this the flag is cleared over vfs_fsync() and then reset.
+	 */
+	fstrans = __spl_pf_fstrans_check();
+	if (fstrans)
+		current->flags &= ~(__SPL_PF_FSTRANS);
+
+	error = -spl_filp_fsync(vp->v_file, datasync);
+	if (fstrans)
+		current->flags |= __SPL_PF_FSTRANS;
+
+	return (error);
+} /* vn_fsync() */
+EXPORT_SYMBOL(vn_fsync);
+
+int
+vn_space(vnode_t *vp, int cmd, struct flock *bfp, int flag,
+    offset_t offset, void *x6, void *x7)
+{
+	int error = EOPNOTSUPP;
+#ifdef FALLOC_FL_PUNCH_HOLE
+	int fstrans;
+#endif
+
+	if (cmd != F_FREESP || bfp->l_whence != 0)
+		return (EOPNOTSUPP);
+
+	ASSERT(vp);
+	ASSERT(vp->v_file);
+	ASSERT(bfp->l_start >= 0 && bfp->l_len > 0);
+
+#ifdef FALLOC_FL_PUNCH_HOLE
+	/*
+	 * May enter XFS which generates a warning when PF_FSTRANS is set.
+	 * To avoid this the flag is cleared over the fallocate() call and
+	 * then reset.
+	 */
+	fstrans = __spl_pf_fstrans_check();
+	if (fstrans)
+		current->flags &= ~(__SPL_PF_FSTRANS);
+
+	/*
+	 * When supported by the underlying file system, preferentially
+	 * use the fallocate() callback to deallocate the space.
+	 */
+	error = -spl_filp_fallocate(vp->v_file,
+	    FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE,
+	    bfp->l_start, bfp->l_len);
+
+	if (fstrans)
+		current->flags |= __SPL_PF_FSTRANS;
+
+	if (error == 0)
+		return (0);
+#endif
+
+#ifdef HAVE_INODE_TRUNCATE_RANGE
+	if (vp->v_file->f_dentry && vp->v_file->f_dentry->d_inode &&
+	    vp->v_file->f_dentry->d_inode->i_op &&
+	    vp->v_file->f_dentry->d_inode->i_op->truncate_range) {
+		off_t end = bfp->l_start + bfp->l_len;
+		/*
+		 * Judging from the code in shmem_truncate_range(),
+		 * it seems the kernel expects the end offset to be
+		 * inclusive and aligned to the end of a page.
+		 */
+		if (end % PAGE_SIZE != 0) {
+			end &= ~(off_t)(PAGE_SIZE - 1);
+			if (end <= bfp->l_start)
+				return (0);
+		}
+		--end;
+
+		vp->v_file->f_dentry->d_inode->i_op->truncate_range(
+		    vp->v_file->f_dentry->d_inode, bfp->l_start, end);
+
+		return (0);
+	}
+#endif
+
+	return (error);
+}
+EXPORT_SYMBOL(vn_space);
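A hedged sketch (not part of the diff) of how a caller might free a byte range through vn_space(); the vnode is assumed to have been obtained with vn_open() as shown earlier:

/* Punch a 1 MiB hole at offset 0 without changing the file size. */
static int
vn_space_example(vnode_t *vp)
{
	struct flock bf = {
		.l_whence = 0,		/* offsets are absolute */
		.l_start = 0,
		.l_len = 1024 * 1024,
	};

	/* F_FREESP deallocates the range described by bf. */
	return (vn_space(vp, F_FREESP, &bf, 0, 0, NULL, NULL));
}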
+
+/* Function must be called while holding the vn_file_lock */
+static file_t *
+file_find(int fd, struct task_struct *task)
+{
+	file_t *fp;
+
+	list_for_each_entry(fp, &vn_file_list, f_list) {
+		if (fd == fp->f_fd && fp->f_task == task) {
+			ASSERT(atomic_read(&fp->f_ref) != 0);
+			return (fp);
+		}
+	}
+
+	return (NULL);
+} /* file_find() */
+
+file_t *
+vn_getf(int fd)
+{
+	struct kstat stat;
+	struct file *lfp;
+	file_t *fp;
+	vnode_t *vp;
+	int rc = 0;
+
+	if (fd < 0)
+		return (NULL);
+
+	/* Already open; just take an extra reference */
+	spin_lock(&vn_file_lock);
+
+	fp = file_find(fd, current);
+	if (fp) {
+		lfp = fget(fd);
+		fput(fp->f_file);
+		/*
+		 * areleasef() can cause us to see a stale reference when
+		 * userspace has reused a file descriptor before areleasef()
+		 * has run.  fput() the stale reference and replace it.  We
+		 * retain the original reference count such that the
+		 * concurrent areleasef() will decrement its reference and
+		 * terminate.
+		 */
+		if (lfp != fp->f_file) {
+			fp->f_file = lfp;
+			fp->f_vnode->v_file = lfp;
+		}
+		atomic_inc(&fp->f_ref);
+		spin_unlock(&vn_file_lock);
+		return (fp);
+	}
+
+	spin_unlock(&vn_file_lock);
+
+	/* File was not yet opened; create the object and set it up */
+	fp = kmem_cache_alloc(vn_file_cache, KM_SLEEP);
+	if (fp == NULL)
+		goto out;
+
+	mutex_enter(&fp->f_lock);
+
+	fp->f_fd = fd;
+	fp->f_task = current;
+	fp->f_offset = 0;
+	atomic_inc(&fp->f_ref);
+
+	lfp = fget(fd);
+	if (lfp == NULL)
+		goto out_mutex;
+
+	vp = vn_alloc(KM_SLEEP);
+	if (vp == NULL)
+		goto out_fget;
+
+#if defined(HAVE_4ARGS_VFS_GETATTR)
+	rc = vfs_getattr(&lfp->f_path, &stat, STATX_TYPE,
+	    AT_STATX_SYNC_AS_STAT);
+#elif defined(HAVE_2ARGS_VFS_GETATTR)
+	rc = vfs_getattr(&lfp->f_path, &stat);
+#else
+	rc = vfs_getattr(lfp->f_path.mnt, lfp->f_dentry, &stat);
+#endif
+	if (rc)
+		goto out_vnode;
+
+	mutex_enter(&vp->v_lock);
+	vp->v_type = vn_mode_to_vtype(stat.mode);
+	vp->v_file = lfp;
+	mutex_exit(&vp->v_lock);
+
+	fp->f_vnode = vp;
+	fp->f_file = lfp;
+
+	/* Put it on the tracking list */
+	spin_lock(&vn_file_lock);
+	list_add(&fp->f_list, &vn_file_list);
+	spin_unlock(&vn_file_lock);
+
+	mutex_exit(&fp->f_lock);
+	return (fp);
+
+out_vnode:
+	vn_free(vp);
+out_fget:
+	fput(lfp);
+out_mutex:
+	mutex_exit(&fp->f_lock);
+	kmem_cache_free(vn_file_cache, fp);
+out:
+	return (NULL);
+} /* getf() */
+EXPORT_SYMBOL(getf);
+
+static void
+releasef_locked(file_t *fp)
+{
+	ASSERT(fp->f_file);
+	ASSERT(fp->f_vnode);
+
+	/* Unlinked from list, no refs, safe to free outside mutex */
+	fput(fp->f_file);
+	vn_free(fp->f_vnode);
+
+	kmem_cache_free(vn_file_cache, fp);
+}
+
+void
+vn_releasef(int fd)
+{
+	areleasef(fd, P_FINFO(current));
+}
+EXPORT_SYMBOL(releasef);
+
+void
+vn_areleasef(int fd, uf_info_t *fip)
+{
+	file_t *fp;
+	struct task_struct *task = (struct task_struct *)fip;
+
+	if (fd < 0)
+		return;
+
+	spin_lock(&vn_file_lock);
+	fp = file_find(fd, task);
+	if (fp) {
+		atomic_dec(&fp->f_ref);
+		if (atomic_read(&fp->f_ref) > 0) {
+			spin_unlock(&vn_file_lock);
+			return;
+		}
+
+		list_del(&fp->f_list);
+		releasef_locked(fp);
+	}
+	spin_unlock(&vn_file_lock);
+} /* releasef() */
+EXPORT_SYMBOL(areleasef);
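A hedged sketch (not part of the diff) of the getf()/releasef() pairing implemented above; the descriptor is assumed to arrive from userspace, e.g. via an ioctl, and the function name is hypothetical:

static int
getf_example(int fd)
{
	file_t *fp;

	/* Translate a userspace fd into a tracked file_t reference. */
	fp = vn_getf(fd);
	if (fp == NULL)
		return (EBADF);

	/* ... use fp->f_vnode with vn_rdwr(), vn_getattr(), etc. ... */

	/* Drop the reference; the last put tears down the file_t. */
	vn_releasef(fd);

	return (0);
}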
+
+static void
+#ifdef HAVE_SET_FS_PWD_WITH_CONST
+vn_set_fs_pwd(struct fs_struct *fs, const struct path *path)
+#else
+vn_set_fs_pwd(struct fs_struct *fs, struct path *path)
+#endif /* HAVE_SET_FS_PWD_WITH_CONST */
+{
+	struct path old_pwd;
+
+#ifdef HAVE_FS_STRUCT_SPINLOCK
+	spin_lock(&fs->lock);
+	old_pwd = fs->pwd;
+	fs->pwd = *path;
+	path_get(path);
+	spin_unlock(&fs->lock);
+#else
+	write_lock(&fs->lock);
+	old_pwd = fs->pwd;
+	fs->pwd = *path;
+	path_get(path);
+	write_unlock(&fs->lock);
+#endif /* HAVE_FS_STRUCT_SPINLOCK */
+
+	if (old_pwd.dentry)
+		path_put(&old_pwd);
+}
+
+int
+vn_set_pwd(const char *filename)
+{
+	struct path path;
+	mm_segment_t saved_fs;
+	int rc;
+
+	/*
+	 * user_path_dir() and __user_walk() both expect 'filename' to be
+	 * a user space address so we must briefly increase the data segment
+	 * size to ensure strncpy_from_user() does not fail with -EFAULT.
+	 */
+	saved_fs = get_fs();
+	set_fs(get_ds());
+
+	rc = user_path_dir(filename, &path);
+	if (rc)
+		goto out;
+
+	rc = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_ACCESS);
+	if (rc)
+		goto dput_and_out;
+
+	vn_set_fs_pwd(current->fs, &path);
+
+dput_and_out:
+	path_put(&path);
+out:
+	set_fs(saved_fs);
+
+	return (-rc);
+} /* vn_set_pwd() */
+EXPORT_SYMBOL(vn_set_pwd);
+
+static int
+vn_cache_constructor(void *buf, void *cdrarg, int kmflags)
+{
+	struct vnode *vp = buf;
+
+	mutex_init(&vp->v_lock, NULL, MUTEX_DEFAULT, NULL);
+
+	return (0);
+} /* vn_cache_constructor() */
+
+static void
+vn_cache_destructor(void *buf, void *cdrarg)
+{
+	struct vnode *vp = buf;
+
+	mutex_destroy(&vp->v_lock);
+} /* vn_cache_destructor() */
+
+static int
+vn_file_cache_constructor(void *buf, void *cdrarg, int kmflags)
+{
+	file_t *fp = buf;
+
+	atomic_set(&fp->f_ref, 0);
+	mutex_init(&fp->f_lock, NULL, MUTEX_DEFAULT, NULL);
+	INIT_LIST_HEAD(&fp->f_list);
+
+	return (0);
+} /* vn_file_cache_constructor() */
+
+static void
+vn_file_cache_destructor(void *buf, void *cdrarg)
+{
+	file_t *fp = buf;
+
+	mutex_destroy(&fp->f_lock);
+} /* vn_file_cache_destructor() */
+
+int
+spl_vn_init(void)
+{
+	vn_cache = kmem_cache_create("spl_vn_cache",
+	    sizeof (struct vnode), 64, vn_cache_constructor,
+	    vn_cache_destructor, NULL, NULL, NULL, 0);
+
+	vn_file_cache = kmem_cache_create("spl_vn_file_cache",
+	    sizeof (file_t), 64, vn_file_cache_constructor,
+	    vn_file_cache_destructor, NULL, NULL, NULL, 0);
+
+	return (0);
+} /* spl_vn_init() */
+
+void
+spl_vn_fini(void)
+{
+	file_t *fp, *next_fp;
+	int leaked = 0;
+
+	spin_lock(&vn_file_lock);
+
+	list_for_each_entry_safe(fp, next_fp, &vn_file_list, f_list) {
+		list_del(&fp->f_list);
+		releasef_locked(fp);
+		leaked++;
+	}
+
+	spin_unlock(&vn_file_lock);
+
+	if (leaked > 0)
+		printk(KERN_WARNING "WARNING: %d vnode files leaked\n",
+		    leaked);
+
+	kmem_cache_destroy(vn_file_cache);
+	kmem_cache_destroy(vn_cache);
+} /* spl_vn_fini() */
diff --git a/module/spl/spl-xdr.c b/module/spl/spl-xdr.c
new file mode 100644
index 000000000..2cc3e2a03
--- /dev/null
+++ b/module/spl/spl-xdr.c
@@ -0,0 +1,515 @@
+/*
+ * Copyright (c) 2008-2010 Sun Microsystems, Inc.
+ * Written by Ricardo Correia <[email protected]>
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Solaris Porting Layer (SPL) XDR Implementation.
+ */
+
+#include <linux/string.h>
+#include <sys/kmem.h>
+#include <sys/debug.h>
+#include <sys/types.h>
+#include <sys/sysmacros.h>
+#include <rpc/xdr.h>
+
+/*
+ * SPL's XDR mem implementation.
+ *
+ * This is used by libnvpair to serialize/deserialize the name-value pair data
+ * structures into byte arrays in a well-defined and portable manner.
+ *
+ * These data structures are used by the DMU/ZFS to flexibly manipulate
+ * various information in memory and later serialize it/deserialize it to
+ * disk.  Examples of usages include the pool configuration, lists of pool
+ * and dataset properties, etc.
+ *
+ * Reference documentation for the XDR representation and XDR operations can
+ * be found in RFC 1832 and xdr(3), respectively.
+ *
+ * === Implementation shortcomings ===
+ *
+ * It is assumed that the following C types have the following sizes:
+ *
+ * char/unsigned char:      1 byte
+ * short/unsigned short:    2 bytes
+ * int/unsigned int:        4 bytes
+ * longlong_t/u_longlong_t: 8 bytes
+ *
+ * The C standard allows these types to be larger (and in the case of ints,
+ * shorter), so if that is the case on some compiler/architecture, the build
+ * will fail (on purpose).
+ *
+ * If someone wants to fix the code to work properly in such environments,
+ * then:
+ *
+ * 1) Preconditions should be added to xdrmem_enc functions to make sure the
+ * caller doesn't pass arguments which exceed the expected range.
+ * 2) Functions which take signed integers should be changed to properly do
+ * sign extension.
+ * 3) For ints with less than 32 bits, well.. I suspect you'll have bigger
+ * problems than this implementation.
+ *
+ * It is also assumed that:
+ *
+ * 1) Chars have 8 bits.
+ * 2) We can always do 32-bit-aligned int memory accesses and byte-aligned
+ * memcpy, memset and memcmp.
+ * 3) Arrays passed to xdr_array() are packed and the compiler/architecture
+ * supports element-sized-aligned memory accesses.
+ * 4) Negative integers are natively stored in two's complement binary
+ * representation.
+ *
+ * No checks are done for the 4 assumptions above, though.
+ *
+ * === Caller expectations ===
+ *
+ * Existing documentation does not describe the semantics of XDR operations
+ * very well, so the following assumptions about failure semantics are made
+ * and described below:
+ *
+ * 1) If any encoding operation fails (e.g., due to lack of buffer space),
+ * the stream should be considered valid only up to the encoding operation
+ * previous to the one that first failed.  However, the stream size as
+ * returned by xdr_control() cannot be considered to be strictly correct (it
+ * may be bigger).
+ *
+ * Putting it another way, if there is an encoding failure it's undefined
+ * whether anything is added to the stream in that operation and therefore
+ * neither xdr_control() nor future encoding operations on the same stream
+ * can be relied upon to produce correct results.
+ *
+ * 2) If a decoding operation fails, it's undefined whether anything will be
+ * decoded into passed buffers/pointers during that operation, or what the
+ * values on those buffers will look like.
+ *
+ * Future decoding operations on the same stream will also have similar
+ * undefined behavior.
+ *
+ * 3) When the first decoding operation fails, it is OK to trust the results
+ * of previous decoding operations on the same stream, as long as the caller
+ * expects a failure to be possible (e.g. due to end-of-stream).
+ *
+ * However, this is highly discouraged because the caller should know the
+ * stream size and should be coded to expect any decoding failure to be data
+ * corruption due to hardware, accidental or even malicious causes, which
+ * should be handled gracefully in all cases.
+ *
+ * In very rare situations where there are strong reasons to believe the
+ * data can be trusted to be valid and non-tampered with, then the caller
+ * may assume a decoding failure to be a bug (e.g. due to mismatched data
+ * types) and may fail non-gracefully.
+ *
+ * 4) Non-zero padding bytes will cause the decoding operation to fail.
+ *
+ * 5) Zero bytes on string types will also cause the decoding operation to
+ * fail.
+ *
+ * 6) It is assumed that either the pointer to the stream buffer given by
+ * the caller is 32-bit aligned or the architecture supports
+ * non-32-bit-aligned int memory accesses.
+ *
+ * 7) The stream buffer and encoding/decoding buffers/ptrs should not
+ * overlap.
+ *
+ * 8) If a caller passes pointers to non-kernel memory (e.g., pointers to
+ * user space or MMIO space), the computer may explode.
+ */
+
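To make these semantics concrete, a hedged sketch (not part of the diff) of a typical round trip; it assumes wrappers such as xdr_u_int() and xdr_string() that dispatch through the x_ops table below, as the Solaris XDR API does:

static int
xdr_example(void)
{
	char buf[64];
	unsigned val = 42;
	char *msg = "grok";
	char *out = NULL;	/* NULL asks xdr_string() to allocate */
	XDR xs;

	/* Encode an unsigned int followed by a string into buf. */
	xdrmem_create(&xs, buf, sizeof (buf), XDR_ENCODE);
	if (!xdr_u_int(&xs, &val) || !xdr_string(&xs, &msg, 16))
		return (EINVAL);	/* stream unreliable past failure */

	/* Decode them back out of the same buffer. */
	xdrmem_create(&xs, buf, sizeof (buf), XDR_DECODE);
	if (!xdr_u_int(&xs, &val) || !xdr_string(&xs, &out, 16))
		return (EINVAL);

	kmem_free(out, strlen(out) + 1);
	return (0);
}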
+static struct xdr_ops xdrmem_encode_ops;
+static struct xdr_ops xdrmem_decode_ops;
+
+typedef int bool_t;
+
+void
+xdrmem_create(XDR *xdrs, const caddr_t addr, const uint_t size,
+    const enum xdr_op op)
+{
+	switch (op) {
+	case XDR_ENCODE:
+		xdrs->x_ops = &xdrmem_encode_ops;
+		break;
+	case XDR_DECODE:
+		xdrs->x_ops = &xdrmem_decode_ops;
+		break;
+	default:
+		xdrs->x_ops = NULL;	/* Let the caller know we failed */
+		return;
+	}
+
+	xdrs->x_op = op;
+	xdrs->x_addr = addr;
+	xdrs->x_addr_end = addr + size;
+
+	if (xdrs->x_addr_end < xdrs->x_addr) {
+		xdrs->x_ops = NULL;
+	}
+}
+EXPORT_SYMBOL(xdrmem_create);
+
+static bool_t
+xdrmem_control(XDR *xdrs, int req, void *info)
+{
+	struct xdr_bytesrec *rec = (struct xdr_bytesrec *)info;
+
+	if (req != XDR_GET_BYTES_AVAIL)
+		return (FALSE);
+
+	rec->xc_is_last_record = TRUE;	/* always TRUE in xdrmem streams */
+	rec->xc_num_avail = xdrs->x_addr_end - xdrs->x_addr;
+
+	return (TRUE);
+}
+
+static bool_t
+xdrmem_enc_bytes(XDR *xdrs, caddr_t cp, const uint_t cnt)
+{
+	uint_t size = roundup(cnt, 4);
+	uint_t pad;
+
+	if (size < cnt)
+		return (FALSE);	/* Integer overflow */
+
+	if (xdrs->x_addr > xdrs->x_addr_end)
+		return (FALSE);
+
+	if (xdrs->x_addr_end - xdrs->x_addr < size)
+		return (FALSE);
+
+	memcpy(xdrs->x_addr, cp, cnt);
+
+	xdrs->x_addr += cnt;
+
+	pad = size - cnt;
+	if (pad > 0) {
+		memset(xdrs->x_addr, 0, pad);
+		xdrs->x_addr += pad;
+	}
+
+	return (TRUE);
+}
+
+static bool_t
+xdrmem_dec_bytes(XDR *xdrs, caddr_t cp, const uint_t cnt)
+{
+	static uint32_t zero = 0;
+	uint_t size = roundup(cnt, 4);
+	uint_t pad;
+
+	if (size < cnt)
+		return (FALSE);	/* Integer overflow */
+
+	if (xdrs->x_addr > xdrs->x_addr_end)
+		return (FALSE);
+
+	if (xdrs->x_addr_end - xdrs->x_addr < size)
+		return (FALSE);
+
+	memcpy(cp, xdrs->x_addr, cnt);
+	xdrs->x_addr += cnt;
+
+	pad = size - cnt;
+	if (pad > 0) {
+		/* An inverted memchr() would 
be useful here... */ + if (memcmp(&zero, xdrs->x_addr, pad) != 0) + return (FALSE); + + xdrs->x_addr += pad; + } + + return (TRUE); +} + +static bool_t +xdrmem_enc_uint32(XDR *xdrs, uint32_t val) +{ + if (xdrs->x_addr + sizeof (uint32_t) > xdrs->x_addr_end) + return (FALSE); + + *((uint32_t *)xdrs->x_addr) = cpu_to_be32(val); + + xdrs->x_addr += sizeof (uint32_t); + + return (TRUE); +} + +static bool_t +xdrmem_dec_uint32(XDR *xdrs, uint32_t *val) +{ + if (xdrs->x_addr + sizeof (uint32_t) > xdrs->x_addr_end) + return (FALSE); + + *val = be32_to_cpu(*((uint32_t *)xdrs->x_addr)); + + xdrs->x_addr += sizeof (uint32_t); + + return (TRUE); +} + +static bool_t +xdrmem_enc_char(XDR *xdrs, char *cp) +{ + uint32_t val; + + BUILD_BUG_ON(sizeof (char) != 1); + val = *((unsigned char *) cp); + + return (xdrmem_enc_uint32(xdrs, val)); +} + +static bool_t +xdrmem_dec_char(XDR *xdrs, char *cp) +{ + uint32_t val; + + BUILD_BUG_ON(sizeof (char) != 1); + + if (!xdrmem_dec_uint32(xdrs, &val)) + return (FALSE); + + /* + * If any of the 3 other bytes are non-zero then val will be greater + * than 0xff and we fail because according to the RFC, this block does + * not have a char encoded in it. + */ + if (val > 0xff) + return (FALSE); + + *((unsigned char *) cp) = val; + + return (TRUE); +} + +static bool_t +xdrmem_enc_ushort(XDR *xdrs, unsigned short *usp) +{ + BUILD_BUG_ON(sizeof (unsigned short) != 2); + + return (xdrmem_enc_uint32(xdrs, *usp)); +} + +static bool_t +xdrmem_dec_ushort(XDR *xdrs, unsigned short *usp) +{ + uint32_t val; + + BUILD_BUG_ON(sizeof (unsigned short) != 2); + + if (!xdrmem_dec_uint32(xdrs, &val)) + return (FALSE); + + /* + * Short ints are not in the RFC, but we assume similar logic as in + * xdrmem_dec_char(). + */ + if (val > 0xffff) + return (FALSE); + + *usp = val; + + return (TRUE); +} + +static bool_t +xdrmem_enc_uint(XDR *xdrs, unsigned *up) +{ + BUILD_BUG_ON(sizeof (unsigned) != 4); + + return (xdrmem_enc_uint32(xdrs, *up)); +} + +static bool_t +xdrmem_dec_uint(XDR *xdrs, unsigned *up) +{ + BUILD_BUG_ON(sizeof (unsigned) != 4); + + return (xdrmem_dec_uint32(xdrs, (uint32_t *)up)); +} + +static bool_t +xdrmem_enc_ulonglong(XDR *xdrs, u_longlong_t *ullp) +{ + BUILD_BUG_ON(sizeof (u_longlong_t) != 8); + + if (!xdrmem_enc_uint32(xdrs, *ullp >> 32)) + return (FALSE); + + return (xdrmem_enc_uint32(xdrs, *ullp & 0xffffffff)); +} + +static bool_t +xdrmem_dec_ulonglong(XDR *xdrs, u_longlong_t *ullp) +{ + uint32_t low, high; + + BUILD_BUG_ON(sizeof (u_longlong_t) != 8); + + if (!xdrmem_dec_uint32(xdrs, &high)) + return (FALSE); + if (!xdrmem_dec_uint32(xdrs, &low)) + return (FALSE); + + *ullp = ((u_longlong_t)high << 32) | low; + + return (TRUE); +} + +static bool_t +xdr_enc_array(XDR *xdrs, caddr_t *arrp, uint_t *sizep, const uint_t maxsize, + const uint_t elsize, const xdrproc_t elproc) +{ + uint_t i; + caddr_t addr = *arrp; + + if (*sizep > maxsize || *sizep > UINT_MAX / elsize) + return (FALSE); + + if (!xdrmem_enc_uint(xdrs, sizep)) + return (FALSE); + + for (i = 0; i < *sizep; i++) { + if (!elproc(xdrs, addr)) + return (FALSE); + addr += elsize; + } + + return (TRUE); +} + +static bool_t +xdr_dec_array(XDR *xdrs, caddr_t *arrp, uint_t *sizep, const uint_t maxsize, + const uint_t elsize, const xdrproc_t elproc) +{ + uint_t i, size; + bool_t alloc = FALSE; + caddr_t addr; + + if (!xdrmem_dec_uint(xdrs, sizep)) + return (FALSE); + + size = *sizep; + + if (size > maxsize || size > UINT_MAX / elsize) + return (FALSE); + + /* + * The Solaris man page says: "If *arrp is NULL when decoding, 
+ * xdr_array() allocates memory and *arrp points to it". + */ + if (*arrp == NULL) { + BUILD_BUG_ON(sizeof (uint_t) > sizeof (size_t)); + + *arrp = kmem_alloc(size * elsize, KM_NOSLEEP); + if (*arrp == NULL) + return (FALSE); + + alloc = TRUE; + } + + addr = *arrp; + + for (i = 0; i < size; i++) { + if (!elproc(xdrs, addr)) { + if (alloc) + kmem_free(*arrp, size * elsize); + return (FALSE); + } + addr += elsize; + } + + return (TRUE); +} + +static bool_t +xdr_enc_string(XDR *xdrs, char **sp, const uint_t maxsize) +{ + size_t slen = strlen(*sp); + uint_t len; + + if (slen > maxsize) + return (FALSE); + + len = slen; + + if (!xdrmem_enc_uint(xdrs, &len)) + return (FALSE); + + return (xdrmem_enc_bytes(xdrs, *sp, len)); +} + +static bool_t +xdr_dec_string(XDR *xdrs, char **sp, const uint_t maxsize) +{ + uint_t size; + bool_t alloc = FALSE; + + if (!xdrmem_dec_uint(xdrs, &size)) + return (FALSE); + + if (size > maxsize || size > UINT_MAX - 1) + return (FALSE); + + /* + * Solaris man page: "If *sp is NULL when decoding, xdr_string() + * allocates memory and *sp points to it". + */ + if (*sp == NULL) { + BUILD_BUG_ON(sizeof (uint_t) > sizeof (size_t)); + + *sp = kmem_alloc(size + 1, KM_NOSLEEP); + if (*sp == NULL) + return (FALSE); + + alloc = TRUE; + } + + if (!xdrmem_dec_bytes(xdrs, *sp, size)) + goto fail; + + if (memchr(*sp, 0, size) != NULL) + goto fail; + + (*sp)[size] = '\0'; + + return (TRUE); + +fail: + if (alloc) + kmem_free(*sp, size + 1); + + return (FALSE); +} + +static struct xdr_ops xdrmem_encode_ops = { + .xdr_control = xdrmem_control, + .xdr_char = xdrmem_enc_char, + .xdr_u_short = xdrmem_enc_ushort, + .xdr_u_int = xdrmem_enc_uint, + .xdr_u_longlong_t = xdrmem_enc_ulonglong, + .xdr_opaque = xdrmem_enc_bytes, + .xdr_string = xdr_enc_string, + .xdr_array = xdr_enc_array +}; + +static struct xdr_ops xdrmem_decode_ops = { + .xdr_control = xdrmem_control, + .xdr_char = xdrmem_dec_char, + .xdr_u_short = xdrmem_dec_ushort, + .xdr_u_int = xdrmem_dec_uint, + .xdr_u_longlong_t = xdrmem_dec_ulonglong, + .xdr_opaque = xdrmem_dec_bytes, + .xdr_string = xdr_dec_string, + .xdr_array = xdr_dec_array +}; diff --git a/module/spl/spl-zlib.c b/module/spl/spl-zlib.c new file mode 100644 index 000000000..229e6a44b --- /dev/null +++ b/module/spl/spl-zlib.c @@ -0,0 +1,217 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf <[email protected]>. + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see <http://zfsonlinux.org/>. + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see <http://www.gnu.org/licenses/>. + * + * + * z_compress_level/z_uncompress are nearly identical copies of the + * compress2/uncompress functions provided by the official zlib package + * available at http://zlib.net/. 
The only changes made were to slightly
+ * adapt the functions called to match the linux kernel implementation
+ * of zlib.  The full zlib license follows:
+ *
+ * zlib.h -- interface of the 'zlib' general purpose compression library
+ * version 1.2.5, April 19th, 2010
+ *
+ * Copyright (C) 1995-2010 Jean-loup Gailly and Mark Adler
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not
+ *    be misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source
+ *    distribution.
+ *
+ * Jean-loup Gailly
+ * Mark Adler
+ */
+
+#include <sys/kmem.h>
+#include <sys/kmem_cache.h>
+#include <sys/zmod.h>
+
+static spl_kmem_cache_t *zlib_workspace_cache;
+
+/*
+ * A kmem_cache is used for the zlib workspaces to avoid having to vmalloc
+ * and vfree for every call.  Using a kmem_cache also has the advantage
+ * that it improves the odds that the memory used will be local to this
+ * cpu.  To further improve things it might be wise to create a dedicated
+ * per-cpu workspace for use.  This would take some additional care because
+ * we then must disable preemption around the critical section, and verify
+ * that zlib_deflate* and zlib_inflate* never internally call schedule().
+ */
+static void *
+zlib_workspace_alloc(int flags)
+{
+	return (kmem_cache_alloc(zlib_workspace_cache, flags & ~(__GFP_FS)));
+}
+
+static void
+zlib_workspace_free(void *workspace)
+{
+	kmem_cache_free(zlib_workspace_cache, workspace);
+}
+
+/*
+ * Compresses the source buffer into the destination buffer.  The level
+ * parameter has the same meaning as in deflateInit.  sourceLen is the byte
+ * length of the source buffer.  Upon entry, destLen is the total size of
+ * the destination buffer, which must be at least 0.1% larger than sourceLen
+ * plus 12 bytes.  Upon exit, destLen is the actual size of the compressed
+ * buffer.
+ *
+ * compress2 returns Z_OK if success, Z_MEM_ERROR if there was not enough
+ * memory, Z_BUF_ERROR if there was not enough room in the output buffer,
+ * Z_STREAM_ERROR if the level parameter is invalid.
+ */
+int
+z_compress_level(void *dest, size_t *destLen, const void *source,
+    size_t sourceLen, int level)
+{
+	z_stream stream;
+	int err;
+
+	stream.next_in = (Byte *)source;
+	stream.avail_in = (uInt)sourceLen;
+	stream.next_out = dest;
+	stream.avail_out = (uInt)*destLen;
+
+	if ((size_t)stream.avail_out != *destLen)
+		return (Z_BUF_ERROR);
+
+	stream.workspace = zlib_workspace_alloc(KM_SLEEP);
+	if (!stream.workspace)
+		return (Z_MEM_ERROR);
+
+	err = zlib_deflateInit(&stream, level);
+	if (err != Z_OK) {
+		zlib_workspace_free(stream.workspace);
+		return (err);
+	}
+
+	err = zlib_deflate(&stream, Z_FINISH);
+	if (err != Z_STREAM_END) {
+		zlib_deflateEnd(&stream);
+		zlib_workspace_free(stream.workspace);
+		return (err == Z_OK ? Z_BUF_ERROR : err);
+	}
+	*destLen = stream.total_out;
+
+	err = zlib_deflateEnd(&stream);
+	zlib_workspace_free(stream.workspace);
+
+	return (err);
+}
+EXPORT_SYMBOL(z_compress_level);
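As a hedged illustration (not part of the diff), a caller sizing the destination per the "0.1% plus 12 bytes" rule above might look like this; the payload and function name are assumptions:

static int
z_compress_example(const void *src, size_t srclen)
{
	/* Destination must be at least 0.1% larger than srclen plus 12. */
	size_t bufsize = srclen + (srclen / 1000) + 1 + 12;
	size_t dstlen = bufsize;
	void *dst = kmem_alloc(bufsize, KM_SLEEP);
	int err;

	/* On success, dstlen is updated to the compressed size. */
	err = z_compress_level(dst, &dstlen, src, srclen,
	    Z_DEFAULT_COMPRESSION);

	kmem_free(dst, bufsize);
	return (err);
}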
+
+/*
+ * Decompresses the source buffer into the destination buffer.  sourceLen is
+ * the byte length of the source buffer.  Upon entry, destLen is the total
+ * size of the destination buffer, which must be large enough to hold the
+ * entire uncompressed data.  (The size of the uncompressed data must have
+ * been saved previously by the compressor and transmitted to the
+ * decompressor by some mechanism outside the scope of this compression
+ * library.)  Upon exit, destLen is the actual size of the uncompressed
+ * data.  This function can be used to decompress a whole file at once if
+ * the input file is mmap'ed.
+ *
+ * uncompress returns Z_OK if success, Z_MEM_ERROR if there was not
+ * enough memory, Z_BUF_ERROR if there was not enough room in the output
+ * buffer, or Z_DATA_ERROR if the input data was corrupted.
+ */
+int
+z_uncompress(void *dest, size_t *destLen, const void *source,
+    size_t sourceLen)
+{
+	z_stream stream;
+	int err;
+
+	stream.next_in = (Byte *)source;
+	stream.avail_in = (uInt)sourceLen;
+	stream.next_out = dest;
+	stream.avail_out = (uInt)*destLen;
+
+	if ((size_t)stream.avail_out != *destLen)
+		return (Z_BUF_ERROR);
+
+	stream.workspace = zlib_workspace_alloc(KM_SLEEP);
+	if (!stream.workspace)
+		return (Z_MEM_ERROR);
+
+	err = zlib_inflateInit(&stream);
+	if (err != Z_OK) {
+		zlib_workspace_free(stream.workspace);
+		return (err);
+	}
+
+	err = zlib_inflate(&stream, Z_FINISH);
+	if (err != Z_STREAM_END) {
+		zlib_inflateEnd(&stream);
+		zlib_workspace_free(stream.workspace);
+
+		if (err == Z_NEED_DICT ||
+		    (err == Z_BUF_ERROR && stream.avail_in == 0))
+			return (Z_DATA_ERROR);
+
+		return (err);
+	}
+	*destLen = stream.total_out;
+
+	err = zlib_inflateEnd(&stream);
+	zlib_workspace_free(stream.workspace);
+
+	return (err);
+}
+EXPORT_SYMBOL(z_uncompress);
+
+int
+spl_zlib_init(void)
+{
+	int size;
+
+	size = MAX(spl_zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL),
+	    zlib_inflate_workspacesize());
+
+	zlib_workspace_cache = kmem_cache_create(
+	    "spl_zlib_workspace_cache",
+	    size, 0, NULL, NULL, NULL, NULL, NULL,
+	    KMC_VMEM | KMC_NOEMERGENCY);
+	if (!zlib_workspace_cache)
+		return (1);
+
+	return (0);
+}
+
+void
+spl_zlib_fini(void)
+{
+	kmem_cache_destroy(zlib_workspace_cache);
+	zlib_workspace_cache = NULL;
+}
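Finally, a hedged sketch (not part of the diff) of how the per-subsystem init/fini pairs shown in this commit are meant to compose; the module entry points here are hypothetical, and only three of the subsystems are shown:

static int __init
spl_example_init(void)
{
	/* Bring subsystems up in dependency order. */
	if (spl_tsd_init() != 0)
		return (-ENOMEM);

	if (spl_vmem_init() != 0) {
		spl_tsd_fini();
		return (-ENOMEM);
	}

	if (spl_zlib_init() != 0) {
		spl_vmem_fini();
		spl_tsd_fini();
		return (-ENOMEM);
	}

	return (0);
}

static void __exit
spl_example_exit(void)
{
	/* Tear down in strict reverse order of initialization. */
	spl_zlib_fini();
	spl_vmem_fini();
	spl_tsd_fini();
}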