Diffstat (limited to 'module/os')
48 files changed, 38506 insertions, 0 deletions
diff --git a/module/os/Makefile.in b/module/os/Makefile.in new file mode 100644 index 000000000..b9990d1bc --- /dev/null +++ b/module/os/Makefile.in @@ -0,0 +1 @@ +subdirs-m = linux diff --git a/module/os/linux/Makefile.in b/module/os/linux/Makefile.in new file mode 100644 index 000000000..ab01708a3 --- /dev/null +++ b/module/os/linux/Makefile.in @@ -0,0 +1 @@ +subdirs-m = spl zfs diff --git a/module/os/linux/spl/Makefile.in b/module/os/linux/spl/Makefile.in new file mode 100644 index 000000000..a29c36a2a --- /dev/null +++ b/module/os/linux/spl/Makefile.in @@ -0,0 +1,18 @@ +$(MODULE)-objs += ../os/linux/spl/spl-atomic.o +$(MODULE)-objs += ../os/linux/spl/spl-condvar.o +$(MODULE)-objs += ../os/linux/spl/spl-cred.o +$(MODULE)-objs += ../os/linux/spl/spl-err.o +$(MODULE)-objs += ../os/linux/spl/spl-generic.o +$(MODULE)-objs += ../os/linux/spl/spl-kmem.o +$(MODULE)-objs += ../os/linux/spl/spl-kmem-cache.o +$(MODULE)-objs += ../os/linux/spl/spl-kobj.o +$(MODULE)-objs += ../os/linux/spl/spl-kstat.o +$(MODULE)-objs += ../os/linux/spl/spl-proc.o +$(MODULE)-objs += ../os/linux/spl/spl-procfs-list.o +$(MODULE)-objs += ../os/linux/spl/spl-taskq.o +$(MODULE)-objs += ../os/linux/spl/spl-thread.o +$(MODULE)-objs += ../os/linux/spl/spl-tsd.o +$(MODULE)-objs += ../os/linux/spl/spl-vmem.o +$(MODULE)-objs += ../os/linux/spl/spl-vnode.o +$(MODULE)-objs += ../os/linux/spl/spl-xdr.o +$(MODULE)-objs += ../os/linux/spl/spl-zlib.o diff --git a/module/os/linux/spl/README.md b/module/os/linux/spl/README.md new file mode 100644 index 000000000..57f635aed --- /dev/null +++ b/module/os/linux/spl/README.md @@ -0,0 +1,16 @@ +The Solaris Porting Layer, SPL, is a Linux kernel module which provides a +compatibility layer used by the [ZFS on Linux](http://zfsonlinux.org) project. + +# Installation + +The latest version of the SPL is maintained as part of this repository. +Only when building ZFS version 0.7.x or earlier must an external SPL release +be used. These releases can be found at: + + * Version 0.7.x: https://github.com/zfsonlinux/spl/tree/spl-0.7-release + * Version 0.6.5.x: https://github.com/zfsonlinux/spl/tree/spl-0.6.5-release + +# Release + +The SPL is released under a GPLv2 license. +For more details see the NOTICE and THIRDPARTYLICENSE files; `UCRL-CODE-235197` diff --git a/module/os/linux/spl/THIRDPARTYLICENSE.gplv2 b/module/os/linux/spl/THIRDPARTYLICENSE.gplv2 new file mode 100644 index 000000000..d159169d1 --- /dev/null +++ b/module/os/linux/spl/THIRDPARTYLICENSE.gplv2 @@ -0,0 +1,339 @@ + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Lesser General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. 
Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. 
+ +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. 
(This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. + +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. 
+ +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. 
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + <one line to give the program's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) year name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. + + <signature of Ty Coon>, 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Lesser General +Public License instead of this License. 
diff --git a/module/os/linux/spl/THIRDPARTYLICENSE.gplv2.descrip b/module/os/linux/spl/THIRDPARTYLICENSE.gplv2.descrip new file mode 100644 index 000000000..78535a8ee --- /dev/null +++ b/module/os/linux/spl/THIRDPARTYLICENSE.gplv2.descrip @@ -0,0 +1 @@ +COMPATIBILITY LAYER FOR OPENZFS ON LINUX diff --git a/module/os/linux/spl/spl-atomic.c b/module/os/linux/spl/spl-atomic.c new file mode 100644 index 000000000..47ed1886e --- /dev/null +++ b/module/os/linux/spl/spl-atomic.c @@ -0,0 +1,36 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf <[email protected]>. + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see <http://zfsonlinux.org/>. + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see <http://www.gnu.org/licenses/>. + * + * Solaris Porting Layer (SPL) Atomic Implementation. + */ + +#include <sys/atomic.h> + +#ifdef ATOMIC_SPINLOCK +/* Global atomic lock declarations */ +DEFINE_SPINLOCK(atomic32_lock); +DEFINE_SPINLOCK(atomic64_lock); + +EXPORT_SYMBOL(atomic32_lock); +EXPORT_SYMBOL(atomic64_lock); +#endif /* ATOMIC_SPINLOCK */ diff --git a/module/os/linux/spl/spl-condvar.c b/module/os/linux/spl/spl-condvar.c new file mode 100644 index 000000000..3cc33da62 --- /dev/null +++ b/module/os/linux/spl/spl-condvar.c @@ -0,0 +1,461 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf <[email protected]>. + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see <http://zfsonlinux.org/>. + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see <http://www.gnu.org/licenses/>. + * + * Solaris Porting Layer (SPL) Credential Implementation. 
+ */ + +#include <sys/condvar.h> +#include <sys/time.h> +#include <sys/sysmacros.h> +#include <linux/hrtimer.h> +#include <linux/compiler_compat.h> +#include <linux/mod_compat.h> + +#include <linux/sched.h> + +#ifdef HAVE_SCHED_SIGNAL_HEADER +#include <linux/sched/signal.h> +#endif + +#define MAX_HRTIMEOUT_SLACK_US 1000 +unsigned int spl_schedule_hrtimeout_slack_us = 0; + +static int +param_set_hrtimeout_slack(const char *buf, zfs_kernel_param_t *kp) +{ + unsigned long val; + int error; + + error = kstrtoul(buf, 0, &val); + if (error) + return (error); + + if (val > MAX_HRTIMEOUT_SLACK_US) + return (-EINVAL); + + error = param_set_uint(buf, kp); + if (error < 0) + return (error); + + return (0); +} + +module_param_call(spl_schedule_hrtimeout_slack_us, param_set_hrtimeout_slack, + param_get_uint, &spl_schedule_hrtimeout_slack_us, 0644); +MODULE_PARM_DESC(spl_schedule_hrtimeout_slack_us, + "schedule_hrtimeout_range() delta/slack value in us, default(0)"); + +void +__cv_init(kcondvar_t *cvp, char *name, kcv_type_t type, void *arg) +{ + ASSERT(cvp); + ASSERT(name == NULL); + ASSERT(type == CV_DEFAULT); + ASSERT(arg == NULL); + + cvp->cv_magic = CV_MAGIC; + init_waitqueue_head(&cvp->cv_event); + init_waitqueue_head(&cvp->cv_destroy); + atomic_set(&cvp->cv_waiters, 0); + atomic_set(&cvp->cv_refs, 1); + cvp->cv_mutex = NULL; +} +EXPORT_SYMBOL(__cv_init); + +static int +cv_destroy_wakeup(kcondvar_t *cvp) +{ + if (!atomic_read(&cvp->cv_waiters) && !atomic_read(&cvp->cv_refs)) { + ASSERT(cvp->cv_mutex == NULL); + ASSERT(!waitqueue_active(&cvp->cv_event)); + return (1); + } + + return (0); +} + +void +__cv_destroy(kcondvar_t *cvp) +{ + ASSERT(cvp); + ASSERT(cvp->cv_magic == CV_MAGIC); + + cvp->cv_magic = CV_DESTROY; + atomic_dec(&cvp->cv_refs); + + /* Block until all waiters are woken and references dropped. */ + while (cv_destroy_wakeup(cvp) == 0) + wait_event_timeout(cvp->cv_destroy, cv_destroy_wakeup(cvp), 1); + + ASSERT3P(cvp->cv_mutex, ==, NULL); + ASSERT3S(atomic_read(&cvp->cv_refs), ==, 0); + ASSERT3S(atomic_read(&cvp->cv_waiters), ==, 0); + ASSERT3S(waitqueue_active(&cvp->cv_event), ==, 0); +} +EXPORT_SYMBOL(__cv_destroy); + +static void +cv_wait_common(kcondvar_t *cvp, kmutex_t *mp, int state, int io) +{ + DEFINE_WAIT(wait); + kmutex_t *m; + + ASSERT(cvp); + ASSERT(mp); + ASSERT(cvp->cv_magic == CV_MAGIC); + ASSERT(mutex_owned(mp)); + atomic_inc(&cvp->cv_refs); + + m = READ_ONCE(cvp->cv_mutex); + if (!m) + m = xchg(&cvp->cv_mutex, mp); + /* Ensure the same mutex is used by all callers */ + ASSERT(m == NULL || m == mp); + + prepare_to_wait_exclusive(&cvp->cv_event, &wait, state); + atomic_inc(&cvp->cv_waiters); + + /* + * Mutex should be dropped after prepare_to_wait() this + * ensures we're linked in to the waiters list and avoids the + * race where 'cvp->cv_waiters > 0' but the list is empty. + */ + mutex_exit(mp); + if (io) + io_schedule(); + else + schedule(); + + /* No more waiters a different mutex could be used */ + if (atomic_dec_and_test(&cvp->cv_waiters)) { + /* + * This is set without any lock, so it's racy. But this is + * just for debug anyway, so make it best-effort + */ + cvp->cv_mutex = NULL; + wake_up(&cvp->cv_destroy); + } + + finish_wait(&cvp->cv_event, &wait); + atomic_dec(&cvp->cv_refs); + + /* + * Hold mutex after we release the cvp, otherwise we could dead lock + * with a thread holding the mutex and call cv_destroy. 
+ */ + mutex_enter(mp); +} + +void +__cv_wait(kcondvar_t *cvp, kmutex_t *mp) +{ + cv_wait_common(cvp, mp, TASK_UNINTERRUPTIBLE, 0); +} +EXPORT_SYMBOL(__cv_wait); + +void +__cv_wait_io(kcondvar_t *cvp, kmutex_t *mp) +{ + cv_wait_common(cvp, mp, TASK_UNINTERRUPTIBLE, 1); +} +EXPORT_SYMBOL(__cv_wait_io); + +int +__cv_wait_io_sig(kcondvar_t *cvp, kmutex_t *mp) +{ + cv_wait_common(cvp, mp, TASK_INTERRUPTIBLE, 1); + + return (signal_pending(current) ? 0 : 1); +} +EXPORT_SYMBOL(__cv_wait_io_sig); + +int +__cv_wait_sig(kcondvar_t *cvp, kmutex_t *mp) +{ + cv_wait_common(cvp, mp, TASK_INTERRUPTIBLE, 0); + + return (signal_pending(current) ? 0 : 1); +} +EXPORT_SYMBOL(__cv_wait_sig); + +#if defined(HAVE_IO_SCHEDULE_TIMEOUT) +#define spl_io_schedule_timeout(t) io_schedule_timeout(t) +#else + +struct spl_task_timer { + struct timer_list timer; + struct task_struct *task; +}; + +static void +__cv_wakeup(spl_timer_list_t t) +{ + struct timer_list *tmr = (struct timer_list *)t; + struct spl_task_timer *task_timer = from_timer(task_timer, tmr, timer); + + wake_up_process(task_timer->task); +} + +static long +spl_io_schedule_timeout(long time_left) +{ + long expire_time = jiffies + time_left; + struct spl_task_timer task_timer; + struct timer_list *timer = &task_timer.timer; + + task_timer.task = current; + + timer_setup(timer, __cv_wakeup, 0); + + timer->expires = expire_time; + add_timer(timer); + + io_schedule(); + + del_timer_sync(timer); + + time_left = expire_time - jiffies; + + return (time_left < 0 ? 0 : time_left); +} +#endif + +/* + * 'expire_time' argument is an absolute wall clock time in jiffies. + * Return value is time left (expire_time - now) or -1 if timeout occurred. + */ +static clock_t +__cv_timedwait_common(kcondvar_t *cvp, kmutex_t *mp, clock_t expire_time, + int state, int io) +{ + DEFINE_WAIT(wait); + kmutex_t *m; + clock_t time_left; + + ASSERT(cvp); + ASSERT(mp); + ASSERT(cvp->cv_magic == CV_MAGIC); + ASSERT(mutex_owned(mp)); + + /* XXX - Does not handle jiffie wrap properly */ + time_left = expire_time - jiffies; + if (time_left <= 0) + return (-1); + + atomic_inc(&cvp->cv_refs); + m = READ_ONCE(cvp->cv_mutex); + if (!m) + m = xchg(&cvp->cv_mutex, mp); + /* Ensure the same mutex is used by all callers */ + ASSERT(m == NULL || m == mp); + + prepare_to_wait_exclusive(&cvp->cv_event, &wait, state); + atomic_inc(&cvp->cv_waiters); + + /* + * Mutex should be dropped after prepare_to_wait() this + * ensures we're linked in to the waiters list and avoids the + * race where 'cvp->cv_waiters > 0' but the list is empty. + */ + mutex_exit(mp); + if (io) + time_left = spl_io_schedule_timeout(time_left); + else + time_left = schedule_timeout(time_left); + + /* No more waiters a different mutex could be used */ + if (atomic_dec_and_test(&cvp->cv_waiters)) { + /* + * This is set without any lock, so it's racy. But this is + * just for debug anyway, so make it best-effort + */ + cvp->cv_mutex = NULL; + wake_up(&cvp->cv_destroy); + } + + finish_wait(&cvp->cv_event, &wait); + atomic_dec(&cvp->cv_refs); + + /* + * Hold mutex after we release the cvp, otherwise we could dead lock + * with a thread holding the mutex and call cv_destroy. + */ + mutex_enter(mp); + return (time_left > 0 ? 
time_left : -1); +} + +clock_t +__cv_timedwait(kcondvar_t *cvp, kmutex_t *mp, clock_t exp_time) +{ + return (__cv_timedwait_common(cvp, mp, exp_time, + TASK_UNINTERRUPTIBLE, 0)); +} +EXPORT_SYMBOL(__cv_timedwait); + +clock_t +__cv_timedwait_io(kcondvar_t *cvp, kmutex_t *mp, clock_t exp_time) +{ + return (__cv_timedwait_common(cvp, mp, exp_time, + TASK_UNINTERRUPTIBLE, 1)); +} +EXPORT_SYMBOL(__cv_timedwait_io); + +clock_t +__cv_timedwait_sig(kcondvar_t *cvp, kmutex_t *mp, clock_t exp_time) +{ + return (__cv_timedwait_common(cvp, mp, exp_time, + TASK_INTERRUPTIBLE, 0)); +} +EXPORT_SYMBOL(__cv_timedwait_sig); + +/* + * 'expire_time' argument is an absolute clock time in nanoseconds. + * Return value is time left (expire_time - now) or -1 if timeout occurred. + */ +static clock_t +__cv_timedwait_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t expire_time, + hrtime_t res, int state) +{ + DEFINE_WAIT(wait); + kmutex_t *m; + hrtime_t time_left; + ktime_t ktime_left; + u64 slack = 0; + + ASSERT(cvp); + ASSERT(mp); + ASSERT(cvp->cv_magic == CV_MAGIC); + ASSERT(mutex_owned(mp)); + + time_left = expire_time - gethrtime(); + if (time_left <= 0) + return (-1); + + atomic_inc(&cvp->cv_refs); + m = READ_ONCE(cvp->cv_mutex); + if (!m) + m = xchg(&cvp->cv_mutex, mp); + /* Ensure the same mutex is used by all callers */ + ASSERT(m == NULL || m == mp); + + prepare_to_wait_exclusive(&cvp->cv_event, &wait, state); + atomic_inc(&cvp->cv_waiters); + + /* + * Mutex should be dropped after prepare_to_wait() this + * ensures we're linked in to the waiters list and avoids the + * race where 'cvp->cv_waiters > 0' but the list is empty. + */ + mutex_exit(mp); + + ktime_left = ktime_set(0, time_left); + slack = MIN(MAX(res, spl_schedule_hrtimeout_slack_us * NSEC_PER_USEC), + MAX_HRTIMEOUT_SLACK_US * NSEC_PER_USEC); + schedule_hrtimeout_range(&ktime_left, slack, HRTIMER_MODE_REL); + + /* No more waiters a different mutex could be used */ + if (atomic_dec_and_test(&cvp->cv_waiters)) { + /* + * This is set without any lock, so it's racy. But this is + * just for debug anyway, so make it best-effort + */ + cvp->cv_mutex = NULL; + wake_up(&cvp->cv_destroy); + } + + finish_wait(&cvp->cv_event, &wait); + atomic_dec(&cvp->cv_refs); + + mutex_enter(mp); + time_left = expire_time - gethrtime(); + return (time_left > 0 ? NSEC_TO_TICK(time_left) : -1); +} + +/* + * Compatibility wrapper for the cv_timedwait_hires() Illumos interface. + */ +static clock_t +cv_timedwait_hires_common(kcondvar_t *cvp, kmutex_t *mp, hrtime_t tim, + hrtime_t res, int flag, int state) +{ + if (!(flag & CALLOUT_FLAG_ABSOLUTE)) + tim += gethrtime(); + + return (__cv_timedwait_hires(cvp, mp, tim, res, state)); +} + +clock_t +cv_timedwait_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t tim, hrtime_t res, + int flag) +{ + return (cv_timedwait_hires_common(cvp, mp, tim, res, flag, + TASK_UNINTERRUPTIBLE)); +} +EXPORT_SYMBOL(cv_timedwait_hires); + +clock_t +cv_timedwait_sig_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t tim, + hrtime_t res, int flag) +{ + return (cv_timedwait_hires_common(cvp, mp, tim, res, flag, + TASK_INTERRUPTIBLE)); +} +EXPORT_SYMBOL(cv_timedwait_sig_hires); + +void +__cv_signal(kcondvar_t *cvp) +{ + ASSERT(cvp); + ASSERT(cvp->cv_magic == CV_MAGIC); + atomic_inc(&cvp->cv_refs); + + /* + * All waiters are added with WQ_FLAG_EXCLUSIVE so only one + * waiter will be set runnable with each call to wake_up(). + * Additionally wake_up() holds a spin_lock associated with + * the wait queue to ensure we don't race waking up processes. 
+ */ + if (atomic_read(&cvp->cv_waiters) > 0) + wake_up(&cvp->cv_event); + + atomic_dec(&cvp->cv_refs); +} +EXPORT_SYMBOL(__cv_signal); + +void +__cv_broadcast(kcondvar_t *cvp) +{ + ASSERT(cvp); + ASSERT(cvp->cv_magic == CV_MAGIC); + atomic_inc(&cvp->cv_refs); + + /* + * Wake_up_all() will wake up all waiters even those which + * have the WQ_FLAG_EXCLUSIVE flag set. + */ + if (atomic_read(&cvp->cv_waiters) > 0) + wake_up_all(&cvp->cv_event); + + atomic_dec(&cvp->cv_refs); +} +EXPORT_SYMBOL(__cv_broadcast); diff --git a/module/os/linux/spl/spl-cred.c b/module/os/linux/spl/spl-cred.c new file mode 100644 index 000000000..ea3e903f9 --- /dev/null +++ b/module/os/linux/spl/spl-cred.c @@ -0,0 +1,200 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf <[email protected]>. + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see <http://zfsonlinux.org/>. + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see <http://www.gnu.org/licenses/>. + * + * Solaris Porting Layer (SPL) Credential Implementation. + */ + +#include <sys/cred.h> + +static int +#ifdef HAVE_KUIDGID_T +cr_groups_search(const struct group_info *group_info, kgid_t grp) +#else +cr_groups_search(const struct group_info *group_info, gid_t grp) +#endif +{ + unsigned int left, right, mid; + int cmp; + + if (!group_info) + return (0); + + left = 0; + right = group_info->ngroups; + while (left < right) { + mid = (left + right) / 2; + cmp = KGID_TO_SGID(grp) - + KGID_TO_SGID(GROUP_AT(group_info, mid)); + + if (cmp > 0) + left = mid + 1; + else if (cmp < 0) + right = mid; + else + return (1); + } + return (0); +} + +/* Hold a reference on the credential */ +void +crhold(cred_t *cr) +{ + (void) get_cred((const cred_t *)cr); +} + +/* Free a reference on the credential */ +void +crfree(cred_t *cr) +{ + put_cred((const cred_t *)cr); +} + +/* Return the number of supplemental groups */ +int +crgetngroups(const cred_t *cr) +{ + struct group_info *gi; + int rc; + + gi = cr->group_info; + rc = gi->ngroups; +#ifndef HAVE_GROUP_INFO_GID + /* + * For Linux <= 4.8, + * crgetgroups will only returns gi->blocks[0], which contains only + * the first NGROUPS_PER_BLOCK groups. + */ + if (rc > NGROUPS_PER_BLOCK) { + WARN_ON_ONCE(1); + rc = NGROUPS_PER_BLOCK; + } +#endif + return (rc); +} + +/* + * Return an array of supplemental gids. The returned address is safe + * to use as long as the caller has taken a reference with crhold(). + * + * Linux 4.9 API change, group_info changed from 2d array via ->blocks to 1d + * array via ->gid. 
+ */ +gid_t * +crgetgroups(const cred_t *cr) +{ + struct group_info *gi; + gid_t *gids = NULL; + + gi = cr->group_info; +#ifdef HAVE_GROUP_INFO_GID + gids = KGIDP_TO_SGIDP(gi->gid); +#else + if (gi->nblocks > 0) + gids = KGIDP_TO_SGIDP(gi->blocks[0]); +#endif + return (gids); +} + +/* Check if the passed gid is available in supplied credential. */ +int +groupmember(gid_t gid, const cred_t *cr) +{ + struct group_info *gi; + int rc; + + gi = cr->group_info; + rc = cr_groups_search(gi, SGID_TO_KGID(gid)); + + return (rc); +} + +/* Return the effective user id */ +uid_t +crgetuid(const cred_t *cr) +{ + return (KUID_TO_SUID(cr->euid)); +} + +/* Return the real user id */ +uid_t +crgetruid(const cred_t *cr) +{ + return (KUID_TO_SUID(cr->uid)); +} + +/* Return the saved user id */ +uid_t +crgetsuid(const cred_t *cr) +{ + return (KUID_TO_SUID(cr->suid)); +} + +/* Return the filesystem user id */ +uid_t +crgetfsuid(const cred_t *cr) +{ + return (KUID_TO_SUID(cr->fsuid)); +} + +/* Return the effective group id */ +gid_t +crgetgid(const cred_t *cr) +{ + return (KGID_TO_SGID(cr->egid)); +} + +/* Return the real group id */ +gid_t +crgetrgid(const cred_t *cr) +{ + return (KGID_TO_SGID(cr->gid)); +} + +/* Return the saved group id */ +gid_t +crgetsgid(const cred_t *cr) +{ + return (KGID_TO_SGID(cr->sgid)); +} + +/* Return the filesystem group id */ +gid_t +crgetfsgid(const cred_t *cr) +{ + return (KGID_TO_SGID(cr->fsgid)); +} + +EXPORT_SYMBOL(crhold); +EXPORT_SYMBOL(crfree); +EXPORT_SYMBOL(crgetuid); +EXPORT_SYMBOL(crgetruid); +EXPORT_SYMBOL(crgetsuid); +EXPORT_SYMBOL(crgetfsuid); +EXPORT_SYMBOL(crgetgid); +EXPORT_SYMBOL(crgetrgid); +EXPORT_SYMBOL(crgetsgid); +EXPORT_SYMBOL(crgetfsgid); +EXPORT_SYMBOL(crgetngroups); +EXPORT_SYMBOL(crgetgroups); +EXPORT_SYMBOL(groupmember); diff --git a/module/os/linux/spl/spl-err.c b/module/os/linux/spl/spl-err.c new file mode 100644 index 000000000..3c0bb71c0 --- /dev/null +++ b/module/os/linux/spl/spl-err.c @@ -0,0 +1,124 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf <[email protected]>. + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see <http://zfsonlinux.org/>. + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see <http://www.gnu.org/licenses/>. + * + * Solaris Porting Layer (SPL) Error Implementation. + */ + +#include <sys/sysmacros.h> +#include <sys/cmn_err.h> + +/* + * It is often useful to actually have the panic crash the node so you + * can then get notified of the event, get the crashdump for later + * analysis and other such goodies. + * But we would still default to the current default of not to do that. 
+ */ +/* BEGIN CSTYLED */ +unsigned int spl_panic_halt; +module_param(spl_panic_halt, uint, 0644); +MODULE_PARM_DESC(spl_panic_halt, "Cause kernel panic on assertion failures"); +/* END CSTYLED */ + +void +spl_dumpstack(void) +{ + printk("Showing stack for process %d\n", current->pid); + dump_stack(); +} +EXPORT_SYMBOL(spl_dumpstack); + +int +spl_panic(const char *file, const char *func, int line, const char *fmt, ...) +{ + const char *newfile; + char msg[MAXMSGLEN]; + va_list ap; + + newfile = strrchr(file, '/'); + if (newfile != NULL) + newfile = newfile + 1; + else + newfile = file; + + va_start(ap, fmt); + (void) vsnprintf(msg, sizeof (msg), fmt, ap); + va_end(ap); + + printk(KERN_EMERG "%s", msg); + printk(KERN_EMERG "PANIC at %s:%d:%s()\n", newfile, line, func); + if (spl_panic_halt) + panic("%s", msg); + + spl_dumpstack(); + + /* Halt the thread to facilitate further debugging */ + set_current_state(TASK_UNINTERRUPTIBLE); + while (1) + schedule(); + + /* Unreachable */ + return (1); +} +EXPORT_SYMBOL(spl_panic); + +void +vcmn_err(int ce, const char *fmt, va_list ap) +{ + char msg[MAXMSGLEN]; + + vsnprintf(msg, MAXMSGLEN, fmt, ap); + + switch (ce) { + case CE_IGNORE: + break; + case CE_CONT: + printk("%s", msg); + break; + case CE_NOTE: + printk(KERN_NOTICE "NOTICE: %s\n", msg); + break; + case CE_WARN: + printk(KERN_WARNING "WARNING: %s\n", msg); + break; + case CE_PANIC: + printk(KERN_EMERG "PANIC: %s\n", msg); + spl_dumpstack(); + + /* Halt the thread to facilitate further debugging */ + set_current_state(TASK_UNINTERRUPTIBLE); + while (1) + schedule(); + } +} /* vcmn_err() */ +EXPORT_SYMBOL(vcmn_err); + +void +cmn_err(int ce, const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + vcmn_err(ce, fmt, ap); + va_end(ap); +} /* cmn_err() */ +EXPORT_SYMBOL(cmn_err); diff --git a/module/os/linux/spl/spl-generic.c b/module/os/linux/spl/spl-generic.c new file mode 100644 index 000000000..1deb2f444 --- /dev/null +++ b/module/os/linux/spl/spl-generic.c @@ -0,0 +1,757 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf <[email protected]>. + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see <http://zfsonlinux.org/>. + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see <http://www.gnu.org/licenses/>. + * + * Solaris Porting Layer (SPL) Generic Implementation. 
+ */ + +#include <sys/sysmacros.h> +#include <sys/systeminfo.h> +#include <sys/vmsystm.h> +#include <sys/kobj.h> +#include <sys/kmem.h> +#include <sys/kmem_cache.h> +#include <sys/vmem.h> +#include <sys/mutex.h> +#include <sys/rwlock.h> +#include <sys/taskq.h> +#include <sys/tsd.h> +#include <sys/zmod.h> +#include <sys/debug.h> +#include <sys/proc.h> +#include <sys/kstat.h> +#include <sys/file.h> +#include <linux/ctype.h> +#include <sys/disp.h> +#include <sys/random.h> +#include <sys/strings.h> +#include <linux/kmod.h> +#include "zfs_gitrev.h" + +char spl_gitrev[64] = ZFS_META_GITREV; + +/* BEGIN CSTYLED */ +unsigned long spl_hostid = 0; +EXPORT_SYMBOL(spl_hostid); +/* BEGIN CSTYLED */ +module_param(spl_hostid, ulong, 0644); +MODULE_PARM_DESC(spl_hostid, "The system hostid."); +/* END CSTYLED */ + +proc_t p0; +EXPORT_SYMBOL(p0); + +/* + * Xorshift Pseudo Random Number Generator based on work by Sebastiano Vigna + * + * "Further scramblings of Marsaglia's xorshift generators" + * http://vigna.di.unimi.it/ftp/papers/xorshiftplus.pdf + * + * random_get_pseudo_bytes() is an API function on Illumos whose sole purpose + * is to provide bytes containing random numbers. It is mapped to /dev/urandom + * on Illumos, which uses a "FIPS 186-2 algorithm". No user of the SPL's + * random_get_pseudo_bytes() needs bytes that are of cryptographic quality, so + * we can implement it using a fast PRNG that we seed using Linux' actual + * equivalent to random_get_pseudo_bytes(). We do this by providing each CPU + * with an independent seed so that all calls to random_get_pseudo_bytes() are + * free of atomic instructions. + * + * A consequence of using a fast PRNG is that using random_get_pseudo_bytes() + * to generate words larger than 128 bits will paradoxically be limited to + * `2^128 - 1` possibilities. This is because we have a sequence of `2^128 - 1` + * 128-bit words and selecting the first will implicitly select the second. If + * a caller finds this behavior undesirable, random_get_bytes() should be used + * instead. + * + * XXX: Linux interrupt handlers that trigger within the critical section + * formed by `s[1] = xp[1];` and `xp[0] = s[0];` and call this function will + * see the same numbers. Nothing in the code currently calls this in an + * interrupt handler, so this is considered to be okay. If that becomes a + * problem, we could create a set of per-cpu variables for interrupt handlers + * and use them when in_interrupt() from linux/preempt_mask.h evaluates to + * true. 
+ */ +static DEFINE_PER_CPU(uint64_t[2], spl_pseudo_entropy); + +/* + * spl_rand_next()/spl_rand_jump() are copied from the following CC-0 licensed + * file: + * + * http://xorshift.di.unimi.it/xorshift128plus.c + */ + +static inline uint64_t +spl_rand_next(uint64_t *s) +{ + uint64_t s1 = s[0]; + const uint64_t s0 = s[1]; + s[0] = s0; + s1 ^= s1 << 23; // a + s[1] = s1 ^ s0 ^ (s1 >> 18) ^ (s0 >> 5); // b, c + return (s[1] + s0); +} + +static inline void +spl_rand_jump(uint64_t *s) +{ + static const uint64_t JUMP[] = + { 0x8a5cd789635d2dff, 0x121fd2155c472f96 }; + + uint64_t s0 = 0; + uint64_t s1 = 0; + int i, b; + for (i = 0; i < sizeof (JUMP) / sizeof (*JUMP); i++) + for (b = 0; b < 64; b++) { + if (JUMP[i] & 1ULL << b) { + s0 ^= s[0]; + s1 ^= s[1]; + } + (void) spl_rand_next(s); + } + + s[0] = s0; + s[1] = s1; +} + +int +random_get_pseudo_bytes(uint8_t *ptr, size_t len) +{ + uint64_t *xp, s[2]; + + ASSERT(ptr); + + xp = get_cpu_var(spl_pseudo_entropy); + + s[0] = xp[0]; + s[1] = xp[1]; + + while (len) { + union { + uint64_t ui64; + uint8_t byte[sizeof (uint64_t)]; + }entropy; + int i = MIN(len, sizeof (uint64_t)); + + len -= i; + entropy.ui64 = spl_rand_next(s); + + while (i--) + *ptr++ = entropy.byte[i]; + } + + xp[0] = s[0]; + xp[1] = s[1]; + + put_cpu_var(spl_pseudo_entropy); + + return (0); +} + + +EXPORT_SYMBOL(random_get_pseudo_bytes); + +#if BITS_PER_LONG == 32 +/* + * Support 64/64 => 64 division on a 32-bit platform. While the kernel + * provides a div64_u64() function for this we do not use it because the + * implementation is flawed. There are cases which return incorrect + * results as late as linux-2.6.35. Until this is fixed upstream the + * spl must provide its own implementation. + * + * This implementation is a slightly modified version of the algorithm + * proposed by the book 'Hacker's Delight'. The original source can be + * found here and is available for use without restriction. + * + * http://www.hackersdelight.org/HDcode/newCode/divDouble.c + */ + +/* + * Calculate number of leading of zeros for a 64-bit value. + */ +static int +nlz64(uint64_t x) +{ + register int n = 0; + + if (x == 0) + return (64); + + if (x <= 0x00000000FFFFFFFFULL) { n = n + 32; x = x << 32; } + if (x <= 0x0000FFFFFFFFFFFFULL) { n = n + 16; x = x << 16; } + if (x <= 0x00FFFFFFFFFFFFFFULL) { n = n + 8; x = x << 8; } + if (x <= 0x0FFFFFFFFFFFFFFFULL) { n = n + 4; x = x << 4; } + if (x <= 0x3FFFFFFFFFFFFFFFULL) { n = n + 2; x = x << 2; } + if (x <= 0x7FFFFFFFFFFFFFFFULL) { n = n + 1; } + + return (n); +} + +/* + * Newer kernels have a div_u64() function but we define our own + * to simplify portability between kernel versions. + */ +static inline uint64_t +__div_u64(uint64_t u, uint32_t v) +{ + (void) do_div(u, v); + return (u); +} + +/* + * Implementation of 64-bit unsigned division for 32-bit machines. + * + * First the procedure takes care of the case in which the divisor is a + * 32-bit quantity. There are two subcases: (1) If the left half of the + * dividend is less than the divisor, one execution of do_div() is all that + * is required (overflow is not possible). (2) Otherwise it does two + * divisions, using the grade school method. + */ +uint64_t +__udivdi3(uint64_t u, uint64_t v) +{ + uint64_t u0, u1, v1, q0, q1, k; + int n; + + if (v >> 32 == 0) { // If v < 2**32: + if (u >> 32 < v) { // If u/v cannot overflow, + return (__div_u64(u, v)); // just do one division. + } else { // If u/v would overflow: + u1 = u >> 32; // Break u into two halves. 
+ u0 = u & 0xFFFFFFFF; + q1 = __div_u64(u1, v); // First quotient digit. + k = u1 - q1 * v; // First remainder, < v. + u0 += (k << 32); + q0 = __div_u64(u0, v); // Seconds quotient digit. + return ((q1 << 32) + q0); + } + } else { // If v >= 2**32: + n = nlz64(v); // 0 <= n <= 31. + v1 = (v << n) >> 32; // Normalize divisor, MSB is 1. + u1 = u >> 1; // To ensure no overflow. + q1 = __div_u64(u1, v1); // Get quotient from + q0 = (q1 << n) >> 31; // Undo normalization and + // division of u by 2. + if (q0 != 0) // Make q0 correct or + q0 = q0 - 1; // too small by 1. + if ((u - q0 * v) >= v) + q0 = q0 + 1; // Now q0 is correct. + + return (q0); + } +} +EXPORT_SYMBOL(__udivdi3); + +/* BEGIN CSTYLED */ +#ifndef abs64 +#define abs64(x) ({ uint64_t t = (x) >> 63; ((x) ^ t) - t; }) +#endif +/* END CSTYLED */ + +/* + * Implementation of 64-bit signed division for 32-bit machines. + */ +int64_t +__divdi3(int64_t u, int64_t v) +{ + int64_t q, t; + q = __udivdi3(abs64(u), abs64(v)); + t = (u ^ v) >> 63; // If u, v have different + return ((q ^ t) - t); // signs, negate q. +} +EXPORT_SYMBOL(__divdi3); + +/* + * Implementation of 64-bit unsigned modulo for 32-bit machines. + */ +uint64_t +__umoddi3(uint64_t dividend, uint64_t divisor) +{ + return (dividend - (divisor * __udivdi3(dividend, divisor))); +} +EXPORT_SYMBOL(__umoddi3); + +/* + * Implementation of 64-bit unsigned division/modulo for 32-bit machines. + */ +uint64_t +__udivmoddi4(uint64_t n, uint64_t d, uint64_t *r) +{ + uint64_t q = __udivdi3(n, d); + if (r) + *r = n - d * q; + return (q); +} +EXPORT_SYMBOL(__udivmoddi4); + +/* + * Implementation of 64-bit signed division/modulo for 32-bit machines. + */ +int64_t +__divmoddi4(int64_t n, int64_t d, int64_t *r) +{ + int64_t q, rr; + boolean_t nn = B_FALSE; + boolean_t nd = B_FALSE; + if (n < 0) { + nn = B_TRUE; + n = -n; + } + if (d < 0) { + nd = B_TRUE; + d = -d; + } + + q = __udivmoddi4(n, d, (uint64_t *)&rr); + + if (nn != nd) + q = -q; + if (nn) + rr = -rr; + if (r) + *r = rr; + return (q); +} +EXPORT_SYMBOL(__divmoddi4); + +#if defined(__arm) || defined(__arm__) +/* + * Implementation of 64-bit (un)signed division for 32-bit arm machines. + * + * Run-time ABI for the ARM Architecture (page 20). A pair of (unsigned) + * long longs is returned in {{r0, r1}, {r2,r3}}, the quotient in {r0, r1}, + * and the remainder in {r2, r3}. The return type is specifically left + * set to 'void' to ensure the compiler does not overwrite these registers + * during the return. 
All results are in registers as per ABI + */ +void +__aeabi_uldivmod(uint64_t u, uint64_t v) +{ + uint64_t res; + uint64_t mod; + + res = __udivdi3(u, v); + mod = __umoddi3(u, v); + { + register uint32_t r0 asm("r0") = (res & 0xFFFFFFFF); + register uint32_t r1 asm("r1") = (res >> 32); + register uint32_t r2 asm("r2") = (mod & 0xFFFFFFFF); + register uint32_t r3 asm("r3") = (mod >> 32); + + /* BEGIN CSTYLED */ + asm volatile("" + : "+r"(r0), "+r"(r1), "+r"(r2),"+r"(r3) /* output */ + : "r"(r0), "r"(r1), "r"(r2), "r"(r3)); /* input */ + /* END CSTYLED */ + + return; /* r0; */ + } +} +EXPORT_SYMBOL(__aeabi_uldivmod); + +void +__aeabi_ldivmod(int64_t u, int64_t v) +{ + int64_t res; + uint64_t mod; + + res = __divdi3(u, v); + mod = __umoddi3(u, v); + { + register uint32_t r0 asm("r0") = (res & 0xFFFFFFFF); + register uint32_t r1 asm("r1") = (res >> 32); + register uint32_t r2 asm("r2") = (mod & 0xFFFFFFFF); + register uint32_t r3 asm("r3") = (mod >> 32); + + /* BEGIN CSTYLED */ + asm volatile("" + : "+r"(r0), "+r"(r1), "+r"(r2),"+r"(r3) /* output */ + : "r"(r0), "r"(r1), "r"(r2), "r"(r3)); /* input */ + /* END CSTYLED */ + + return; /* r0; */ + } +} +EXPORT_SYMBOL(__aeabi_ldivmod); +#endif /* __arm || __arm__ */ +#endif /* BITS_PER_LONG */ + +/* + * NOTE: The strtoxx behavior is solely based on my reading of the Solaris + * ddi_strtol(9F) man page. I have not verified the behavior of these + * functions against their Solaris counterparts. It is possible that I + * may have misinterpreted the man page or the man page is incorrect. + */ +int ddi_strtoul(const char *, char **, int, unsigned long *); +int ddi_strtol(const char *, char **, int, long *); +int ddi_strtoull(const char *, char **, int, unsigned long long *); +int ddi_strtoll(const char *, char **, int, long long *); + +#define define_ddi_strtoux(type, valtype) \ +int ddi_strtou##type(const char *str, char **endptr, \ + int base, valtype *result) \ +{ \ + valtype last_value, value = 0; \ + char *ptr = (char *)str; \ + int flag = 1, digit; \ + \ + if (strlen(ptr) == 0) \ + return (EINVAL); \ + \ + /* Auto-detect base based on prefix */ \ + if (!base) { \ + if (str[0] == '0') { \ + if (tolower(str[1]) == 'x' && isxdigit(str[2])) { \ + base = 16; /* hex */ \ + ptr += 2; \ + } else if (str[1] >= '0' && str[1] < 8) { \ + base = 8; /* octal */ \ + ptr += 1; \ + } else { \ + return (EINVAL); \ + } \ + } else { \ + base = 10; /* decimal */ \ + } \ + } \ + \ + while (1) { \ + if (isdigit(*ptr)) \ + digit = *ptr - '0'; \ + else if (isalpha(*ptr)) \ + digit = tolower(*ptr) - 'a' + 10; \ + else \ + break; \ + \ + if (digit >= base) \ + break; \ + \ + last_value = value; \ + value = value * base + digit; \ + if (last_value > value) /* Overflow */ \ + return (ERANGE); \ + \ + flag = 1; \ + ptr++; \ + } \ + \ + if (flag) \ + *result = value; \ + \ + if (endptr) \ + *endptr = (char *)(flag ? 
ptr : str); \ + \ + return (0); \ +} \ + +#define define_ddi_strtox(type, valtype) \ +int ddi_strto##type(const char *str, char **endptr, \ + int base, valtype *result) \ +{ \ + int rc; \ + \ + if (*str == '-') { \ + rc = ddi_strtou##type(str + 1, endptr, base, result); \ + if (!rc) { \ + if (*endptr == str + 1) \ + *endptr = (char *)str; \ + else \ + *result = -*result; \ + } \ + } else { \ + rc = ddi_strtou##type(str, endptr, base, result); \ + } \ + \ + return (rc); \ +} + +define_ddi_strtoux(l, unsigned long) +define_ddi_strtox(l, long) +define_ddi_strtoux(ll, unsigned long long) +define_ddi_strtox(ll, long long) + +EXPORT_SYMBOL(ddi_strtoul); +EXPORT_SYMBOL(ddi_strtol); +EXPORT_SYMBOL(ddi_strtoll); +EXPORT_SYMBOL(ddi_strtoull); + +int +ddi_copyin(const void *from, void *to, size_t len, int flags) +{ + /* Fake ioctl() issued by kernel, 'from' is a kernel address */ + if (flags & FKIOCTL) { + memcpy(to, from, len); + return (0); + } + + return (copyin(from, to, len)); +} +EXPORT_SYMBOL(ddi_copyin); + +int +ddi_copyout(const void *from, void *to, size_t len, int flags) +{ + /* Fake ioctl() issued by kernel, 'from' is a kernel address */ + if (flags & FKIOCTL) { + memcpy(to, from, len); + return (0); + } + + return (copyout(from, to, len)); +} +EXPORT_SYMBOL(ddi_copyout); + +/* + * Read the unique system identifier from the /etc/hostid file. + * + * The behavior of /usr/bin/hostid on Linux systems with the + * regular eglibc and coreutils is: + * + * 1. Generate the value if the /etc/hostid file does not exist + * or if the /etc/hostid file is less than four bytes in size. + * + * 2. If the /etc/hostid file is at least 4 bytes, then return + * the first four bytes [0..3] in native endian order. + * + * 3. Always ignore bytes [4..] if they exist in the file. + * + * Only the first four bytes are significant, even on systems that + * have a 64-bit word size. + * + * See: + * + * eglibc: sysdeps/unix/sysv/linux/gethostid.c + * coreutils: src/hostid.c + * + * Notes: + * + * The /etc/hostid file on Solaris is a text file that often reads: + * + * # DO NOT EDIT + * "0123456789" + * + * Directly copying this file to Linux results in a constant + * hostid of 4f442023 because the default comment constitutes + * the first four bytes of the file. + * + */ + +char *spl_hostid_path = HW_HOSTID_PATH; +module_param(spl_hostid_path, charp, 0444); +MODULE_PARM_DESC(spl_hostid_path, "The system hostid file (/etc/hostid)"); + +static int +hostid_read(uint32_t *hostid) +{ + uint64_t size; + struct _buf *file; + uint32_t value = 0; + int error; + + file = kobj_open_file(spl_hostid_path); + if (file == (struct _buf *)-1) + return (ENOENT); + + error = kobj_get_filesize(file, &size); + if (error) { + kobj_close_file(file); + return (error); + } + + if (size < sizeof (HW_HOSTID_MASK)) { + kobj_close_file(file); + return (EINVAL); + } + + /* + * Read directly into the variable like eglibc does. + * Short reads are okay; native behavior is preserved. + */ + error = kobj_read_file(file, (char *)&value, sizeof (value), 0); + if (error < 0) { + kobj_close_file(file); + return (EIO); + } + + /* Mask down to 32 bits like coreutils does. */ + *hostid = (value & HW_HOSTID_MASK); + kobj_close_file(file); + + return (0); +} + +/* + * Return the system hostid. Preferentially use the spl_hostid module option + * when set, otherwise use the value in the /etc/hostid file. 
+ */ +uint32_t +zone_get_hostid(void *zone) +{ + uint32_t hostid; + + ASSERT3P(zone, ==, NULL); + + if (spl_hostid != 0) + return ((uint32_t)(spl_hostid & HW_HOSTID_MASK)); + + if (hostid_read(&hostid) == 0) + return (hostid); + + return (0); +} +EXPORT_SYMBOL(zone_get_hostid); + +static int +spl_kvmem_init(void) +{ + int rc = 0; + + rc = spl_kmem_init(); + if (rc) + return (rc); + + rc = spl_vmem_init(); + if (rc) { + spl_kmem_fini(); + return (rc); + } + + return (rc); +} + +/* + * We initialize the random number generator with 128 bits of entropy from the + * system random number generator. In the improbable case that we have a zero + * seed, we fallback to the system jiffies, unless it is also zero, in which + * situation we use a preprogrammed seed. We step forward by 2^64 iterations to + * initialize each of the per-cpu seeds so that the sequences generated on each + * CPU are guaranteed to never overlap in practice. + */ +static void __init +spl_random_init(void) +{ + uint64_t s[2]; + int i; + + get_random_bytes(s, sizeof (s)); + + if (s[0] == 0 && s[1] == 0) { + if (jiffies != 0) { + s[0] = jiffies; + s[1] = ~0 - jiffies; + } else { + (void) memcpy(s, "improbable seed", sizeof (s)); + } + printk("SPL: get_random_bytes() returned 0 " + "when generating random seed. Setting initial seed to " + "0x%016llx%016llx.\n", cpu_to_be64(s[0]), + cpu_to_be64(s[1])); + } + + for_each_possible_cpu(i) { + uint64_t *wordp = per_cpu(spl_pseudo_entropy, i); + + spl_rand_jump(s); + + wordp[0] = s[0]; + wordp[1] = s[1]; + } +} + +static void +spl_kvmem_fini(void) +{ + spl_vmem_fini(); + spl_kmem_fini(); +} + +static int __init +spl_init(void) +{ + int rc = 0; + + bzero(&p0, sizeof (proc_t)); + spl_random_init(); + + if ((rc = spl_kvmem_init())) + goto out1; + + if ((rc = spl_tsd_init())) + goto out2; + + if ((rc = spl_taskq_init())) + goto out3; + + if ((rc = spl_kmem_cache_init())) + goto out4; + + if ((rc = spl_vn_init())) + goto out5; + + if ((rc = spl_proc_init())) + goto out6; + + if ((rc = spl_kstat_init())) + goto out7; + + if ((rc = spl_zlib_init())) + goto out8; + + return (rc); + +out8: + spl_kstat_fini(); +out7: + spl_proc_fini(); +out6: + spl_vn_fini(); +out5: + spl_kmem_cache_fini(); +out4: + spl_taskq_fini(); +out3: + spl_tsd_fini(); +out2: + spl_kvmem_fini(); +out1: + return (rc); +} + +static void __exit +spl_fini(void) +{ + spl_zlib_fini(); + spl_kstat_fini(); + spl_proc_fini(); + spl_vn_fini(); + spl_kmem_cache_fini(); + spl_taskq_fini(); + spl_tsd_fini(); + spl_kvmem_fini(); +} + +module_init(spl_init); +module_exit(spl_fini); + +MODULE_DESCRIPTION("Solaris Porting Layer"); +MODULE_AUTHOR(ZFS_META_AUTHOR); +MODULE_LICENSE("GPL"); +MODULE_VERSION(ZFS_META_VERSION "-" ZFS_META_RELEASE); diff --git a/module/os/linux/spl/spl-kmem-cache.c b/module/os/linux/spl/spl-kmem-cache.c new file mode 100644 index 000000000..b39867b03 --- /dev/null +++ b/module/os/linux/spl/spl-kmem-cache.c @@ -0,0 +1,1780 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf <[email protected]>. + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see <http://zfsonlinux.org/>. 
+ * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <sys/kmem.h> +#include <sys/kmem_cache.h> +#include <sys/shrinker.h> +#include <sys/taskq.h> +#include <sys/timer.h> +#include <sys/vmem.h> +#include <sys/wait.h> +#include <linux/slab.h> +#include <linux/swap.h> +#include <linux/prefetch.h> + +/* + * Within the scope of spl-kmem.c file the kmem_cache_* definitions + * are removed to allow access to the real Linux slab allocator. + */ +#undef kmem_cache_destroy +#undef kmem_cache_create +#undef kmem_cache_alloc +#undef kmem_cache_free + + +/* + * Linux 3.16 replaced smp_mb__{before,after}_{atomic,clear}_{dec,inc,bit}() + * with smp_mb__{before,after}_atomic() because they were redundant. This is + * only used inside our SLAB allocator, so we implement an internal wrapper + * here to give us smp_mb__{before,after}_atomic() on older kernels. + */ +#ifndef smp_mb__before_atomic +#define smp_mb__before_atomic(x) smp_mb__before_clear_bit(x) +#endif + +#ifndef smp_mb__after_atomic +#define smp_mb__after_atomic(x) smp_mb__after_clear_bit(x) +#endif + +/* + * Cache expiration was implemented because it was part of the default Solaris + * kmem_cache behavior. The idea is that per-cpu objects which haven't been + * accessed in several seconds should be returned to the cache. On the other + * hand Linux slabs never move objects back to the slabs unless there is + * memory pressure on the system. By default the Linux method is enabled + * because it has been shown to improve responsiveness on low memory systems. + * This policy may be changed by setting KMC_EXPIRE_AGE or KMC_EXPIRE_MEM. + */ +/* BEGIN CSTYLED */ +unsigned int spl_kmem_cache_expire = KMC_EXPIRE_MEM; +EXPORT_SYMBOL(spl_kmem_cache_expire); +module_param(spl_kmem_cache_expire, uint, 0644); +MODULE_PARM_DESC(spl_kmem_cache_expire, "By age (0x1) or low memory (0x2)"); + +/* + * Cache magazines are an optimization designed to minimize the cost of + * allocating memory. They do this by keeping a per-cpu cache of recently + * freed objects, which can then be reallocated without taking a lock. This + * can improve performance on highly contended caches. However, because + * objects in magazines will prevent otherwise empty slabs from being + * immediately released this may not be ideal for low memory machines. + * + * For this reason spl_kmem_cache_magazine_size can be used to set a maximum + * magazine size. When this value is set to 0 the magazine size will be + * automatically determined based on the object size. Otherwise magazines + * will be limited to 2-256 objects per magazine (i.e per cpu). Magazines + * may never be entirely disabled in this implementation. 
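+ *
+ * Load-time example (the value 64 is purely illustrative, and the spl
+ * module name is assumed); because the parameter is registered 0444
+ * below it cannot be changed once the module is loaded:
+ *
+ *   # modprobe spl spl_kmem_cache_magazine_size=64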
+ */ +unsigned int spl_kmem_cache_magazine_size = 0; +module_param(spl_kmem_cache_magazine_size, uint, 0444); +MODULE_PARM_DESC(spl_kmem_cache_magazine_size, + "Default magazine size (2-256), set automatically (0)"); + +/* + * The default behavior is to report the number of objects remaining in the + * cache. This allows the Linux VM to repeatedly reclaim objects from the + * cache when memory is low satisfy other memory allocations. Alternately, + * setting this value to KMC_RECLAIM_ONCE limits how aggressively the cache + * is reclaimed. This may increase the likelihood of out of memory events. + */ +unsigned int spl_kmem_cache_reclaim = 0 /* KMC_RECLAIM_ONCE */; +module_param(spl_kmem_cache_reclaim, uint, 0644); +MODULE_PARM_DESC(spl_kmem_cache_reclaim, "Single reclaim pass (0x1)"); + +unsigned int spl_kmem_cache_obj_per_slab = SPL_KMEM_CACHE_OBJ_PER_SLAB; +module_param(spl_kmem_cache_obj_per_slab, uint, 0644); +MODULE_PARM_DESC(spl_kmem_cache_obj_per_slab, "Number of objects per slab"); + +unsigned int spl_kmem_cache_obj_per_slab_min = SPL_KMEM_CACHE_OBJ_PER_SLAB_MIN; +module_param(spl_kmem_cache_obj_per_slab_min, uint, 0644); +MODULE_PARM_DESC(spl_kmem_cache_obj_per_slab_min, + "Minimal number of objects per slab"); + +unsigned int spl_kmem_cache_max_size = SPL_KMEM_CACHE_MAX_SIZE; +module_param(spl_kmem_cache_max_size, uint, 0644); +MODULE_PARM_DESC(spl_kmem_cache_max_size, "Maximum size of slab in MB"); + +/* + * For small objects the Linux slab allocator should be used to make the most + * efficient use of the memory. However, large objects are not supported by + * the Linux slab and therefore the SPL implementation is preferred. A cutoff + * of 16K was determined to be optimal for architectures using 4K pages. + */ +#if PAGE_SIZE == 4096 +unsigned int spl_kmem_cache_slab_limit = 16384; +#else +unsigned int spl_kmem_cache_slab_limit = 0; +#endif +module_param(spl_kmem_cache_slab_limit, uint, 0644); +MODULE_PARM_DESC(spl_kmem_cache_slab_limit, + "Objects less than N bytes use the Linux slab"); + +/* + * This value defaults to a threshold designed to avoid allocations which + * have been deemed costly by the kernel. + */ +unsigned int spl_kmem_cache_kmem_limit = + ((1 << (PAGE_ALLOC_COSTLY_ORDER - 1)) * PAGE_SIZE) / + SPL_KMEM_CACHE_OBJ_PER_SLAB; +module_param(spl_kmem_cache_kmem_limit, uint, 0644); +MODULE_PARM_DESC(spl_kmem_cache_kmem_limit, + "Objects less than N bytes use the kmalloc"); + +/* + * The number of threads available to allocate new slabs for caches. This + * should not need to be tuned but it is available for performance analysis. + */ +unsigned int spl_kmem_cache_kmem_threads = 4; +module_param(spl_kmem_cache_kmem_threads, uint, 0444); +MODULE_PARM_DESC(spl_kmem_cache_kmem_threads, + "Number of spl_kmem_cache threads"); +/* END CSTYLED */ + +/* + * Slab allocation interfaces + * + * While the Linux slab implementation was inspired by the Solaris + * implementation I cannot use it to emulate the Solaris APIs. I + * require two features which are not provided by the Linux slab. + * + * 1) Constructors AND destructors. Recent versions of the Linux + * kernel have removed support for destructors. This is a deal + * breaker for the SPL which contains particularly expensive + * initializers for mutex's, condition variables, etc. We also + * require a minimal level of cleanup for these data types unlike + * many Linux data types which do need to be explicitly destroyed. + * + * 2) Virtual address space backed slab. 
Callers of the Solaris slab + * expect it to work well for both small are very large allocations. + * Because of memory fragmentation the Linux slab which is backed + * by kmalloc'ed memory performs very badly when confronted with + * large numbers of large allocations. Basing the slab on the + * virtual address space removes the need for contiguous pages + * and greatly improve performance for large allocations. + * + * For these reasons, the SPL has its own slab implementation with + * the needed features. It is not as highly optimized as either the + * Solaris or Linux slabs, but it should get me most of what is + * needed until it can be optimized or obsoleted by another approach. + * + * One serious concern I do have about this method is the relatively + * small virtual address space on 32bit arches. This will seriously + * constrain the size of the slab caches and their performance. + */ + +struct list_head spl_kmem_cache_list; /* List of caches */ +struct rw_semaphore spl_kmem_cache_sem; /* Cache list lock */ +taskq_t *spl_kmem_cache_taskq; /* Task queue for aging / reclaim */ + +static void spl_cache_shrink(spl_kmem_cache_t *skc, void *obj); + +SPL_SHRINKER_CALLBACK_FWD_DECLARE(spl_kmem_cache_generic_shrinker); +SPL_SHRINKER_DECLARE(spl_kmem_cache_shrinker, + spl_kmem_cache_generic_shrinker, KMC_DEFAULT_SEEKS); + +static void * +kv_alloc(spl_kmem_cache_t *skc, int size, int flags) +{ + gfp_t lflags = kmem_flags_convert(flags); + void *ptr; + + if (skc->skc_flags & KMC_KMEM) { + ASSERT(ISP2(size)); + ptr = (void *)__get_free_pages(lflags, get_order(size)); + } else { + ptr = __vmalloc(size, lflags | __GFP_HIGHMEM, PAGE_KERNEL); + } + + /* Resulting allocated memory will be page aligned */ + ASSERT(IS_P2ALIGNED(ptr, PAGE_SIZE)); + + return (ptr); +} + +static void +kv_free(spl_kmem_cache_t *skc, void *ptr, int size) +{ + ASSERT(IS_P2ALIGNED(ptr, PAGE_SIZE)); + + /* + * The Linux direct reclaim path uses this out of band value to + * determine if forward progress is being made. Normally this is + * incremented by kmem_freepages() which is part of the various + * Linux slab implementations. However, since we are using none + * of that infrastructure we are responsible for incrementing it. + */ + if (current->reclaim_state) + current->reclaim_state->reclaimed_slab += size >> PAGE_SHIFT; + + if (skc->skc_flags & KMC_KMEM) { + ASSERT(ISP2(size)); + free_pages((unsigned long)ptr, get_order(size)); + } else { + vfree(ptr); + } +} + +/* + * Required space for each aligned sks. + */ +static inline uint32_t +spl_sks_size(spl_kmem_cache_t *skc) +{ + return (P2ROUNDUP_TYPED(sizeof (spl_kmem_slab_t), + skc->skc_obj_align, uint32_t)); +} + +/* + * Required space for each aligned object. + */ +static inline uint32_t +spl_obj_size(spl_kmem_cache_t *skc) +{ + uint32_t align = skc->skc_obj_align; + + return (P2ROUNDUP_TYPED(skc->skc_obj_size, align, uint32_t) + + P2ROUNDUP_TYPED(sizeof (spl_kmem_obj_t), align, uint32_t)); +} + +/* + * Lookup the spl_kmem_object_t for an object given that object. + */ +static inline spl_kmem_obj_t * +spl_sko_from_obj(spl_kmem_cache_t *skc, void *obj) +{ + return (obj + P2ROUNDUP_TYPED(skc->skc_obj_size, + skc->skc_obj_align, uint32_t)); +} + +/* + * Required space for each offslab object taking in to account alignment + * restrictions and the power-of-two requirement of kv_alloc(). 
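+ *
+ * Worked example: for an aligned object size of 24 KiB (0x6000) fls64()
+ * returns 15, so each offslab object is backed by a 1 << (15 + 1) = 64 KiB
+ * allocation.  The result is always a power of two at least twice
+ * spl_obj_size(), satisfying the kv_alloc() requirement above.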
+ */ +static inline uint32_t +spl_offslab_size(spl_kmem_cache_t *skc) +{ + return (1UL << (fls64(spl_obj_size(skc)) + 1)); +} + +/* + * It's important that we pack the spl_kmem_obj_t structure and the + * actual objects in to one large address space to minimize the number + * of calls to the allocator. It is far better to do a few large + * allocations and then subdivide it ourselves. Now which allocator + * we use requires balancing a few trade offs. + * + * For small objects we use kmem_alloc() because as long as you are + * only requesting a small number of pages (ideally just one) its cheap. + * However, when you start requesting multiple pages with kmem_alloc() + * it gets increasingly expensive since it requires contiguous pages. + * For this reason we shift to vmem_alloc() for slabs of large objects + * which removes the need for contiguous pages. We do not use + * vmem_alloc() in all cases because there is significant locking + * overhead in __get_vm_area_node(). This function takes a single + * global lock when acquiring an available virtual address range which + * serializes all vmem_alloc()'s for all slab caches. Using slightly + * different allocation functions for small and large objects should + * give us the best of both worlds. + * + * KMC_ONSLAB KMC_OFFSLAB + * + * +------------------------+ +-----------------+ + * | spl_kmem_slab_t --+-+ | | spl_kmem_slab_t |---+-+ + * | skc_obj_size <-+ | | +-----------------+ | | + * | spl_kmem_obj_t | | | | + * | skc_obj_size <---+ | +-----------------+ | | + * | spl_kmem_obj_t | | | skc_obj_size | <-+ | + * | ... v | | spl_kmem_obj_t | | + * +------------------------+ +-----------------+ v + */ +static spl_kmem_slab_t * +spl_slab_alloc(spl_kmem_cache_t *skc, int flags) +{ + spl_kmem_slab_t *sks; + spl_kmem_obj_t *sko, *n; + void *base, *obj; + uint32_t obj_size, offslab_size = 0; + int i, rc = 0; + + base = kv_alloc(skc, skc->skc_slab_size, flags); + if (base == NULL) + return (NULL); + + sks = (spl_kmem_slab_t *)base; + sks->sks_magic = SKS_MAGIC; + sks->sks_objs = skc->skc_slab_objs; + sks->sks_age = jiffies; + sks->sks_cache = skc; + INIT_LIST_HEAD(&sks->sks_list); + INIT_LIST_HEAD(&sks->sks_free_list); + sks->sks_ref = 0; + obj_size = spl_obj_size(skc); + + if (skc->skc_flags & KMC_OFFSLAB) + offslab_size = spl_offslab_size(skc); + + for (i = 0; i < sks->sks_objs; i++) { + if (skc->skc_flags & KMC_OFFSLAB) { + obj = kv_alloc(skc, offslab_size, flags); + if (!obj) { + rc = -ENOMEM; + goto out; + } + } else { + obj = base + spl_sks_size(skc) + (i * obj_size); + } + + ASSERT(IS_P2ALIGNED(obj, skc->skc_obj_align)); + sko = spl_sko_from_obj(skc, obj); + sko->sko_addr = obj; + sko->sko_magic = SKO_MAGIC; + sko->sko_slab = sks; + INIT_LIST_HEAD(&sko->sko_list); + list_add_tail(&sko->sko_list, &sks->sks_free_list); + } + +out: + if (rc) { + if (skc->skc_flags & KMC_OFFSLAB) + list_for_each_entry_safe(sko, + n, &sks->sks_free_list, sko_list) { + kv_free(skc, sko->sko_addr, offslab_size); + } + + kv_free(skc, base, skc->skc_slab_size); + sks = NULL; + } + + return (sks); +} + +/* + * Remove a slab from complete or partial list, it must be called with + * the 'skc->skc_lock' held but the actual free must be performed + * outside the lock to prevent deadlocking on vmem addresses. 
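+ *
+ * The expected calling pattern, a sketch of what spl_slab_reclaim() below
+ * actually does, is roughly:
+ *
+ *   spin_lock(&skc->skc_lock);
+ *   spl_slab_free(sks, &sks_list, &sko_list);
+ *   spin_unlock(&skc->skc_lock);
+ *   ... kv_free() the collected objects and slabs outside the lock ...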
+ */ +static void +spl_slab_free(spl_kmem_slab_t *sks, + struct list_head *sks_list, struct list_head *sko_list) +{ + spl_kmem_cache_t *skc; + + ASSERT(sks->sks_magic == SKS_MAGIC); + ASSERT(sks->sks_ref == 0); + + skc = sks->sks_cache; + ASSERT(skc->skc_magic == SKC_MAGIC); + + /* + * Update slab/objects counters in the cache, then remove the + * slab from the skc->skc_partial_list. Finally add the slab + * and all its objects in to the private work lists where the + * destructors will be called and the memory freed to the system. + */ + skc->skc_obj_total -= sks->sks_objs; + skc->skc_slab_total--; + list_del(&sks->sks_list); + list_add(&sks->sks_list, sks_list); + list_splice_init(&sks->sks_free_list, sko_list); +} + +/* + * Reclaim empty slabs at the end of the partial list. + */ +static void +spl_slab_reclaim(spl_kmem_cache_t *skc) +{ + spl_kmem_slab_t *sks, *m; + spl_kmem_obj_t *sko, *n; + LIST_HEAD(sks_list); + LIST_HEAD(sko_list); + uint32_t size = 0; + + /* + * Empty slabs and objects must be moved to a private list so they + * can be safely freed outside the spin lock. All empty slabs are + * at the end of skc->skc_partial_list, therefore once a non-empty + * slab is found we can stop scanning. + */ + spin_lock(&skc->skc_lock); + list_for_each_entry_safe_reverse(sks, m, + &skc->skc_partial_list, sks_list) { + + if (sks->sks_ref > 0) + break; + + spl_slab_free(sks, &sks_list, &sko_list); + } + spin_unlock(&skc->skc_lock); + + /* + * The following two loops ensure all the object destructors are + * run, any offslab objects are freed, and the slabs themselves + * are freed. This is all done outside the skc->skc_lock since + * this allows the destructor to sleep, and allows us to perform + * a conditional reschedule when a freeing a large number of + * objects and slabs back to the system. + */ + if (skc->skc_flags & KMC_OFFSLAB) + size = spl_offslab_size(skc); + + list_for_each_entry_safe(sko, n, &sko_list, sko_list) { + ASSERT(sko->sko_magic == SKO_MAGIC); + + if (skc->skc_flags & KMC_OFFSLAB) + kv_free(skc, sko->sko_addr, size); + } + + list_for_each_entry_safe(sks, m, &sks_list, sks_list) { + ASSERT(sks->sks_magic == SKS_MAGIC); + kv_free(skc, sks, skc->skc_slab_size); + } +} + +static spl_kmem_emergency_t * +spl_emergency_search(struct rb_root *root, void *obj) +{ + struct rb_node *node = root->rb_node; + spl_kmem_emergency_t *ske; + unsigned long address = (unsigned long)obj; + + while (node) { + ske = container_of(node, spl_kmem_emergency_t, ske_node); + + if (address < ske->ske_obj) + node = node->rb_left; + else if (address > ske->ske_obj) + node = node->rb_right; + else + return (ske); + } + + return (NULL); +} + +static int +spl_emergency_insert(struct rb_root *root, spl_kmem_emergency_t *ske) +{ + struct rb_node **new = &(root->rb_node), *parent = NULL; + spl_kmem_emergency_t *ske_tmp; + unsigned long address = ske->ske_obj; + + while (*new) { + ske_tmp = container_of(*new, spl_kmem_emergency_t, ske_node); + + parent = *new; + if (address < ske_tmp->ske_obj) + new = &((*new)->rb_left); + else if (address > ske_tmp->ske_obj) + new = &((*new)->rb_right); + else + return (0); + } + + rb_link_node(&ske->ske_node, parent, new); + rb_insert_color(&ske->ske_node, root); + + return (1); +} + +/* + * Allocate a single emergency object and track it in a red black tree. 
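+ * The tree is keyed on the object's address, which is what later allows
+ * spl_emergency_free() to determine whether a pointer being freed refers
+ * to an emergency object rather than a regular slab object.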
+ */ +static int +spl_emergency_alloc(spl_kmem_cache_t *skc, int flags, void **obj) +{ + gfp_t lflags = kmem_flags_convert(flags); + spl_kmem_emergency_t *ske; + int order = get_order(skc->skc_obj_size); + int empty; + + /* Last chance use a partial slab if one now exists */ + spin_lock(&skc->skc_lock); + empty = list_empty(&skc->skc_partial_list); + spin_unlock(&skc->skc_lock); + if (!empty) + return (-EEXIST); + + ske = kmalloc(sizeof (*ske), lflags); + if (ske == NULL) + return (-ENOMEM); + + ske->ske_obj = __get_free_pages(lflags, order); + if (ske->ske_obj == 0) { + kfree(ske); + return (-ENOMEM); + } + + spin_lock(&skc->skc_lock); + empty = spl_emergency_insert(&skc->skc_emergency_tree, ske); + if (likely(empty)) { + skc->skc_obj_total++; + skc->skc_obj_emergency++; + if (skc->skc_obj_emergency > skc->skc_obj_emergency_max) + skc->skc_obj_emergency_max = skc->skc_obj_emergency; + } + spin_unlock(&skc->skc_lock); + + if (unlikely(!empty)) { + free_pages(ske->ske_obj, order); + kfree(ske); + return (-EINVAL); + } + + *obj = (void *)ske->ske_obj; + + return (0); +} + +/* + * Locate the passed object in the red black tree and free it. + */ +static int +spl_emergency_free(spl_kmem_cache_t *skc, void *obj) +{ + spl_kmem_emergency_t *ske; + int order = get_order(skc->skc_obj_size); + + spin_lock(&skc->skc_lock); + ske = spl_emergency_search(&skc->skc_emergency_tree, obj); + if (ske) { + rb_erase(&ske->ske_node, &skc->skc_emergency_tree); + skc->skc_obj_emergency--; + skc->skc_obj_total--; + } + spin_unlock(&skc->skc_lock); + + if (ske == NULL) + return (-ENOENT); + + free_pages(ske->ske_obj, order); + kfree(ske); + + return (0); +} + +/* + * Release objects from the per-cpu magazine back to their slab. The flush + * argument contains the max number of entries to remove from the magazine. + */ +static void +__spl_cache_flush(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flush) +{ + int i, count = MIN(flush, skm->skm_avail); + + ASSERT(skc->skc_magic == SKC_MAGIC); + ASSERT(skm->skm_magic == SKM_MAGIC); + + for (i = 0; i < count; i++) + spl_cache_shrink(skc, skm->skm_objs[i]); + + skm->skm_avail -= count; + memmove(skm->skm_objs, &(skm->skm_objs[count]), + sizeof (void *) * skm->skm_avail); +} + +static void +spl_cache_flush(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flush) +{ + spin_lock(&skc->skc_lock); + __spl_cache_flush(skc, skm, flush); + spin_unlock(&skc->skc_lock); +} + +static void +spl_magazine_age(void *data) +{ + spl_kmem_cache_t *skc = (spl_kmem_cache_t *)data; + spl_kmem_magazine_t *skm = skc->skc_mag[smp_processor_id()]; + + ASSERT(skm->skm_magic == SKM_MAGIC); + ASSERT(skm->skm_cpu == smp_processor_id()); + ASSERT(irqs_disabled()); + + /* There are no available objects or they are too young to age out */ + if ((skm->skm_avail == 0) || + time_before(jiffies, skm->skm_age + skc->skc_delay * HZ)) + return; + + /* + * Because we're executing in interrupt context we may have + * interrupted the holder of this lock. To avoid a potential + * deadlock return if the lock is contended. + */ + if (!spin_trylock(&skc->skc_lock)) + return; + + __spl_cache_flush(skc, skm, skm->skm_refill); + spin_unlock(&skc->skc_lock); +} + +/* + * Called regularly to keep a downward pressure on the cache. + * + * Objects older than skc->skc_delay seconds in the per-cpu magazines will + * be returned to the caches. This is done to prevent idle magazines from + * holding memory which could be better used elsewhere. The delay is + * present to prevent thrashing the magazine. 
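+ *
+ * Note this aging path only runs when the KMC_EXPIRE_AGE bit (0x1) is set
+ * in the spl_kmem_cache_expire tunable; with the default KMC_EXPIRE_MEM
+ * policy the function below returns immediately.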
+ * + * The newly released objects may result in empty partial slabs. Those + * slabs should be released to the system. Otherwise moving the objects + * out of the magazines is just wasted work. + */ +static void +spl_cache_age(void *data) +{ + spl_kmem_cache_t *skc = (spl_kmem_cache_t *)data; + taskqid_t id = 0; + + ASSERT(skc->skc_magic == SKC_MAGIC); + + /* Dynamically disabled at run time */ + if (!(spl_kmem_cache_expire & KMC_EXPIRE_AGE)) + return; + + atomic_inc(&skc->skc_ref); + + if (!(skc->skc_flags & KMC_NOMAGAZINE)) + on_each_cpu(spl_magazine_age, skc, 1); + + spl_slab_reclaim(skc); + + while (!test_bit(KMC_BIT_DESTROY, &skc->skc_flags) && !id) { + id = taskq_dispatch_delay( + spl_kmem_cache_taskq, spl_cache_age, skc, TQ_SLEEP, + ddi_get_lbolt() + skc->skc_delay / 3 * HZ); + + /* Destroy issued after dispatch immediately cancel it */ + if (test_bit(KMC_BIT_DESTROY, &skc->skc_flags) && id) + taskq_cancel_id(spl_kmem_cache_taskq, id); + } + + spin_lock(&skc->skc_lock); + skc->skc_taskqid = id; + spin_unlock(&skc->skc_lock); + + atomic_dec(&skc->skc_ref); +} + +/* + * Size a slab based on the size of each aligned object plus spl_kmem_obj_t. + * When on-slab we want to target spl_kmem_cache_obj_per_slab. However, + * for very small objects we may end up with more than this so as not + * to waste space in the minimal allocation of a single page. Also for + * very large objects we may use as few as spl_kmem_cache_obj_per_slab_min, + * lower than this and we will fail. + */ +static int +spl_slab_size(spl_kmem_cache_t *skc, uint32_t *objs, uint32_t *size) +{ + uint32_t sks_size, obj_size, max_size, tgt_size, tgt_objs; + + if (skc->skc_flags & KMC_OFFSLAB) { + tgt_objs = spl_kmem_cache_obj_per_slab; + tgt_size = P2ROUNDUP(sizeof (spl_kmem_slab_t), PAGE_SIZE); + + if ((skc->skc_flags & KMC_KMEM) && + (spl_obj_size(skc) > (SPL_MAX_ORDER_NR_PAGES * PAGE_SIZE))) + return (-ENOSPC); + } else { + sks_size = spl_sks_size(skc); + obj_size = spl_obj_size(skc); + max_size = (spl_kmem_cache_max_size * 1024 * 1024); + tgt_size = (spl_kmem_cache_obj_per_slab * obj_size + sks_size); + + /* + * KMC_KMEM slabs are allocated by __get_free_pages() which + * rounds up to the nearest order. Knowing this the size + * should be rounded up to the next power of two with a hard + * maximum defined by the maximum allowed allocation order. + */ + if (skc->skc_flags & KMC_KMEM) { + max_size = SPL_MAX_ORDER_NR_PAGES * PAGE_SIZE; + tgt_size = MIN(max_size, + PAGE_SIZE * (1 << MAX(get_order(tgt_size) - 1, 1))); + } + + if (tgt_size <= max_size) { + tgt_objs = (tgt_size - sks_size) / obj_size; + } else { + tgt_objs = (max_size - sks_size) / obj_size; + tgt_size = (tgt_objs * obj_size) + sks_size; + } + } + + if (tgt_objs == 0) + return (-ENOSPC); + + *objs = tgt_objs; + *size = tgt_size; + + return (0); +} + +/* + * Make a guess at reasonable per-cpu magazine size based on the size of + * each object and the cost of caching N of them in each magazine. Long + * term this should really adapt based on an observed usage heuristic. 
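+ *
+ * For example, with 4 KiB pages a cache of 512 byte objects falls into
+ * the final bucket below and gets a 256 entry magazine (roughly 128 KiB
+ * of cached objects per cpu), while objects larger than one megabyte are
+ * capped at 4 entries per magazine.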
+ */ +static int +spl_magazine_size(spl_kmem_cache_t *skc) +{ + uint32_t obj_size = spl_obj_size(skc); + int size; + + if (spl_kmem_cache_magazine_size > 0) + return (MAX(MIN(spl_kmem_cache_magazine_size, 256), 2)); + + /* Per-magazine sizes below assume a 4Kib page size */ + if (obj_size > (PAGE_SIZE * 256)) + size = 4; /* Minimum 4Mib per-magazine */ + else if (obj_size > (PAGE_SIZE * 32)) + size = 16; /* Minimum 2Mib per-magazine */ + else if (obj_size > (PAGE_SIZE)) + size = 64; /* Minimum 256Kib per-magazine */ + else if (obj_size > (PAGE_SIZE / 4)) + size = 128; /* Minimum 128Kib per-magazine */ + else + size = 256; + + return (size); +} + +/* + * Allocate a per-cpu magazine to associate with a specific core. + */ +static spl_kmem_magazine_t * +spl_magazine_alloc(spl_kmem_cache_t *skc, int cpu) +{ + spl_kmem_magazine_t *skm; + int size = sizeof (spl_kmem_magazine_t) + + sizeof (void *) * skc->skc_mag_size; + + skm = kmalloc_node(size, GFP_KERNEL, cpu_to_node(cpu)); + if (skm) { + skm->skm_magic = SKM_MAGIC; + skm->skm_avail = 0; + skm->skm_size = skc->skc_mag_size; + skm->skm_refill = skc->skc_mag_refill; + skm->skm_cache = skc; + skm->skm_age = jiffies; + skm->skm_cpu = cpu; + } + + return (skm); +} + +/* + * Free a per-cpu magazine associated with a specific core. + */ +static void +spl_magazine_free(spl_kmem_magazine_t *skm) +{ + ASSERT(skm->skm_magic == SKM_MAGIC); + ASSERT(skm->skm_avail == 0); + kfree(skm); +} + +/* + * Create all pre-cpu magazines of reasonable sizes. + */ +static int +spl_magazine_create(spl_kmem_cache_t *skc) +{ + int i; + + if (skc->skc_flags & KMC_NOMAGAZINE) + return (0); + + skc->skc_mag = kzalloc(sizeof (spl_kmem_magazine_t *) * + num_possible_cpus(), kmem_flags_convert(KM_SLEEP)); + skc->skc_mag_size = spl_magazine_size(skc); + skc->skc_mag_refill = (skc->skc_mag_size + 1) / 2; + + for_each_possible_cpu(i) { + skc->skc_mag[i] = spl_magazine_alloc(skc, i); + if (!skc->skc_mag[i]) { + for (i--; i >= 0; i--) + spl_magazine_free(skc->skc_mag[i]); + + kfree(skc->skc_mag); + return (-ENOMEM); + } + } + + return (0); +} + +/* + * Destroy all pre-cpu magazines. 
+ */ +static void +spl_magazine_destroy(spl_kmem_cache_t *skc) +{ + spl_kmem_magazine_t *skm; + int i; + + if (skc->skc_flags & KMC_NOMAGAZINE) + return; + + for_each_possible_cpu(i) { + skm = skc->skc_mag[i]; + spl_cache_flush(skc, skm, skm->skm_avail); + spl_magazine_free(skm); + } + + kfree(skc->skc_mag); +} + +/* + * Create a object cache based on the following arguments: + * name cache name + * size cache object size + * align cache object alignment + * ctor cache object constructor + * dtor cache object destructor + * reclaim cache object reclaim + * priv cache private data for ctor/dtor/reclaim + * vmp unused must be NULL + * flags + * KMC_KMEM Force SPL kmem backed cache + * KMC_VMEM Force SPL vmem backed cache + * KMC_SLAB Force Linux slab backed cache + * KMC_OFFSLAB Locate objects off the slab + * KMC_NOTOUCH unsupported + * KMC_NODEBUG unsupported + * KMC_NOHASH unsupported + * KMC_QCACHE unsupported + * KMC_NOMAGAZINE unsupported + */ +spl_kmem_cache_t * +spl_kmem_cache_create(char *name, size_t size, size_t align, + spl_kmem_ctor_t ctor, spl_kmem_dtor_t dtor, spl_kmem_reclaim_t reclaim, + void *priv, void *vmp, int flags) +{ + gfp_t lflags = kmem_flags_convert(KM_SLEEP); + spl_kmem_cache_t *skc; + int rc; + + /* + * Unsupported flags + */ + ASSERT0(flags & KMC_NOMAGAZINE); + ASSERT0(flags & KMC_NOHASH); + ASSERT0(flags & KMC_QCACHE); + ASSERT(vmp == NULL); + + might_sleep(); + + skc = kzalloc(sizeof (*skc), lflags); + if (skc == NULL) + return (NULL); + + skc->skc_magic = SKC_MAGIC; + skc->skc_name_size = strlen(name) + 1; + skc->skc_name = (char *)kmalloc(skc->skc_name_size, lflags); + if (skc->skc_name == NULL) { + kfree(skc); + return (NULL); + } + strncpy(skc->skc_name, name, skc->skc_name_size); + + skc->skc_ctor = ctor; + skc->skc_dtor = dtor; + skc->skc_reclaim = reclaim; + skc->skc_private = priv; + skc->skc_vmp = vmp; + skc->skc_linux_cache = NULL; + skc->skc_flags = flags; + skc->skc_obj_size = size; + skc->skc_obj_align = SPL_KMEM_CACHE_ALIGN; + skc->skc_delay = SPL_KMEM_CACHE_DELAY; + skc->skc_reap = SPL_KMEM_CACHE_REAP; + atomic_set(&skc->skc_ref, 0); + + INIT_LIST_HEAD(&skc->skc_list); + INIT_LIST_HEAD(&skc->skc_complete_list); + INIT_LIST_HEAD(&skc->skc_partial_list); + skc->skc_emergency_tree = RB_ROOT; + spin_lock_init(&skc->skc_lock); + init_waitqueue_head(&skc->skc_waitq); + skc->skc_slab_fail = 0; + skc->skc_slab_create = 0; + skc->skc_slab_destroy = 0; + skc->skc_slab_total = 0; + skc->skc_slab_alloc = 0; + skc->skc_slab_max = 0; + skc->skc_obj_total = 0; + skc->skc_obj_alloc = 0; + skc->skc_obj_max = 0; + skc->skc_obj_deadlock = 0; + skc->skc_obj_emergency = 0; + skc->skc_obj_emergency_max = 0; + + /* + * Verify the requested alignment restriction is sane. + */ + if (align) { + VERIFY(ISP2(align)); + VERIFY3U(align, >=, SPL_KMEM_CACHE_ALIGN); + VERIFY3U(align, <=, PAGE_SIZE); + skc->skc_obj_align = align; + } + + /* + * When no specific type of slab is requested (kmem, vmem, or + * linuxslab) then select a cache type based on the object size + * and default tunables. + */ + if (!(skc->skc_flags & (KMC_KMEM | KMC_VMEM | KMC_SLAB))) { + + if (spl_kmem_cache_slab_limit && + size <= (size_t)spl_kmem_cache_slab_limit) { + /* + * Objects smaller than spl_kmem_cache_slab_limit can + * use the Linux slab for better space-efficiency. + */ + skc->skc_flags |= KMC_SLAB; + } else if (spl_obj_size(skc) <= spl_kmem_cache_kmem_limit) { + /* + * Small objects, less than spl_kmem_cache_kmem_limit + * per object should use kmem because their slabs are + * small. 
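+ *
+ * Note: with the default tunables on 4 KiB page systems
+ * spl_kmem_cache_kmem_limit works out well below
+ * spl_kmem_cache_slab_limit, so this branch is normally only
+ * reached when those limits have been retuned.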
+ */ + skc->skc_flags |= KMC_KMEM; + } else { + /* + * All other objects are considered large and are + * placed on vmem backed slabs. + */ + skc->skc_flags |= KMC_VMEM; + } + } + + /* + * Given the type of slab allocate the required resources. + */ + if (skc->skc_flags & (KMC_KMEM | KMC_VMEM)) { + rc = spl_slab_size(skc, + &skc->skc_slab_objs, &skc->skc_slab_size); + if (rc) + goto out; + + rc = spl_magazine_create(skc); + if (rc) + goto out; + } else { + unsigned long slabflags = 0; + + if (size > (SPL_MAX_KMEM_ORDER_NR_PAGES * PAGE_SIZE)) { + rc = EINVAL; + goto out; + } + +#if defined(SLAB_USERCOPY) + /* + * Required for PAX-enabled kernels if the slab is to be + * used for copying between user and kernel space. + */ + slabflags |= SLAB_USERCOPY; +#endif + +#if defined(HAVE_KMEM_CACHE_CREATE_USERCOPY) + /* + * Newer grsec patchset uses kmem_cache_create_usercopy() + * instead of SLAB_USERCOPY flag + */ + skc->skc_linux_cache = kmem_cache_create_usercopy( + skc->skc_name, size, align, slabflags, 0, size, NULL); +#else + skc->skc_linux_cache = kmem_cache_create( + skc->skc_name, size, align, slabflags, NULL); +#endif + if (skc->skc_linux_cache == NULL) { + rc = ENOMEM; + goto out; + } + +#if defined(HAVE_KMEM_CACHE_ALLOCFLAGS) + skc->skc_linux_cache->allocflags |= __GFP_COMP; +#elif defined(HAVE_KMEM_CACHE_GFPFLAGS) + skc->skc_linux_cache->gfpflags |= __GFP_COMP; +#endif + skc->skc_flags |= KMC_NOMAGAZINE; + } + + if (spl_kmem_cache_expire & KMC_EXPIRE_AGE) { + skc->skc_taskqid = taskq_dispatch_delay(spl_kmem_cache_taskq, + spl_cache_age, skc, TQ_SLEEP, + ddi_get_lbolt() + skc->skc_delay / 3 * HZ); + } + + down_write(&spl_kmem_cache_sem); + list_add_tail(&skc->skc_list, &spl_kmem_cache_list); + up_write(&spl_kmem_cache_sem); + + return (skc); +out: + kfree(skc->skc_name); + kfree(skc); + return (NULL); +} +EXPORT_SYMBOL(spl_kmem_cache_create); + +/* + * Register a move callback for cache defragmentation. + * XXX: Unimplemented but harmless to stub out for now. + */ +void +spl_kmem_cache_set_move(spl_kmem_cache_t *skc, + kmem_cbrc_t (move)(void *, void *, size_t, void *)) +{ + ASSERT(move != NULL); +} +EXPORT_SYMBOL(spl_kmem_cache_set_move); + +/* + * Destroy a cache and all objects associated with the cache. + */ +void +spl_kmem_cache_destroy(spl_kmem_cache_t *skc) +{ + DECLARE_WAIT_QUEUE_HEAD(wq); + taskqid_t id; + + ASSERT(skc->skc_magic == SKC_MAGIC); + ASSERT(skc->skc_flags & (KMC_KMEM | KMC_VMEM | KMC_SLAB)); + + down_write(&spl_kmem_cache_sem); + list_del_init(&skc->skc_list); + up_write(&spl_kmem_cache_sem); + + /* Cancel any and wait for any pending delayed tasks */ + VERIFY(!test_and_set_bit(KMC_BIT_DESTROY, &skc->skc_flags)); + + spin_lock(&skc->skc_lock); + id = skc->skc_taskqid; + spin_unlock(&skc->skc_lock); + + taskq_cancel_id(spl_kmem_cache_taskq, id); + + /* + * Wait until all current callers complete, this is mainly + * to catch the case where a low memory situation triggers a + * cache reaping action which races with this destroy. + */ + wait_event(wq, atomic_read(&skc->skc_ref) == 0); + + if (skc->skc_flags & (KMC_KMEM | KMC_VMEM)) { + spl_magazine_destroy(skc); + spl_slab_reclaim(skc); + } else { + ASSERT(skc->skc_flags & KMC_SLAB); + kmem_cache_destroy(skc->skc_linux_cache); + } + + spin_lock(&skc->skc_lock); + + /* + * Validate there are no objects in use and free all the + * spl_kmem_slab_t, spl_kmem_obj_t, and object buffers. 
+ */ + ASSERT3U(skc->skc_slab_alloc, ==, 0); + ASSERT3U(skc->skc_obj_alloc, ==, 0); + ASSERT3U(skc->skc_slab_total, ==, 0); + ASSERT3U(skc->skc_obj_total, ==, 0); + ASSERT3U(skc->skc_obj_emergency, ==, 0); + ASSERT(list_empty(&skc->skc_complete_list)); + + spin_unlock(&skc->skc_lock); + + kfree(skc->skc_name); + kfree(skc); +} +EXPORT_SYMBOL(spl_kmem_cache_destroy); + +/* + * Allocate an object from a slab attached to the cache. This is used to + * repopulate the per-cpu magazine caches in batches when they run low. + */ +static void * +spl_cache_obj(spl_kmem_cache_t *skc, spl_kmem_slab_t *sks) +{ + spl_kmem_obj_t *sko; + + ASSERT(skc->skc_magic == SKC_MAGIC); + ASSERT(sks->sks_magic == SKS_MAGIC); + + sko = list_entry(sks->sks_free_list.next, spl_kmem_obj_t, sko_list); + ASSERT(sko->sko_magic == SKO_MAGIC); + ASSERT(sko->sko_addr != NULL); + + /* Remove from sks_free_list */ + list_del_init(&sko->sko_list); + + sks->sks_age = jiffies; + sks->sks_ref++; + skc->skc_obj_alloc++; + + /* Track max obj usage statistics */ + if (skc->skc_obj_alloc > skc->skc_obj_max) + skc->skc_obj_max = skc->skc_obj_alloc; + + /* Track max slab usage statistics */ + if (sks->sks_ref == 1) { + skc->skc_slab_alloc++; + + if (skc->skc_slab_alloc > skc->skc_slab_max) + skc->skc_slab_max = skc->skc_slab_alloc; + } + + return (sko->sko_addr); +} + +/* + * Generic slab allocation function to run by the global work queues. + * It is responsible for allocating a new slab, linking it in to the list + * of partial slabs, and then waking any waiters. + */ +static int +__spl_cache_grow(spl_kmem_cache_t *skc, int flags) +{ + spl_kmem_slab_t *sks; + + fstrans_cookie_t cookie = spl_fstrans_mark(); + sks = spl_slab_alloc(skc, flags); + spl_fstrans_unmark(cookie); + + spin_lock(&skc->skc_lock); + if (sks) { + skc->skc_slab_total++; + skc->skc_obj_total += sks->sks_objs; + list_add_tail(&sks->sks_list, &skc->skc_partial_list); + + smp_mb__before_atomic(); + clear_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags); + smp_mb__after_atomic(); + wake_up_all(&skc->skc_waitq); + } + spin_unlock(&skc->skc_lock); + + return (sks == NULL ? -ENOMEM : 0); +} + +static void +spl_cache_grow_work(void *data) +{ + spl_kmem_alloc_t *ska = (spl_kmem_alloc_t *)data; + spl_kmem_cache_t *skc = ska->ska_cache; + + (void) __spl_cache_grow(skc, ska->ska_flags); + + atomic_dec(&skc->skc_ref); + smp_mb__before_atomic(); + clear_bit(KMC_BIT_GROWING, &skc->skc_flags); + smp_mb__after_atomic(); + + kfree(ska); +} + +/* + * Returns non-zero when a new slab should be available. + */ +static int +spl_cache_grow_wait(spl_kmem_cache_t *skc) +{ + return (!test_bit(KMC_BIT_GROWING, &skc->skc_flags)); +} + +/* + * No available objects on any slabs, create a new slab. Note that this + * functionality is disabled for KMC_SLAB caches which are backed by the + * Linux slab. + */ +static int +spl_cache_grow(spl_kmem_cache_t *skc, int flags, void **obj) +{ + int remaining, rc = 0; + + ASSERT0(flags & ~KM_PUBLIC_MASK); + ASSERT(skc->skc_magic == SKC_MAGIC); + ASSERT((skc->skc_flags & KMC_SLAB) == 0); + might_sleep(); + *obj = NULL; + + /* + * Before allocating a new slab wait for any reaping to complete and + * then return so the local magazine can be rechecked for new objects. + */ + if (test_bit(KMC_BIT_REAPING, &skc->skc_flags)) { + rc = spl_wait_on_bit(&skc->skc_flags, KMC_BIT_REAPING, + TASK_UNINTERRUPTIBLE); + return (rc ? 
rc : -EAGAIN); + } + + /* + * To reduce the overhead of context switch and improve NUMA locality, + * it tries to allocate a new slab in the current process context with + * KM_NOSLEEP flag. If it fails, it will launch a new taskq to do the + * allocation. + * + * However, this can't be applied to KVM_VMEM due to a bug that + * __vmalloc() doesn't honor gfp flags in page table allocation. + */ + if (!(skc->skc_flags & KMC_VMEM)) { + rc = __spl_cache_grow(skc, flags | KM_NOSLEEP); + if (rc == 0) + return (0); + } + + /* + * This is handled by dispatching a work request to the global work + * queue. This allows us to asynchronously allocate a new slab while + * retaining the ability to safely fall back to a smaller synchronous + * allocations to ensure forward progress is always maintained. + */ + if (test_and_set_bit(KMC_BIT_GROWING, &skc->skc_flags) == 0) { + spl_kmem_alloc_t *ska; + + ska = kmalloc(sizeof (*ska), kmem_flags_convert(flags)); + if (ska == NULL) { + clear_bit_unlock(KMC_BIT_GROWING, &skc->skc_flags); + smp_mb__after_atomic(); + wake_up_all(&skc->skc_waitq); + return (-ENOMEM); + } + + atomic_inc(&skc->skc_ref); + ska->ska_cache = skc; + ska->ska_flags = flags; + taskq_init_ent(&ska->ska_tqe); + taskq_dispatch_ent(spl_kmem_cache_taskq, + spl_cache_grow_work, ska, 0, &ska->ska_tqe); + } + + /* + * The goal here is to only detect the rare case where a virtual slab + * allocation has deadlocked. We must be careful to minimize the use + * of emergency objects which are more expensive to track. Therefore, + * we set a very long timeout for the asynchronous allocation and if + * the timeout is reached the cache is flagged as deadlocked. From + * this point only new emergency objects will be allocated until the + * asynchronous allocation completes and clears the deadlocked flag. + */ + if (test_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags)) { + rc = spl_emergency_alloc(skc, flags, obj); + } else { + remaining = wait_event_timeout(skc->skc_waitq, + spl_cache_grow_wait(skc), HZ / 10); + + if (!remaining) { + spin_lock(&skc->skc_lock); + if (test_bit(KMC_BIT_GROWING, &skc->skc_flags)) { + set_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags); + skc->skc_obj_deadlock++; + } + spin_unlock(&skc->skc_lock); + } + + rc = -ENOMEM; + } + + return (rc); +} + +/* + * Refill a per-cpu magazine with objects from the slabs for this cache. + * Ideally the magazine can be repopulated using existing objects which have + * been released, however if we are unable to locate enough free objects new + * slabs of objects will be created. On success NULL is returned, otherwise + * the address of a single emergency object is returned for use by the caller. 
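+ *
+ * This is called from spl_kmem_cache_alloc() with local interrupts
+ * disabled; when the cache must grow, the lock is dropped and interrupts
+ * are briefly re-enabled around spl_cache_grow() so the slab allocation
+ * is allowed to sleep.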
+ */ +static void * +spl_cache_refill(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flags) +{ + spl_kmem_slab_t *sks; + int count = 0, rc, refill; + void *obj = NULL; + + ASSERT(skc->skc_magic == SKC_MAGIC); + ASSERT(skm->skm_magic == SKM_MAGIC); + + refill = MIN(skm->skm_refill, skm->skm_size - skm->skm_avail); + spin_lock(&skc->skc_lock); + + while (refill > 0) { + /* No slabs available we may need to grow the cache */ + if (list_empty(&skc->skc_partial_list)) { + spin_unlock(&skc->skc_lock); + + local_irq_enable(); + rc = spl_cache_grow(skc, flags, &obj); + local_irq_disable(); + + /* Emergency object for immediate use by caller */ + if (rc == 0 && obj != NULL) + return (obj); + + if (rc) + goto out; + + /* Rescheduled to different CPU skm is not local */ + if (skm != skc->skc_mag[smp_processor_id()]) + goto out; + + /* + * Potentially rescheduled to the same CPU but + * allocations may have occurred from this CPU while + * we were sleeping so recalculate max refill. + */ + refill = MIN(refill, skm->skm_size - skm->skm_avail); + + spin_lock(&skc->skc_lock); + continue; + } + + /* Grab the next available slab */ + sks = list_entry((&skc->skc_partial_list)->next, + spl_kmem_slab_t, sks_list); + ASSERT(sks->sks_magic == SKS_MAGIC); + ASSERT(sks->sks_ref < sks->sks_objs); + ASSERT(!list_empty(&sks->sks_free_list)); + + /* + * Consume as many objects as needed to refill the requested + * cache. We must also be careful not to overfill it. + */ + while (sks->sks_ref < sks->sks_objs && refill-- > 0 && + ++count) { + ASSERT(skm->skm_avail < skm->skm_size); + ASSERT(count < skm->skm_size); + skm->skm_objs[skm->skm_avail++] = + spl_cache_obj(skc, sks); + } + + /* Move slab to skc_complete_list when full */ + if (sks->sks_ref == sks->sks_objs) { + list_del(&sks->sks_list); + list_add(&sks->sks_list, &skc->skc_complete_list); + } + } + + spin_unlock(&skc->skc_lock); +out: + return (NULL); +} + +/* + * Release an object back to the slab from which it came. + */ +static void +spl_cache_shrink(spl_kmem_cache_t *skc, void *obj) +{ + spl_kmem_slab_t *sks = NULL; + spl_kmem_obj_t *sko = NULL; + + ASSERT(skc->skc_magic == SKC_MAGIC); + + sko = spl_sko_from_obj(skc, obj); + ASSERT(sko->sko_magic == SKO_MAGIC); + sks = sko->sko_slab; + ASSERT(sks->sks_magic == SKS_MAGIC); + ASSERT(sks->sks_cache == skc); + list_add(&sko->sko_list, &sks->sks_free_list); + + sks->sks_age = jiffies; + sks->sks_ref--; + skc->skc_obj_alloc--; + + /* + * Move slab to skc_partial_list when no longer full. Slabs + * are added to the head to keep the partial list is quasi-full + * sorted order. Fuller at the head, emptier at the tail. + */ + if (sks->sks_ref == (sks->sks_objs - 1)) { + list_del(&sks->sks_list); + list_add(&sks->sks_list, &skc->skc_partial_list); + } + + /* + * Move empty slabs to the end of the partial list so + * they can be easily found and freed during reclamation. + */ + if (sks->sks_ref == 0) { + list_del(&sks->sks_list); + list_add_tail(&sks->sks_list, &skc->skc_partial_list); + skc->skc_slab_alloc--; + } +} + +/* + * Allocate an object from the per-cpu magazine, or if the magazine + * is empty directly allocate from a slab and repopulate the magazine. + */ +void * +spl_kmem_cache_alloc(spl_kmem_cache_t *skc, int flags) +{ + spl_kmem_magazine_t *skm; + void *obj = NULL; + + ASSERT0(flags & ~KM_PUBLIC_MASK); + ASSERT(skc->skc_magic == SKC_MAGIC); + ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags)); + + /* + * Allocate directly from a Linux slab. 
All optimizations are left + * to the underlying cache we only need to guarantee that KM_SLEEP + * callers will never fail. + */ + if (skc->skc_flags & KMC_SLAB) { + struct kmem_cache *slc = skc->skc_linux_cache; + do { + obj = kmem_cache_alloc(slc, kmem_flags_convert(flags)); + } while ((obj == NULL) && !(flags & KM_NOSLEEP)); + + goto ret; + } + + local_irq_disable(); + +restart: + /* + * Safe to update per-cpu structure without lock, but + * in the restart case we must be careful to reacquire + * the local magazine since this may have changed + * when we need to grow the cache. + */ + skm = skc->skc_mag[smp_processor_id()]; + ASSERT(skm->skm_magic == SKM_MAGIC); + + if (likely(skm->skm_avail)) { + /* Object available in CPU cache, use it */ + obj = skm->skm_objs[--skm->skm_avail]; + skm->skm_age = jiffies; + } else { + obj = spl_cache_refill(skc, skm, flags); + if ((obj == NULL) && !(flags & KM_NOSLEEP)) + goto restart; + + local_irq_enable(); + goto ret; + } + + local_irq_enable(); + ASSERT(obj); + ASSERT(IS_P2ALIGNED(obj, skc->skc_obj_align)); + +ret: + /* Pre-emptively migrate object to CPU L1 cache */ + if (obj) { + if (obj && skc->skc_ctor) + skc->skc_ctor(obj, skc->skc_private, flags); + else + prefetchw(obj); + } + + return (obj); +} +EXPORT_SYMBOL(spl_kmem_cache_alloc); + +/* + * Free an object back to the local per-cpu magazine, there is no + * guarantee that this is the same magazine the object was originally + * allocated from. We may need to flush entire from the magazine + * back to the slabs to make space. + */ +void +spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj) +{ + spl_kmem_magazine_t *skm; + unsigned long flags; + int do_reclaim = 0; + int do_emergency = 0; + + ASSERT(skc->skc_magic == SKC_MAGIC); + ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags)); + + /* + * Run the destructor + */ + if (skc->skc_dtor) + skc->skc_dtor(obj, skc->skc_private); + + /* + * Free the object from the Linux underlying Linux slab. + */ + if (skc->skc_flags & KMC_SLAB) { + kmem_cache_free(skc->skc_linux_cache, obj); + return; + } + + /* + * While a cache has outstanding emergency objects all freed objects + * must be checked. However, since emergency objects will never use + * a virtual address these objects can be safely excluded as an + * optimization. + */ + if (!is_vmalloc_addr(obj)) { + spin_lock(&skc->skc_lock); + do_emergency = (skc->skc_obj_emergency > 0); + spin_unlock(&skc->skc_lock); + + if (do_emergency && (spl_emergency_free(skc, obj) == 0)) + return; + } + + local_irq_save(flags); + + /* + * Safe to update per-cpu structure without lock, but + * no remote memory allocation tracking is being performed + * it is entirely possible to allocate an object from one + * CPU cache and return it to another. + */ + skm = skc->skc_mag[smp_processor_id()]; + ASSERT(skm->skm_magic == SKM_MAGIC); + + /* + * Per-CPU cache full, flush it to make space for this object, + * this may result in an empty slab which can be reclaimed once + * interrupts are re-enabled. + */ + if (unlikely(skm->skm_avail >= skm->skm_size)) { + spl_cache_flush(skc, skm, skm->skm_refill); + do_reclaim = 1; + } + + /* Available space in cache, use it */ + skm->skm_objs[skm->skm_avail++] = obj; + + local_irq_restore(flags); + + if (do_reclaim) + spl_slab_reclaim(skc); +} +EXPORT_SYMBOL(spl_kmem_cache_free); + +/* + * The generic shrinker function for all caches. Under Linux a shrinker + * may not be tightly coupled with a slab cache. 
In fact Linux always + * systematically tries calling all registered shrinker callbacks which + * report that they contain unused objects. Because of this we only + * register one shrinker function in the shim layer for all slab caches. + * We always attempt to shrink all caches when this generic shrinker + * is called. + * + * If sc->nr_to_scan is zero, the caller is requesting a query of the + * number of objects which can potentially be freed. If it is nonzero, + * the request is to free that many objects. + * + * Linux kernels >= 3.12 have the count_objects and scan_objects callbacks + * in struct shrinker and also require the shrinker to return the number + * of objects freed. + * + * Older kernels require the shrinker to return the number of freeable + * objects following the freeing of nr_to_free. + * + * Linux semantics differ from those under Solaris, which are to + * free all available objects which may (and probably will) be more + * objects than the requested nr_to_scan. + */ +static spl_shrinker_t +__spl_kmem_cache_generic_shrinker(struct shrinker *shrink, + struct shrink_control *sc) +{ + spl_kmem_cache_t *skc; + int alloc = 0; + + /* + * No shrinking in a transaction context. Can cause deadlocks. + */ + if (sc->nr_to_scan && spl_fstrans_check()) + return (SHRINK_STOP); + + down_read(&spl_kmem_cache_sem); + list_for_each_entry(skc, &spl_kmem_cache_list, skc_list) { + if (sc->nr_to_scan) { +#ifdef HAVE_SPLIT_SHRINKER_CALLBACK + uint64_t oldalloc = skc->skc_obj_alloc; + spl_kmem_cache_reap_now(skc, + MAX(sc->nr_to_scan>>fls64(skc->skc_slab_objs), 1)); + if (oldalloc > skc->skc_obj_alloc) + alloc += oldalloc - skc->skc_obj_alloc; +#else + spl_kmem_cache_reap_now(skc, + MAX(sc->nr_to_scan>>fls64(skc->skc_slab_objs), 1)); + alloc += skc->skc_obj_alloc; +#endif /* HAVE_SPLIT_SHRINKER_CALLBACK */ + } else { + /* Request to query number of freeable objects */ + alloc += skc->skc_obj_alloc; + } + } + up_read(&spl_kmem_cache_sem); + + /* + * When KMC_RECLAIM_ONCE is set allow only a single reclaim pass. + * This functionality only exists to work around a rare issue where + * shrink_slabs() is repeatedly invoked by many cores causing the + * system to thrash. + */ + if ((spl_kmem_cache_reclaim & KMC_RECLAIM_ONCE) && sc->nr_to_scan) + return (SHRINK_STOP); + + return (MAX(alloc, 0)); +} + +SPL_SHRINKER_CALLBACK_WRAPPER(spl_kmem_cache_generic_shrinker); + +/* + * Call the registered reclaim function for a cache. Depending on how + * many and which objects are released it may simply repopulate the + * local magazine which will then need to age-out. Objects which cannot + * fit in the magazine we will be released back to their slabs which will + * also need to age out before being release. This is all just best + * effort and we do not want to thrash creating and destroying slabs. + */ +void +spl_kmem_cache_reap_now(spl_kmem_cache_t *skc, int count) +{ + ASSERT(skc->skc_magic == SKC_MAGIC); + ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags)); + + atomic_inc(&skc->skc_ref); + + /* + * Execute the registered reclaim callback if it exists. + */ + if (skc->skc_flags & KMC_SLAB) { + if (skc->skc_reclaim) + skc->skc_reclaim(skc->skc_private); + goto out; + } + + /* + * Prevent concurrent cache reaping when contended. + */ + if (test_and_set_bit(KMC_BIT_REAPING, &skc->skc_flags)) + goto out; + + /* + * When a reclaim function is available it may be invoked repeatedly + * until at least a single slab can be freed. This ensures that we + * do free memory back to the system. 
This helps minimize the chance + * of an OOM event when the bulk of memory is used by the slab. + * + * When free slabs are already available the reclaim callback will be + * skipped. Additionally, if no forward progress is detected despite + * a reclaim function the cache will be skipped to avoid deadlock. + * + * Longer term this would be the correct place to add the code which + * repacks the slabs in order minimize fragmentation. + */ + if (skc->skc_reclaim) { + uint64_t objects = UINT64_MAX; + int do_reclaim; + + do { + spin_lock(&skc->skc_lock); + do_reclaim = + (skc->skc_slab_total > 0) && + ((skc->skc_slab_total-skc->skc_slab_alloc) == 0) && + (skc->skc_obj_alloc < objects); + + objects = skc->skc_obj_alloc; + spin_unlock(&skc->skc_lock); + + if (do_reclaim) + skc->skc_reclaim(skc->skc_private); + + } while (do_reclaim); + } + + /* Reclaim from the magazine and free all now empty slabs. */ + if (spl_kmem_cache_expire & KMC_EXPIRE_MEM) { + spl_kmem_magazine_t *skm; + unsigned long irq_flags; + + local_irq_save(irq_flags); + skm = skc->skc_mag[smp_processor_id()]; + spl_cache_flush(skc, skm, skm->skm_avail); + local_irq_restore(irq_flags); + } + + spl_slab_reclaim(skc); + clear_bit_unlock(KMC_BIT_REAPING, &skc->skc_flags); + smp_mb__after_atomic(); + wake_up_bit(&skc->skc_flags, KMC_BIT_REAPING); +out: + atomic_dec(&skc->skc_ref); +} +EXPORT_SYMBOL(spl_kmem_cache_reap_now); + +/* + * This is stubbed out for code consistency with other platforms. There + * is existing logic to prevent concurrent reaping so while this is ugly + * it should do no harm. + */ +int +spl_kmem_cache_reap_active() +{ + return (0); +} +EXPORT_SYMBOL(spl_kmem_cache_reap_active); + +/* + * Reap all free slabs from all registered caches. + */ +void +spl_kmem_reap(void) +{ + struct shrink_control sc; + + sc.nr_to_scan = KMC_REAP_CHUNK; + sc.gfp_mask = GFP_KERNEL; + + (void) __spl_kmem_cache_generic_shrinker(NULL, &sc); +} +EXPORT_SYMBOL(spl_kmem_reap); + +int +spl_kmem_cache_init(void) +{ + init_rwsem(&spl_kmem_cache_sem); + INIT_LIST_HEAD(&spl_kmem_cache_list); + spl_kmem_cache_taskq = taskq_create("spl_kmem_cache", + spl_kmem_cache_kmem_threads, maxclsyspri, + spl_kmem_cache_kmem_threads * 8, INT_MAX, + TASKQ_PREPOPULATE | TASKQ_DYNAMIC); + spl_register_shrinker(&spl_kmem_cache_shrinker); + + return (0); +} + +void +spl_kmem_cache_fini(void) +{ + spl_unregister_shrinker(&spl_kmem_cache_shrinker); + taskq_destroy(spl_kmem_cache_taskq); +} diff --git a/module/os/linux/spl/spl-kmem.c b/module/os/linux/spl/spl-kmem.c new file mode 100644 index 000000000..824b5e89f --- /dev/null +++ b/module/os/linux/spl/spl-kmem.c @@ -0,0 +1,556 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf <[email protected]>. + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see <http://zfsonlinux.org/>. + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. 
+ * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <sys/debug.h> +#include <sys/sysmacros.h> +#include <sys/kmem.h> +#include <sys/vmem.h> +#include <linux/mm.h> + +/* + * As a general rule kmem_alloc() allocations should be small, preferably + * just a few pages since they must by physically contiguous. Therefore, a + * rate limited warning will be printed to the console for any kmem_alloc() + * which exceeds a reasonable threshold. + * + * The default warning threshold is set to sixteen pages but capped at 64K to + * accommodate systems using large pages. This value was selected to be small + * enough to ensure the largest allocations are quickly noticed and fixed. + * But large enough to avoid logging any warnings when a allocation size is + * larger than optimal but not a serious concern. Since this value is tunable, + * developers are encouraged to set it lower when testing so any new largish + * allocations are quickly caught. These warnings may be disabled by setting + * the threshold to zero. + */ +/* BEGIN CSTYLED */ +unsigned int spl_kmem_alloc_warn = MIN(16 * PAGE_SIZE, 64 * 1024); +module_param(spl_kmem_alloc_warn, uint, 0644); +MODULE_PARM_DESC(spl_kmem_alloc_warn, + "Warning threshold in bytes for a kmem_alloc()"); +EXPORT_SYMBOL(spl_kmem_alloc_warn); + +/* + * Large kmem_alloc() allocations will fail if they exceed KMALLOC_MAX_SIZE. + * Allocations which are marginally smaller than this limit may succeed but + * should still be avoided due to the expense of locating a contiguous range + * of free pages. Therefore, a maximum kmem size with reasonable safely + * margin of 4x is set. Kmem_alloc() allocations larger than this maximum + * will quickly fail. Vmem_alloc() allocations less than or equal to this + * value will use kmalloc(), but shift to vmalloc() when exceeding this value. + */ +unsigned int spl_kmem_alloc_max = (KMALLOC_MAX_SIZE >> 2); +module_param(spl_kmem_alloc_max, uint, 0644); +MODULE_PARM_DESC(spl_kmem_alloc_max, + "Maximum size in bytes for a kmem_alloc()"); +EXPORT_SYMBOL(spl_kmem_alloc_max); +/* END CSTYLED */ + +int +kmem_debugging(void) +{ + return (0); +} +EXPORT_SYMBOL(kmem_debugging); + +char * +kmem_vasprintf(const char *fmt, va_list ap) +{ + va_list aq; + char *ptr; + + do { + va_copy(aq, ap); + ptr = kvasprintf(kmem_flags_convert(KM_SLEEP), fmt, aq); + va_end(aq); + } while (ptr == NULL); + + return (ptr); +} +EXPORT_SYMBOL(kmem_vasprintf); + +char * +kmem_asprintf(const char *fmt, ...) +{ + va_list ap; + char *ptr; + + do { + va_start(ap, fmt); + ptr = kvasprintf(kmem_flags_convert(KM_SLEEP), fmt, ap); + va_end(ap); + } while (ptr == NULL); + + return (ptr); +} +EXPORT_SYMBOL(kmem_asprintf); + +static char * +__strdup(const char *str, int flags) +{ + char *ptr; + int n; + + n = strlen(str); + ptr = kmalloc(n + 1, kmem_flags_convert(flags)); + if (ptr) + memcpy(ptr, str, n + 1); + + return (ptr); +} + +char * +strdup(const char *str) +{ + return (__strdup(str, KM_SLEEP)); +} +EXPORT_SYMBOL(strdup); + +void +strfree(char *str) +{ + kfree(str); +} +EXPORT_SYMBOL(strfree); + +/* + * General purpose unified implementation of kmem_alloc(). It is an + * amalgamation of Linux and Illumos allocator design. It should never be + * exported to ensure that code using kmem_alloc()/kmem_zalloc() remains + * relatively portable. Consumers may only access this function through + * wrappers that enforce the common flags to ensure portability. 
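+ *
+ * A minimal consumer-side sketch using the public wrappers rather than
+ * this function directly (my_data_t is a hypothetical caller type):
+ *
+ *   my_data_t *p = kmem_zalloc(sizeof (*p), KM_SLEEP);
+ *   ...
+ *   kmem_free(p, sizeof (*p));
+ *
+ * As on Illumos the size passed to kmem_free() must match the size
+ * originally allocated.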
+ */ +inline void * +spl_kmem_alloc_impl(size_t size, int flags, int node) +{ + gfp_t lflags = kmem_flags_convert(flags); + int use_vmem = 0; + void *ptr; + + /* + * Log abnormally large allocations and rate limit the console output. + * Allocations larger than spl_kmem_alloc_warn should be performed + * through the vmem_alloc()/vmem_zalloc() interfaces. + */ + if ((spl_kmem_alloc_warn > 0) && (size > spl_kmem_alloc_warn) && + !(flags & KM_VMEM)) { + printk(KERN_WARNING + "Large kmem_alloc(%lu, 0x%x), please file an issue at:\n" + "https://github.com/zfsonlinux/zfs/issues/new\n", + (unsigned long)size, flags); + dump_stack(); + } + + /* + * Use a loop because kmalloc_node() can fail when GFP_KERNEL is used + * unlike kmem_alloc() with KM_SLEEP on Illumos. + */ + do { + /* + * Calling kmalloc_node() when the size >= spl_kmem_alloc_max + * is unsafe. This must fail for all for kmem_alloc() and + * kmem_zalloc() callers. + * + * For vmem_alloc() and vmem_zalloc() callers it is permissible + * to use __vmalloc(). However, in general use of __vmalloc() + * is strongly discouraged because a global lock must be + * acquired. Contention on this lock can significantly + * impact performance so frequently manipulating the virtual + * address space is strongly discouraged. + */ + if ((size > spl_kmem_alloc_max) || use_vmem) { + if (flags & KM_VMEM) { + ptr = __vmalloc(size, lflags | __GFP_HIGHMEM, + PAGE_KERNEL); + } else { + return (NULL); + } + } else { + ptr = kmalloc_node(size, lflags, node); + } + + if (likely(ptr) || (flags & KM_NOSLEEP)) + return (ptr); + + /* + * For vmem_alloc() and vmem_zalloc() callers retry immediately + * using __vmalloc() which is unlikely to fail. + */ + if ((flags & KM_VMEM) && (use_vmem == 0)) { + use_vmem = 1; + continue; + } + + /* + * Use cond_resched() instead of congestion_wait() to avoid + * deadlocking systems where there are no block devices. + */ + cond_resched(); + } while (1); + + return (NULL); +} + +inline void +spl_kmem_free_impl(const void *buf, size_t size) +{ + if (is_vmalloc_addr(buf)) + vfree(buf); + else + kfree(buf); +} + +/* + * Memory allocation and accounting for kmem_* * style allocations. When + * DEBUG_KMEM is enabled the total memory allocated will be tracked and + * any memory leaked will be reported during module unload. + * + * ./configure --enable-debug-kmem + */ +#ifdef DEBUG_KMEM + +/* Shim layer memory accounting */ +#ifdef HAVE_ATOMIC64_T +atomic64_t kmem_alloc_used = ATOMIC64_INIT(0); +unsigned long long kmem_alloc_max = 0; +#else /* HAVE_ATOMIC64_T */ +atomic_t kmem_alloc_used = ATOMIC_INIT(0); +unsigned long long kmem_alloc_max = 0; +#endif /* HAVE_ATOMIC64_T */ + +EXPORT_SYMBOL(kmem_alloc_used); +EXPORT_SYMBOL(kmem_alloc_max); + +inline void * +spl_kmem_alloc_debug(size_t size, int flags, int node) +{ + void *ptr; + + ptr = spl_kmem_alloc_impl(size, flags, node); + if (ptr) { + kmem_alloc_used_add(size); + if (unlikely(kmem_alloc_used_read() > kmem_alloc_max)) + kmem_alloc_max = kmem_alloc_used_read(); + } + + return (ptr); +} + +inline void +spl_kmem_free_debug(const void *ptr, size_t size) +{ + kmem_alloc_used_sub(size); + spl_kmem_free_impl(ptr, size); +} + +/* + * When DEBUG_KMEM_TRACKING is enabled not only will total bytes be tracked + * but also the location of every alloc and free. When the SPL module is + * unloaded a list of all leaked addresses and where they were allocated + * will be dumped to the console. 
Enabling this feature has a significant + * impact on performance but it makes finding memory leaks straight forward. + * + * Not surprisingly with debugging enabled the xmem_locks are very highly + * contended particularly on xfree(). If we want to run with this detailed + * debugging enabled for anything other than debugging we need to minimize + * the contention by moving to a lock per xmem_table entry model. + * + * ./configure --enable-debug-kmem-tracking + */ +#ifdef DEBUG_KMEM_TRACKING + +#include <linux/hash.h> +#include <linux/ctype.h> + +#define KMEM_HASH_BITS 10 +#define KMEM_TABLE_SIZE (1 << KMEM_HASH_BITS) + +typedef struct kmem_debug { + struct hlist_node kd_hlist; /* Hash node linkage */ + struct list_head kd_list; /* List of all allocations */ + void *kd_addr; /* Allocation pointer */ + size_t kd_size; /* Allocation size */ + const char *kd_func; /* Allocation function */ + int kd_line; /* Allocation line */ +} kmem_debug_t; + +static spinlock_t kmem_lock; +static struct hlist_head kmem_table[KMEM_TABLE_SIZE]; +static struct list_head kmem_list; + +static kmem_debug_t * +kmem_del_init(spinlock_t *lock, struct hlist_head *table, + int bits, const void *addr) +{ + struct hlist_head *head; + struct hlist_node *node; + struct kmem_debug *p; + unsigned long flags; + + spin_lock_irqsave(lock, flags); + + head = &table[hash_ptr((void *)addr, bits)]; + hlist_for_each(node, head) { + p = list_entry(node, struct kmem_debug, kd_hlist); + if (p->kd_addr == addr) { + hlist_del_init(&p->kd_hlist); + list_del_init(&p->kd_list); + spin_unlock_irqrestore(lock, flags); + return (p); + } + } + + spin_unlock_irqrestore(lock, flags); + + return (NULL); +} + +inline void * +spl_kmem_alloc_track(size_t size, int flags, + const char *func, int line, int node) +{ + void *ptr = NULL; + kmem_debug_t *dptr; + unsigned long irq_flags; + + dptr = kmalloc(sizeof (kmem_debug_t), kmem_flags_convert(flags)); + if (dptr == NULL) + return (NULL); + + dptr->kd_func = __strdup(func, flags); + if (dptr->kd_func == NULL) { + kfree(dptr); + return (NULL); + } + + ptr = spl_kmem_alloc_debug(size, flags, node); + if (ptr == NULL) { + kfree(dptr->kd_func); + kfree(dptr); + return (NULL); + } + + INIT_HLIST_NODE(&dptr->kd_hlist); + INIT_LIST_HEAD(&dptr->kd_list); + + dptr->kd_addr = ptr; + dptr->kd_size = size; + dptr->kd_line = line; + + spin_lock_irqsave(&kmem_lock, irq_flags); + hlist_add_head(&dptr->kd_hlist, + &kmem_table[hash_ptr(ptr, KMEM_HASH_BITS)]); + list_add_tail(&dptr->kd_list, &kmem_list); + spin_unlock_irqrestore(&kmem_lock, irq_flags); + + return (ptr); +} + +inline void +spl_kmem_free_track(const void *ptr, size_t size) +{ + kmem_debug_t *dptr; + + /* Ignore NULL pointer since we haven't tracked it at all */ + if (ptr == NULL) + return; + + /* Must exist in hash due to kmem_alloc() */ + dptr = kmem_del_init(&kmem_lock, kmem_table, KMEM_HASH_BITS, ptr); + ASSERT3P(dptr, !=, NULL); + ASSERT3S(dptr->kd_size, ==, size); + + kfree(dptr->kd_func); + kfree(dptr); + + spl_kmem_free_debug(ptr, size); +} +#endif /* DEBUG_KMEM_TRACKING */ +#endif /* DEBUG_KMEM */ + +/* + * Public kmem_alloc(), kmem_zalloc() and kmem_free() interfaces. 
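+ *
+ * A minimal usage sketch (illustrative only, foo_t is hypothetical):
+ *
+ *   foo_t *fp = kmem_zalloc(sizeof (foo_t), KM_SLEEP);
+ *   ...
+ *   kmem_free(fp, sizeof (foo_t));
+ *
+ * Unlike kfree(), kmem_free() takes the original allocation size so the
+ * DEBUG_KMEM accounting above can remain accurate.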
+ */ +void * +spl_kmem_alloc(size_t size, int flags, const char *func, int line) +{ + ASSERT0(flags & ~KM_PUBLIC_MASK); + +#if !defined(DEBUG_KMEM) + return (spl_kmem_alloc_impl(size, flags, NUMA_NO_NODE)); +#elif !defined(DEBUG_KMEM_TRACKING) + return (spl_kmem_alloc_debug(size, flags, NUMA_NO_NODE)); +#else + return (spl_kmem_alloc_track(size, flags, func, line, NUMA_NO_NODE)); +#endif +} +EXPORT_SYMBOL(spl_kmem_alloc); + +void * +spl_kmem_zalloc(size_t size, int flags, const char *func, int line) +{ + ASSERT0(flags & ~KM_PUBLIC_MASK); + + flags |= KM_ZERO; + +#if !defined(DEBUG_KMEM) + return (spl_kmem_alloc_impl(size, flags, NUMA_NO_NODE)); +#elif !defined(DEBUG_KMEM_TRACKING) + return (spl_kmem_alloc_debug(size, flags, NUMA_NO_NODE)); +#else + return (spl_kmem_alloc_track(size, flags, func, line, NUMA_NO_NODE)); +#endif +} +EXPORT_SYMBOL(spl_kmem_zalloc); + +void +spl_kmem_free(const void *buf, size_t size) +{ +#if !defined(DEBUG_KMEM) + return (spl_kmem_free_impl(buf, size)); +#elif !defined(DEBUG_KMEM_TRACKING) + return (spl_kmem_free_debug(buf, size)); +#else + return (spl_kmem_free_track(buf, size)); +#endif +} +EXPORT_SYMBOL(spl_kmem_free); + +#if defined(DEBUG_KMEM) && defined(DEBUG_KMEM_TRACKING) +static char * +spl_sprintf_addr(kmem_debug_t *kd, char *str, int len, int min) +{ + int size = ((len - 1) < kd->kd_size) ? (len - 1) : kd->kd_size; + int i, flag = 1; + + ASSERT(str != NULL && len >= 17); + memset(str, 0, len); + + /* + * Check for a fully printable string, and while we are at + * it place the printable characters in the passed buffer. + */ + for (i = 0; i < size; i++) { + str[i] = ((char *)(kd->kd_addr))[i]; + if (isprint(str[i])) { + continue; + } else { + /* + * Minimum number of printable characters found + * to make it worthwhile to print this as ascii. + */ + if (i > min) + break; + + flag = 0; + break; + } + } + + if (!flag) { + sprintf(str, "%02x%02x%02x%02x%02x%02x%02x%02x", + *((uint8_t *)kd->kd_addr), + *((uint8_t *)kd->kd_addr + 2), + *((uint8_t *)kd->kd_addr + 4), + *((uint8_t *)kd->kd_addr + 6), + *((uint8_t *)kd->kd_addr + 8), + *((uint8_t *)kd->kd_addr + 10), + *((uint8_t *)kd->kd_addr + 12), + *((uint8_t *)kd->kd_addr + 14)); + } + + return (str); +} + +static int +spl_kmem_init_tracking(struct list_head *list, spinlock_t *lock, int size) +{ + int i; + + spin_lock_init(lock); + INIT_LIST_HEAD(list); + + for (i = 0; i < size; i++) + INIT_HLIST_HEAD(&kmem_table[i]); + + return (0); +} + +static void +spl_kmem_fini_tracking(struct list_head *list, spinlock_t *lock) +{ + unsigned long flags; + kmem_debug_t *kd; + char str[17]; + + spin_lock_irqsave(lock, flags); + if (!list_empty(list)) + printk(KERN_WARNING "%-16s %-5s %-16s %s:%s\n", "address", + "size", "data", "func", "line"); + + list_for_each_entry(kd, list, kd_list) { + printk(KERN_WARNING "%p %-5d %-16s %s:%d\n", kd->kd_addr, + (int)kd->kd_size, spl_sprintf_addr(kd, str, 17, 8), + kd->kd_func, kd->kd_line); + } + + spin_unlock_irqrestore(lock, flags); +} +#endif /* DEBUG_KMEM && DEBUG_KMEM_TRACKING */ + +int +spl_kmem_init(void) +{ + +#ifdef DEBUG_KMEM + kmem_alloc_used_set(0); + + + +#ifdef DEBUG_KMEM_TRACKING + spl_kmem_init_tracking(&kmem_list, &kmem_lock, KMEM_TABLE_SIZE); +#endif /* DEBUG_KMEM_TRACKING */ +#endif /* DEBUG_KMEM */ + + return (0); +} + +void +spl_kmem_fini(void) +{ +#ifdef DEBUG_KMEM + /* + * Display all unreclaimed memory addresses, including the + * allocation size and the first few bytes of what's located + * at that address to aid in debugging. 
Performance is not + * a serious concern here since it is module unload time. + */ + if (kmem_alloc_used_read() != 0) + printk(KERN_WARNING "kmem leaked %ld/%llu bytes\n", + (unsigned long)kmem_alloc_used_read(), kmem_alloc_max); + +#ifdef DEBUG_KMEM_TRACKING + spl_kmem_fini_tracking(&kmem_list, &kmem_lock); +#endif /* DEBUG_KMEM_TRACKING */ +#endif /* DEBUG_KMEM */ +} diff --git a/module/os/linux/spl/spl-kobj.c b/module/os/linux/spl/spl-kobj.c new file mode 100644 index 000000000..7019369bd --- /dev/null +++ b/module/os/linux/spl/spl-kobj.c @@ -0,0 +1,86 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf <[email protected]>. + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see <http://zfsonlinux.org/>. + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see <http://www.gnu.org/licenses/>. + * + * Solaris Porting Layer (SPL) Kobj Implementation. + */ + +#include <sys/kobj.h> + +struct _buf * +kobj_open_file(const char *name) +{ + struct _buf *file; + vnode_t *vp; + int rc; + + file = kmalloc(sizeof (_buf_t), kmem_flags_convert(KM_SLEEP)); + if (file == NULL) + return ((_buf_t *)-1UL); + + if ((rc = vn_open(name, UIO_SYSSPACE, FREAD, 0644, &vp, 0, 0))) { + kfree(file); + return ((_buf_t *)-1UL); + } + + file->vp = vp; + + return (file); +} /* kobj_open_file() */ +EXPORT_SYMBOL(kobj_open_file); + +void +kobj_close_file(struct _buf *file) +{ + VOP_CLOSE(file->vp, 0, 0, 0, 0, 0); + kfree(file); +} /* kobj_close_file() */ +EXPORT_SYMBOL(kobj_close_file); + +int +kobj_read_file(struct _buf *file, char *buf, unsigned size, unsigned off) +{ + ssize_t resid; + + if (vn_rdwr(UIO_READ, file->vp, buf, size, (offset_t)off, + UIO_SYSSPACE, 0, 0, 0, &resid) != 0) + return (-1); + + return (size - resid); +} /* kobj_read_file() */ +EXPORT_SYMBOL(kobj_read_file); + +int +kobj_get_filesize(struct _buf *file, uint64_t *size) +{ + vattr_t vap; + int rc; + + rc = VOP_GETATTR(file->vp, &vap, 0, 0, NULL); + if (rc) + return (rc); + + *size = vap.va_size; + + return (rc); +} /* kobj_get_filesize() */ +EXPORT_SYMBOL(kobj_get_filesize); diff --git a/module/os/linux/spl/spl-kstat.c b/module/os/linux/spl/spl-kstat.c new file mode 100644 index 000000000..1f67bf157 --- /dev/null +++ b/module/os/linux/spl/spl-kstat.c @@ -0,0 +1,770 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf <[email protected]>. + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see <http://zfsonlinux.org/>. 
+ * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see <http://www.gnu.org/licenses/>. + * + * Solaris Porting Layer (SPL) Kstat Implementation. + */ + +#include <linux/seq_file.h> +#include <sys/kstat.h> +#include <sys/vmem.h> +#include <sys/cmn_err.h> +#include <sys/sysmacros.h> + +static kmutex_t kstat_module_lock; +static struct list_head kstat_module_list; +static kid_t kstat_id; + +static int +kstat_resize_raw(kstat_t *ksp) +{ + if (ksp->ks_raw_bufsize == KSTAT_RAW_MAX) + return (ENOMEM); + + vmem_free(ksp->ks_raw_buf, ksp->ks_raw_bufsize); + ksp->ks_raw_bufsize = MIN(ksp->ks_raw_bufsize * 2, KSTAT_RAW_MAX); + ksp->ks_raw_buf = vmem_alloc(ksp->ks_raw_bufsize, KM_SLEEP); + + return (0); +} + +void +kstat_waitq_enter(kstat_io_t *kiop) +{ + hrtime_t new, delta; + ulong_t wcnt; + + new = gethrtime(); + delta = new - kiop->wlastupdate; + kiop->wlastupdate = new; + wcnt = kiop->wcnt++; + if (wcnt != 0) { + kiop->wlentime += delta * wcnt; + kiop->wtime += delta; + } +} +EXPORT_SYMBOL(kstat_waitq_enter); + +void +kstat_waitq_exit(kstat_io_t *kiop) +{ + hrtime_t new, delta; + ulong_t wcnt; + + new = gethrtime(); + delta = new - kiop->wlastupdate; + kiop->wlastupdate = new; + wcnt = kiop->wcnt--; + ASSERT((int)wcnt > 0); + kiop->wlentime += delta * wcnt; + kiop->wtime += delta; +} +EXPORT_SYMBOL(kstat_waitq_exit); + +void +kstat_runq_enter(kstat_io_t *kiop) +{ + hrtime_t new, delta; + ulong_t rcnt; + + new = gethrtime(); + delta = new - kiop->rlastupdate; + kiop->rlastupdate = new; + rcnt = kiop->rcnt++; + if (rcnt != 0) { + kiop->rlentime += delta * rcnt; + kiop->rtime += delta; + } +} +EXPORT_SYMBOL(kstat_runq_enter); + +void +kstat_runq_exit(kstat_io_t *kiop) +{ + hrtime_t new, delta; + ulong_t rcnt; + + new = gethrtime(); + delta = new - kiop->rlastupdate; + kiop->rlastupdate = new; + rcnt = kiop->rcnt--; + ASSERT((int)rcnt > 0); + kiop->rlentime += delta * rcnt; + kiop->rtime += delta; +} +EXPORT_SYMBOL(kstat_runq_exit); + +static int +kstat_seq_show_headers(struct seq_file *f) +{ + kstat_t *ksp = (kstat_t *)f->private; + int rc = 0; + + ASSERT(ksp->ks_magic == KS_MAGIC); + + seq_printf(f, "%d %d 0x%02x %d %d %lld %lld\n", + ksp->ks_kid, ksp->ks_type, ksp->ks_flags, + ksp->ks_ndata, (int)ksp->ks_data_size, + ksp->ks_crtime, ksp->ks_snaptime); + + switch (ksp->ks_type) { + case KSTAT_TYPE_RAW: +restart: + if (ksp->ks_raw_ops.headers) { + rc = ksp->ks_raw_ops.headers( + ksp->ks_raw_buf, ksp->ks_raw_bufsize); + if (rc == ENOMEM && !kstat_resize_raw(ksp)) + goto restart; + if (!rc) + seq_puts(f, ksp->ks_raw_buf); + } else { + seq_printf(f, "raw data\n"); + } + break; + case KSTAT_TYPE_NAMED: + seq_printf(f, "%-31s %-4s %s\n", + "name", "type", "data"); + break; + case KSTAT_TYPE_INTR: + seq_printf(f, "%-8s %-8s %-8s %-8s %-8s\n", + "hard", "soft", "watchdog", + "spurious", "multsvc"); + break; + case KSTAT_TYPE_IO: + seq_printf(f, + "%-8s %-8s %-8s %-8s %-8s %-8s " + "%-8s %-8s %-8s %-8s %-8s %-8s\n", + "nread", "nwritten", "reads", "writes", + 
"wtime", "wlentime", "wupdate", + "rtime", "rlentime", "rupdate", + "wcnt", "rcnt"); + break; + case KSTAT_TYPE_TIMER: + seq_printf(f, + "%-31s %-8s " + "%-8s %-8s %-8s %-8s %-8s\n", + "name", "events", "elapsed", + "min", "max", "start", "stop"); + break; + default: + PANIC("Undefined kstat type %d\n", ksp->ks_type); + } + + return (-rc); +} + +static int +kstat_seq_show_raw(struct seq_file *f, unsigned char *p, int l) +{ + int i, j; + + for (i = 0; ; i++) { + seq_printf(f, "%03x:", i); + + for (j = 0; j < 16; j++) { + if (i * 16 + j >= l) { + seq_printf(f, "\n"); + goto out; + } + + seq_printf(f, " %02x", (unsigned char)p[i * 16 + j]); + } + seq_printf(f, "\n"); + } +out: + return (0); +} + +static int +kstat_seq_show_named(struct seq_file *f, kstat_named_t *knp) +{ + seq_printf(f, "%-31s %-4d ", knp->name, knp->data_type); + + switch (knp->data_type) { + case KSTAT_DATA_CHAR: + knp->value.c[15] = '\0'; /* NULL terminate */ + seq_printf(f, "%-16s", knp->value.c); + break; + /* + * NOTE - We need to be more careful able what tokens are + * used for each arch, for now this is correct for x86_64. + */ + case KSTAT_DATA_INT32: + seq_printf(f, "%d", knp->value.i32); + break; + case KSTAT_DATA_UINT32: + seq_printf(f, "%u", knp->value.ui32); + break; + case KSTAT_DATA_INT64: + seq_printf(f, "%lld", (signed long long)knp->value.i64); + break; + case KSTAT_DATA_UINT64: + seq_printf(f, "%llu", + (unsigned long long)knp->value.ui64); + break; + case KSTAT_DATA_LONG: + seq_printf(f, "%ld", knp->value.l); + break; + case KSTAT_DATA_ULONG: + seq_printf(f, "%lu", knp->value.ul); + break; + case KSTAT_DATA_STRING: + KSTAT_NAMED_STR_PTR(knp) + [KSTAT_NAMED_STR_BUFLEN(knp)-1] = '\0'; + seq_printf(f, "%s", KSTAT_NAMED_STR_PTR(knp)); + break; + default: + PANIC("Undefined kstat data type %d\n", knp->data_type); + } + + seq_printf(f, "\n"); + + return (0); +} + +static int +kstat_seq_show_intr(struct seq_file *f, kstat_intr_t *kip) +{ + seq_printf(f, "%-8u %-8u %-8u %-8u %-8u\n", + kip->intrs[KSTAT_INTR_HARD], + kip->intrs[KSTAT_INTR_SOFT], + kip->intrs[KSTAT_INTR_WATCHDOG], + kip->intrs[KSTAT_INTR_SPURIOUS], + kip->intrs[KSTAT_INTR_MULTSVC]); + + return (0); +} + +static int +kstat_seq_show_io(struct seq_file *f, kstat_io_t *kip) +{ + /* though wlentime & friends are signed, they will never be negative */ + seq_printf(f, + "%-8llu %-8llu %-8u %-8u %-8llu %-8llu " + "%-8llu %-8llu %-8llu %-8llu %-8u %-8u\n", + kip->nread, kip->nwritten, + kip->reads, kip->writes, + kip->wtime, kip->wlentime, kip->wlastupdate, + kip->rtime, kip->rlentime, kip->rlastupdate, + kip->wcnt, kip->rcnt); + + return (0); +} + +static int +kstat_seq_show_timer(struct seq_file *f, kstat_timer_t *ktp) +{ + seq_printf(f, + "%-31s %-8llu %-8llu %-8llu %-8llu %-8llu %-8llu\n", + ktp->name, ktp->num_events, ktp->elapsed_time, + ktp->min_time, ktp->max_time, + ktp->start_time, ktp->stop_time); + + return (0); +} + +static int +kstat_seq_show(struct seq_file *f, void *p) +{ + kstat_t *ksp = (kstat_t *)f->private; + int rc = 0; + + ASSERT(ksp->ks_magic == KS_MAGIC); + + switch (ksp->ks_type) { + case KSTAT_TYPE_RAW: +restart: + if (ksp->ks_raw_ops.data) { + rc = ksp->ks_raw_ops.data( + ksp->ks_raw_buf, ksp->ks_raw_bufsize, p); + if (rc == ENOMEM && !kstat_resize_raw(ksp)) + goto restart; + if (!rc) + seq_puts(f, ksp->ks_raw_buf); + } else { + ASSERT(ksp->ks_ndata == 1); + rc = kstat_seq_show_raw(f, ksp->ks_data, + ksp->ks_data_size); + } + break; + case KSTAT_TYPE_NAMED: + rc = kstat_seq_show_named(f, (kstat_named_t *)p); + break; + case 
KSTAT_TYPE_INTR: + rc = kstat_seq_show_intr(f, (kstat_intr_t *)p); + break; + case KSTAT_TYPE_IO: + rc = kstat_seq_show_io(f, (kstat_io_t *)p); + break; + case KSTAT_TYPE_TIMER: + rc = kstat_seq_show_timer(f, (kstat_timer_t *)p); + break; + default: + PANIC("Undefined kstat type %d\n", ksp->ks_type); + } + + return (-rc); +} + +static int +kstat_default_update(kstat_t *ksp, int rw) +{ + ASSERT(ksp != NULL); + + if (rw == KSTAT_WRITE) + return (EACCES); + + return (0); +} + +static void * +kstat_seq_data_addr(kstat_t *ksp, loff_t n) +{ + void *rc = NULL; + + switch (ksp->ks_type) { + case KSTAT_TYPE_RAW: + if (ksp->ks_raw_ops.addr) + rc = ksp->ks_raw_ops.addr(ksp, n); + else + rc = ksp->ks_data; + break; + case KSTAT_TYPE_NAMED: + rc = ksp->ks_data + n * sizeof (kstat_named_t); + break; + case KSTAT_TYPE_INTR: + rc = ksp->ks_data + n * sizeof (kstat_intr_t); + break; + case KSTAT_TYPE_IO: + rc = ksp->ks_data + n * sizeof (kstat_io_t); + break; + case KSTAT_TYPE_TIMER: + rc = ksp->ks_data + n * sizeof (kstat_timer_t); + break; + default: + PANIC("Undefined kstat type %d\n", ksp->ks_type); + } + + return (rc); +} + +static void * +kstat_seq_start(struct seq_file *f, loff_t *pos) +{ + loff_t n = *pos; + kstat_t *ksp = (kstat_t *)f->private; + ASSERT(ksp->ks_magic == KS_MAGIC); + + mutex_enter(ksp->ks_lock); + + if (ksp->ks_type == KSTAT_TYPE_RAW) { + ksp->ks_raw_bufsize = PAGE_SIZE; + ksp->ks_raw_buf = vmem_alloc(ksp->ks_raw_bufsize, KM_SLEEP); + } + + /* Dynamically update kstat, on error existing kstats are used */ + (void) ksp->ks_update(ksp, KSTAT_READ); + + ksp->ks_snaptime = gethrtime(); + + if (!(ksp->ks_flags & KSTAT_FLAG_NO_HEADERS) && !n && + kstat_seq_show_headers(f)) + return (NULL); + + if (n >= ksp->ks_ndata) + return (NULL); + + return (kstat_seq_data_addr(ksp, n)); +} + +static void * +kstat_seq_next(struct seq_file *f, void *p, loff_t *pos) +{ + kstat_t *ksp = (kstat_t *)f->private; + ASSERT(ksp->ks_magic == KS_MAGIC); + + ++*pos; + if (*pos >= ksp->ks_ndata) + return (NULL); + + return (kstat_seq_data_addr(ksp, *pos)); +} + +static void +kstat_seq_stop(struct seq_file *f, void *v) +{ + kstat_t *ksp = (kstat_t *)f->private; + ASSERT(ksp->ks_magic == KS_MAGIC); + + if (ksp->ks_type == KSTAT_TYPE_RAW) + vmem_free(ksp->ks_raw_buf, ksp->ks_raw_bufsize); + + mutex_exit(ksp->ks_lock); +} + +static struct seq_operations kstat_seq_ops = { + .show = kstat_seq_show, + .start = kstat_seq_start, + .next = kstat_seq_next, + .stop = kstat_seq_stop, +}; + +static kstat_module_t * +kstat_find_module(char *name) +{ + kstat_module_t *module; + + list_for_each_entry(module, &kstat_module_list, ksm_module_list) { + if (strncmp(name, module->ksm_name, KSTAT_STRLEN) == 0) + return (module); + } + + return (NULL); +} + +static kstat_module_t * +kstat_create_module(char *name) +{ + kstat_module_t *module; + struct proc_dir_entry *pde; + + pde = proc_mkdir(name, proc_spl_kstat); + if (pde == NULL) + return (NULL); + + module = kmem_alloc(sizeof (kstat_module_t), KM_SLEEP); + module->ksm_proc = pde; + strlcpy(module->ksm_name, name, KSTAT_STRLEN+1); + INIT_LIST_HEAD(&module->ksm_kstat_list); + list_add_tail(&module->ksm_module_list, &kstat_module_list); + + return (module); + +} + +static void +kstat_delete_module(kstat_module_t *module) +{ + ASSERT(list_empty(&module->ksm_kstat_list)); + remove_proc_entry(module->ksm_name, proc_spl_kstat); + list_del(&module->ksm_module_list); + kmem_free(module, sizeof (kstat_module_t)); +} + +static int +proc_kstat_open(struct inode *inode, struct file *filp) +{ + 
struct seq_file *f; + int rc; + + rc = seq_open(filp, &kstat_seq_ops); + if (rc) + return (rc); + + f = filp->private_data; + f->private = PDE_DATA(inode); + + return (rc); +} + +static ssize_t +proc_kstat_write(struct file *filp, const char __user *buf, size_t len, + loff_t *ppos) +{ + struct seq_file *f = filp->private_data; + kstat_t *ksp = f->private; + int rc; + + ASSERT(ksp->ks_magic == KS_MAGIC); + + mutex_enter(ksp->ks_lock); + rc = ksp->ks_update(ksp, KSTAT_WRITE); + mutex_exit(ksp->ks_lock); + + if (rc) + return (-rc); + + *ppos += len; + return (len); +} + +static struct file_operations proc_kstat_operations = { + .open = proc_kstat_open, + .write = proc_kstat_write, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +void +__kstat_set_raw_ops(kstat_t *ksp, + int (*headers)(char *buf, size_t size), + int (*data)(char *buf, size_t size, void *data), + void *(*addr)(kstat_t *ksp, loff_t index)) +{ + ksp->ks_raw_ops.headers = headers; + ksp->ks_raw_ops.data = data; + ksp->ks_raw_ops.addr = addr; +} +EXPORT_SYMBOL(__kstat_set_raw_ops); + +void +kstat_proc_entry_init(kstat_proc_entry_t *kpep, const char *module, + const char *name) +{ + kpep->kpe_owner = NULL; + kpep->kpe_proc = NULL; + INIT_LIST_HEAD(&kpep->kpe_list); + strncpy(kpep->kpe_module, module, KSTAT_STRLEN); + strncpy(kpep->kpe_name, name, KSTAT_STRLEN); +} +EXPORT_SYMBOL(kstat_proc_entry_init); + +kstat_t * +__kstat_create(const char *ks_module, int ks_instance, const char *ks_name, + const char *ks_class, uchar_t ks_type, uint_t ks_ndata, + uchar_t ks_flags) +{ + kstat_t *ksp; + + ASSERT(ks_module); + ASSERT(ks_instance == 0); + ASSERT(ks_name); + + if ((ks_type == KSTAT_TYPE_INTR) || (ks_type == KSTAT_TYPE_IO)) + ASSERT(ks_ndata == 1); + + ksp = kmem_zalloc(sizeof (*ksp), KM_SLEEP); + if (ksp == NULL) + return (ksp); + + mutex_enter(&kstat_module_lock); + ksp->ks_kid = kstat_id; + kstat_id++; + mutex_exit(&kstat_module_lock); + + ksp->ks_magic = KS_MAGIC; + mutex_init(&ksp->ks_private_lock, NULL, MUTEX_DEFAULT, NULL); + ksp->ks_lock = &ksp->ks_private_lock; + + ksp->ks_crtime = gethrtime(); + ksp->ks_snaptime = ksp->ks_crtime; + ksp->ks_instance = ks_instance; + strncpy(ksp->ks_class, ks_class, KSTAT_STRLEN); + ksp->ks_type = ks_type; + ksp->ks_flags = ks_flags; + ksp->ks_update = kstat_default_update; + ksp->ks_private = NULL; + ksp->ks_raw_ops.headers = NULL; + ksp->ks_raw_ops.data = NULL; + ksp->ks_raw_ops.addr = NULL; + ksp->ks_raw_buf = NULL; + ksp->ks_raw_bufsize = 0; + kstat_proc_entry_init(&ksp->ks_proc, ks_module, ks_name); + + switch (ksp->ks_type) { + case KSTAT_TYPE_RAW: + ksp->ks_ndata = 1; + ksp->ks_data_size = ks_ndata; + break; + case KSTAT_TYPE_NAMED: + ksp->ks_ndata = ks_ndata; + ksp->ks_data_size = ks_ndata * sizeof (kstat_named_t); + break; + case KSTAT_TYPE_INTR: + ksp->ks_ndata = ks_ndata; + ksp->ks_data_size = ks_ndata * sizeof (kstat_intr_t); + break; + case KSTAT_TYPE_IO: + ksp->ks_ndata = ks_ndata; + ksp->ks_data_size = ks_ndata * sizeof (kstat_io_t); + break; + case KSTAT_TYPE_TIMER: + ksp->ks_ndata = ks_ndata; + ksp->ks_data_size = ks_ndata * sizeof (kstat_timer_t); + break; + default: + PANIC("Undefined kstat type %d\n", ksp->ks_type); + } + + if (ksp->ks_flags & KSTAT_FLAG_VIRTUAL) { + ksp->ks_data = NULL; + } else { + ksp->ks_data = kmem_zalloc(ksp->ks_data_size, KM_SLEEP); + if (ksp->ks_data == NULL) { + kmem_free(ksp, sizeof (*ksp)); + ksp = NULL; + } + } + + return (ksp); +} +EXPORT_SYMBOL(__kstat_create); + +static int +kstat_detect_collision(kstat_proc_entry_t 
*kpep) +{ + kstat_module_t *module; + kstat_proc_entry_t *tmp; + char *parent; + char *cp; + + parent = kmem_asprintf("%s", kpep->kpe_module); + + if ((cp = strrchr(parent, '/')) == NULL) { + strfree(parent); + return (0); + } + + cp[0] = '\0'; + if ((module = kstat_find_module(parent)) != NULL) { + list_for_each_entry(tmp, &module->ksm_kstat_list, kpe_list) { + if (strncmp(tmp->kpe_name, cp+1, KSTAT_STRLEN) == 0) { + strfree(parent); + return (EEXIST); + } + } + } + + strfree(parent); + return (0); +} + +/* + * Add a file to the proc filesystem under the kstat namespace (i.e. + * /proc/spl/kstat/). The file need not necessarily be implemented as a + * kstat. + */ +void +kstat_proc_entry_install(kstat_proc_entry_t *kpep, mode_t mode, + const struct file_operations *file_ops, void *data) +{ + kstat_module_t *module; + kstat_proc_entry_t *tmp; + + ASSERT(kpep); + + mutex_enter(&kstat_module_lock); + + module = kstat_find_module(kpep->kpe_module); + if (module == NULL) { + if (kstat_detect_collision(kpep) != 0) { + cmn_err(CE_WARN, "kstat_create('%s', '%s'): namespace" \ + " collision", kpep->kpe_module, kpep->kpe_name); + goto out; + } + module = kstat_create_module(kpep->kpe_module); + if (module == NULL) + goto out; + } + + /* + * Only one entry by this name per-module, on failure the module + * shouldn't be deleted because we know it has at least one entry. + */ + list_for_each_entry(tmp, &module->ksm_kstat_list, kpe_list) { + if (strncmp(tmp->kpe_name, kpep->kpe_name, KSTAT_STRLEN) == 0) + goto out; + } + + list_add_tail(&kpep->kpe_list, &module->ksm_kstat_list); + + kpep->kpe_owner = module; + kpep->kpe_proc = proc_create_data(kpep->kpe_name, mode, + module->ksm_proc, file_ops, data); + if (kpep->kpe_proc == NULL) { + list_del_init(&kpep->kpe_list); + if (list_empty(&module->ksm_kstat_list)) + kstat_delete_module(module); + } +out: + mutex_exit(&kstat_module_lock); + +} +EXPORT_SYMBOL(kstat_proc_entry_install); + +void +__kstat_install(kstat_t *ksp) +{ + ASSERT(ksp); + mode_t mode; + /* Specify permission modes for different kstats */ + if (strncmp(ksp->ks_proc.kpe_name, "dbufs", KSTAT_STRLEN) == 0) { + mode = 0600; + } else { + mode = 0644; + } + kstat_proc_entry_install( + &ksp->ks_proc, mode, &proc_kstat_operations, ksp); +} +EXPORT_SYMBOL(__kstat_install); + +void +kstat_proc_entry_delete(kstat_proc_entry_t *kpep) +{ + kstat_module_t *module = kpep->kpe_owner; + if (kpep->kpe_proc) + remove_proc_entry(kpep->kpe_name, module->ksm_proc); + + mutex_enter(&kstat_module_lock); + list_del_init(&kpep->kpe_list); + + /* + * Remove top level module directory if it wasn't empty before, but now + * is. 
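+ * The kpe_proc check also covers entries whose proc file was never
+ * created; such entries were never counted against the module
+ * directory and must not trigger its removal.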
+ */ + if (kpep->kpe_proc && list_empty(&module->ksm_kstat_list)) + kstat_delete_module(module); + mutex_exit(&kstat_module_lock); + +} +EXPORT_SYMBOL(kstat_proc_entry_delete); + +void +__kstat_delete(kstat_t *ksp) +{ + kstat_proc_entry_delete(&ksp->ks_proc); + + if (!(ksp->ks_flags & KSTAT_FLAG_VIRTUAL)) + kmem_free(ksp->ks_data, ksp->ks_data_size); + + ksp->ks_lock = NULL; + mutex_destroy(&ksp->ks_private_lock); + kmem_free(ksp, sizeof (*ksp)); +} +EXPORT_SYMBOL(__kstat_delete); + +int +spl_kstat_init(void) +{ + mutex_init(&kstat_module_lock, NULL, MUTEX_DEFAULT, NULL); + INIT_LIST_HEAD(&kstat_module_list); + kstat_id = 0; + return (0); +} + +void +spl_kstat_fini(void) +{ + ASSERT(list_empty(&kstat_module_list)); + mutex_destroy(&kstat_module_lock); +} diff --git a/module/os/linux/spl/spl-proc.c b/module/os/linux/spl/spl-proc.c new file mode 100644 index 000000000..a75bcc214 --- /dev/null +++ b/module/os/linux/spl/spl-proc.c @@ -0,0 +1,782 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf <[email protected]>. + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see <http://zfsonlinux.org/>. + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see <http://www.gnu.org/licenses/>. + * + * Solaris Porting Layer (SPL) Proc Implementation. 
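+ *
+ * Registers the kernel.spl.* sysctl tree (gitrev, hostid, kmem, kstat)
+ * and creates the /proc/spl hierarchy: the taskq and taskq-all listings,
+ * the kmem/slab cache statistics, and the kstat directory used by
+ * spl-kstat.c and spl-procfs-list.c.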
+ */ + +#include <sys/systeminfo.h> +#include <sys/kstat.h> +#include <sys/kmem.h> +#include <sys/kmem_cache.h> +#include <sys/vmem.h> +#include <sys/taskq.h> +#include <sys/proc.h> +#include <linux/ctype.h> +#include <linux/kmod.h> +#include <linux/seq_file.h> +#include <linux/uaccess.h> +#include <linux/version.h> + +#if defined(CONSTIFY_PLUGIN) && LINUX_VERSION_CODE >= KERNEL_VERSION(3, 8, 0) +typedef struct ctl_table __no_const spl_ctl_table; +#else +typedef struct ctl_table spl_ctl_table; +#endif + +static unsigned long table_min = 0; +static unsigned long table_max = ~0; + +static struct ctl_table_header *spl_header = NULL; +static struct proc_dir_entry *proc_spl = NULL; +static struct proc_dir_entry *proc_spl_kmem = NULL; +static struct proc_dir_entry *proc_spl_kmem_slab = NULL; +static struct proc_dir_entry *proc_spl_taskq_all = NULL; +static struct proc_dir_entry *proc_spl_taskq = NULL; +struct proc_dir_entry *proc_spl_kstat = NULL; + +static int +proc_copyin_string(char *kbuffer, int kbuffer_size, const char *ubuffer, + int ubuffer_size) +{ + int size; + + if (ubuffer_size > kbuffer_size) + return (-EOVERFLOW); + + if (copy_from_user((void *)kbuffer, (void *)ubuffer, ubuffer_size)) + return (-EFAULT); + + /* strip trailing whitespace */ + size = strnlen(kbuffer, ubuffer_size); + while (size-- >= 0) + if (!isspace(kbuffer[size])) + break; + + /* empty string */ + if (size < 0) + return (-EINVAL); + + /* no space to terminate */ + if (size == kbuffer_size) + return (-EOVERFLOW); + + kbuffer[size + 1] = 0; + return (0); +} + +static int +proc_copyout_string(char *ubuffer, int ubuffer_size, const char *kbuffer, + char *append) +{ + /* + * NB if 'append' != NULL, it's a single character to append to the + * copied out string - usually "\n", for /proc entries and + * (i.e. 
a terminating zero byte) for sysctl entries + */ + int size = MIN(strlen(kbuffer), ubuffer_size); + + if (copy_to_user(ubuffer, kbuffer, size)) + return (-EFAULT); + + if (append != NULL && size < ubuffer_size) { + if (copy_to_user(ubuffer + size, append, 1)) + return (-EFAULT); + + size++; + } + + return (size); +} + +#ifdef DEBUG_KMEM +static int +proc_domemused(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int rc = 0; + unsigned long min = 0, max = ~0, val; + spl_ctl_table dummy = *table; + + dummy.data = &val; + dummy.proc_handler = &proc_dointvec; + dummy.extra1 = &min; + dummy.extra2 = &max; + + if (write) { + *ppos += *lenp; + } else { +#ifdef HAVE_ATOMIC64_T + val = atomic64_read((atomic64_t *)table->data); +#else + val = atomic_read((atomic_t *)table->data); +#endif /* HAVE_ATOMIC64_T */ + rc = proc_doulongvec_minmax(&dummy, write, buffer, lenp, ppos); + } + + return (rc); +} +#endif /* DEBUG_KMEM */ + +static int +proc_doslab(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int rc = 0; + unsigned long min = 0, max = ~0, val = 0, mask; + spl_ctl_table dummy = *table; + spl_kmem_cache_t *skc; + + dummy.data = &val; + dummy.proc_handler = &proc_dointvec; + dummy.extra1 = &min; + dummy.extra2 = &max; + + if (write) { + *ppos += *lenp; + } else { + down_read(&spl_kmem_cache_sem); + mask = (unsigned long)table->data; + + list_for_each_entry(skc, &spl_kmem_cache_list, skc_list) { + + /* Only use slabs of the correct kmem/vmem type */ + if (!(skc->skc_flags & mask)) + continue; + + /* Sum the specified field for selected slabs */ + switch (mask & (KMC_TOTAL | KMC_ALLOC | KMC_MAX)) { + case KMC_TOTAL: + val += skc->skc_slab_size * skc->skc_slab_total; + break; + case KMC_ALLOC: + val += skc->skc_obj_size * skc->skc_obj_alloc; + break; + case KMC_MAX: + val += skc->skc_obj_size * skc->skc_obj_max; + break; + } + } + + up_read(&spl_kmem_cache_sem); + rc = proc_doulongvec_minmax(&dummy, write, buffer, lenp, ppos); + } + + return (rc); +} + +static int +proc_dohostid(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int len, rc = 0; + char *end, str[32]; + + if (write) { + /* + * We can't use proc_doulongvec_minmax() in the write + * case here because hostid while a hex value has no + * leading 0x which confuses the helper function. 
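+ * A write of "deadbeef" (hex without a 0x prefix, as printed by
+ * hostid(1)) is therefore parsed explicitly with
+ * simple_strtoul(str, &end, 16), e.g.:
+ *
+ *   echo $(hostid) >/proc/sys/kernel/spl/hostid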
+ */ + rc = proc_copyin_string(str, sizeof (str), buffer, *lenp); + if (rc < 0) + return (rc); + + spl_hostid = simple_strtoul(str, &end, 16); + if (str == end) + return (-EINVAL); + + } else { + len = snprintf(str, sizeof (str), "%lx", + (unsigned long) zone_get_hostid(NULL)); + if (*ppos >= len) + rc = 0; + else + rc = proc_copyout_string(buffer, + *lenp, str + *ppos, "\n"); + + if (rc >= 0) { + *lenp = rc; + *ppos += rc; + } + } + + return (rc); +} + +static void +taskq_seq_show_headers(struct seq_file *f) +{ + seq_printf(f, "%-25s %5s %5s %5s %5s %5s %5s %12s %5s %10s\n", + "taskq", "act", "nthr", "spwn", "maxt", "pri", + "mina", "maxa", "cura", "flags"); +} + +/* indices into the lheads array below */ +#define LHEAD_PEND 0 +#define LHEAD_PRIO 1 +#define LHEAD_DELAY 2 +#define LHEAD_WAIT 3 +#define LHEAD_ACTIVE 4 +#define LHEAD_SIZE 5 + +/* BEGIN CSTYLED */ +static unsigned int spl_max_show_tasks = 512; +module_param(spl_max_show_tasks, uint, 0644); +MODULE_PARM_DESC(spl_max_show_tasks, "Max number of tasks shown in taskq proc"); +/* END CSTYLED */ + +static int +taskq_seq_show_impl(struct seq_file *f, void *p, boolean_t allflag) +{ + taskq_t *tq = p; + taskq_thread_t *tqt; + spl_wait_queue_entry_t *wq; + struct task_struct *tsk; + taskq_ent_t *tqe; + char name[100]; + struct list_head *lheads[LHEAD_SIZE], *lh; + static char *list_names[LHEAD_SIZE] = + {"pend", "prio", "delay", "wait", "active" }; + int i, j, have_lheads = 0; + unsigned long wflags, flags; + + spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class); + spin_lock_irqsave(&tq->tq_wait_waitq.lock, wflags); + + /* get the various lists and check whether they're empty */ + lheads[LHEAD_PEND] = &tq->tq_pend_list; + lheads[LHEAD_PRIO] = &tq->tq_prio_list; + lheads[LHEAD_DELAY] = &tq->tq_delay_list; +#ifdef HAVE_WAIT_QUEUE_HEAD_ENTRY + lheads[LHEAD_WAIT] = &tq->tq_wait_waitq.head; +#else + lheads[LHEAD_WAIT] = &tq->tq_wait_waitq.task_list; +#endif + lheads[LHEAD_ACTIVE] = &tq->tq_active_list; + + for (i = 0; i < LHEAD_SIZE; ++i) { + if (list_empty(lheads[i])) + lheads[i] = NULL; + else + ++have_lheads; + } + + /* early return in non-"all" mode if lists are all empty */ + if (!allflag && !have_lheads) { + spin_unlock_irqrestore(&tq->tq_wait_waitq.lock, wflags); + spin_unlock_irqrestore(&tq->tq_lock, flags); + return (0); + } + + /* unlock the waitq quickly */ + if (!lheads[LHEAD_WAIT]) + spin_unlock_irqrestore(&tq->tq_wait_waitq.lock, wflags); + + /* show the base taskq contents */ + snprintf(name, sizeof (name), "%s/%d", tq->tq_name, tq->tq_instance); + seq_printf(f, "%-25s ", name); + seq_printf(f, "%5d %5d %5d %5d %5d %5d %12d %5d %10x\n", + tq->tq_nactive, tq->tq_nthreads, tq->tq_nspawn, + tq->tq_maxthreads, tq->tq_pri, tq->tq_minalloc, tq->tq_maxalloc, + tq->tq_nalloc, tq->tq_flags); + + /* show the active list */ + if (lheads[LHEAD_ACTIVE]) { + j = 0; + list_for_each_entry(tqt, &tq->tq_active_list, tqt_active_list) { + if (j == 0) + seq_printf(f, "\t%s:", + list_names[LHEAD_ACTIVE]); + else if (j == 2) { + seq_printf(f, "\n\t "); + j = 0; + } + seq_printf(f, " [%d]%pf(%ps)", + tqt->tqt_thread->pid, + tqt->tqt_task->tqent_func, + tqt->tqt_task->tqent_arg); + ++j; + } + seq_printf(f, "\n"); + } + + for (i = LHEAD_PEND; i <= LHEAD_WAIT; ++i) + if (lheads[i]) { + j = 0; + list_for_each(lh, lheads[i]) { + if (spl_max_show_tasks != 0 && + j >= spl_max_show_tasks) { + seq_printf(f, "\n\t(truncated)"); + break; + } + /* show the wait waitq list */ + if (i == LHEAD_WAIT) { +#ifdef HAVE_WAIT_QUEUE_HEAD_ENTRY + wq = 
list_entry(lh, + spl_wait_queue_entry_t, entry); +#else + wq = list_entry(lh, + spl_wait_queue_entry_t, task_list); +#endif + if (j == 0) + seq_printf(f, "\t%s:", + list_names[i]); + else if (j % 8 == 0) + seq_printf(f, "\n\t "); + + tsk = wq->private; + seq_printf(f, " %d", tsk->pid); + /* pend, prio and delay lists */ + } else { + tqe = list_entry(lh, taskq_ent_t, + tqent_list); + if (j == 0) + seq_printf(f, "\t%s:", + list_names[i]); + else if (j % 2 == 0) + seq_printf(f, "\n\t "); + + seq_printf(f, " %pf(%ps)", + tqe->tqent_func, + tqe->tqent_arg); + } + ++j; + } + seq_printf(f, "\n"); + } + if (lheads[LHEAD_WAIT]) + spin_unlock_irqrestore(&tq->tq_wait_waitq.lock, wflags); + spin_unlock_irqrestore(&tq->tq_lock, flags); + + return (0); +} + +static int +taskq_all_seq_show(struct seq_file *f, void *p) +{ + return (taskq_seq_show_impl(f, p, B_TRUE)); +} + +static int +taskq_seq_show(struct seq_file *f, void *p) +{ + return (taskq_seq_show_impl(f, p, B_FALSE)); +} + +static void * +taskq_seq_start(struct seq_file *f, loff_t *pos) +{ + struct list_head *p; + loff_t n = *pos; + + down_read(&tq_list_sem); + if (!n) + taskq_seq_show_headers(f); + + p = tq_list.next; + while (n--) { + p = p->next; + if (p == &tq_list) + return (NULL); + } + + return (list_entry(p, taskq_t, tq_taskqs)); +} + +static void * +taskq_seq_next(struct seq_file *f, void *p, loff_t *pos) +{ + taskq_t *tq = p; + + ++*pos; + return ((tq->tq_taskqs.next == &tq_list) ? + NULL : list_entry(tq->tq_taskqs.next, taskq_t, tq_taskqs)); +} + +static void +slab_seq_show_headers(struct seq_file *f) +{ + seq_printf(f, + "--------------------- cache ----------" + "--------------------------------------------- " + "----- slab ------ " + "---- object ----- " + "--- emergency ---\n"); + seq_printf(f, + "name " + " flags size alloc slabsize objsize " + "total alloc max " + "total alloc max " + "dlock alloc max\n"); +} + +static int +slab_seq_show(struct seq_file *f, void *p) +{ + spl_kmem_cache_t *skc = p; + + ASSERT(skc->skc_magic == SKC_MAGIC); + + /* + * Backed by Linux slab see /proc/slabinfo. + */ + if (skc->skc_flags & KMC_SLAB) + return (0); + + spin_lock(&skc->skc_lock); + seq_printf(f, "%-36s ", skc->skc_name); + seq_printf(f, "0x%05lx %9lu %9lu %8u %8u " + "%5lu %5lu %5lu %5lu %5lu %5lu %5lu %5lu %5lu\n", + (long unsigned)skc->skc_flags, + (long unsigned)(skc->skc_slab_size * skc->skc_slab_total), + (long unsigned)(skc->skc_obj_size * skc->skc_obj_alloc), + (unsigned)skc->skc_slab_size, + (unsigned)skc->skc_obj_size, + (long unsigned)skc->skc_slab_total, + (long unsigned)skc->skc_slab_alloc, + (long unsigned)skc->skc_slab_max, + (long unsigned)skc->skc_obj_total, + (long unsigned)skc->skc_obj_alloc, + (long unsigned)skc->skc_obj_max, + (long unsigned)skc->skc_obj_deadlock, + (long unsigned)skc->skc_obj_emergency, + (long unsigned)skc->skc_obj_emergency_max); + + spin_unlock(&skc->skc_lock); + + return (0); +} + +static void * +slab_seq_start(struct seq_file *f, loff_t *pos) +{ + struct list_head *p; + loff_t n = *pos; + + down_read(&spl_kmem_cache_sem); + if (!n) + slab_seq_show_headers(f); + + p = spl_kmem_cache_list.next; + while (n--) { + p = p->next; + if (p == &spl_kmem_cache_list) + return (NULL); + } + + return (list_entry(p, spl_kmem_cache_t, skc_list)); +} + +static void * +slab_seq_next(struct seq_file *f, void *p, loff_t *pos) +{ + spl_kmem_cache_t *skc = p; + + ++*pos; + return ((skc->skc_list.next == &spl_kmem_cache_list) ? 
+ NULL : list_entry(skc->skc_list.next, spl_kmem_cache_t, skc_list)); +} + +static void +slab_seq_stop(struct seq_file *f, void *v) +{ + up_read(&spl_kmem_cache_sem); +} + +static struct seq_operations slab_seq_ops = { + .show = slab_seq_show, + .start = slab_seq_start, + .next = slab_seq_next, + .stop = slab_seq_stop, +}; + +static int +proc_slab_open(struct inode *inode, struct file *filp) +{ + return (seq_open(filp, &slab_seq_ops)); +} + +static struct file_operations proc_slab_operations = { + .open = proc_slab_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static void +taskq_seq_stop(struct seq_file *f, void *v) +{ + up_read(&tq_list_sem); +} + +static struct seq_operations taskq_all_seq_ops = { + .show = taskq_all_seq_show, + .start = taskq_seq_start, + .next = taskq_seq_next, + .stop = taskq_seq_stop, +}; + +static struct seq_operations taskq_seq_ops = { + .show = taskq_seq_show, + .start = taskq_seq_start, + .next = taskq_seq_next, + .stop = taskq_seq_stop, +}; + +static int +proc_taskq_all_open(struct inode *inode, struct file *filp) +{ + return (seq_open(filp, &taskq_all_seq_ops)); +} + +static int +proc_taskq_open(struct inode *inode, struct file *filp) +{ + return (seq_open(filp, &taskq_seq_ops)); +} + +static struct file_operations proc_taskq_all_operations = { + .open = proc_taskq_all_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static struct file_operations proc_taskq_operations = { + .open = proc_taskq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static struct ctl_table spl_kmem_table[] = { +#ifdef DEBUG_KMEM + { + .procname = "kmem_used", + .data = &kmem_alloc_used, +#ifdef HAVE_ATOMIC64_T + .maxlen = sizeof (atomic64_t), +#else + .maxlen = sizeof (atomic_t), +#endif /* HAVE_ATOMIC64_T */ + .mode = 0444, + .proc_handler = &proc_domemused, + }, + { + .procname = "kmem_max", + .data = &kmem_alloc_max, + .maxlen = sizeof (unsigned long), + .extra1 = &table_min, + .extra2 = &table_max, + .mode = 0444, + .proc_handler = &proc_doulongvec_minmax, + }, +#endif /* DEBUG_KMEM */ + { + .procname = "slab_kmem_total", + .data = (void *)(KMC_KMEM | KMC_TOTAL), + .maxlen = sizeof (unsigned long), + .extra1 = &table_min, + .extra2 = &table_max, + .mode = 0444, + .proc_handler = &proc_doslab, + }, + { + .procname = "slab_kmem_alloc", + .data = (void *)(KMC_KMEM | KMC_ALLOC), + .maxlen = sizeof (unsigned long), + .extra1 = &table_min, + .extra2 = &table_max, + .mode = 0444, + .proc_handler = &proc_doslab, + }, + { + .procname = "slab_kmem_max", + .data = (void *)(KMC_KMEM | KMC_MAX), + .maxlen = sizeof (unsigned long), + .extra1 = &table_min, + .extra2 = &table_max, + .mode = 0444, + .proc_handler = &proc_doslab, + }, + { + .procname = "slab_vmem_total", + .data = (void *)(KMC_VMEM | KMC_TOTAL), + .maxlen = sizeof (unsigned long), + .extra1 = &table_min, + .extra2 = &table_max, + .mode = 0444, + .proc_handler = &proc_doslab, + }, + { + .procname = "slab_vmem_alloc", + .data = (void *)(KMC_VMEM | KMC_ALLOC), + .maxlen = sizeof (unsigned long), + .extra1 = &table_min, + .extra2 = &table_max, + .mode = 0444, + .proc_handler = &proc_doslab, + }, + { + .procname = "slab_vmem_max", + .data = (void *)(KMC_VMEM | KMC_MAX), + .maxlen = sizeof (unsigned long), + .extra1 = &table_min, + .extra2 = &table_max, + .mode = 0444, + .proc_handler = &proc_doslab, + }, + {}, +}; + +static struct ctl_table spl_kstat_table[] = { + {}, +}; + +static struct ctl_table spl_table[] = { + /* + * NB No .strategy 
entries have been provided since + * sysctl(8) prefers to go via /proc for portability. + */ + { + .procname = "gitrev", + .data = spl_gitrev, + .maxlen = sizeof (spl_gitrev), + .mode = 0444, + .proc_handler = &proc_dostring, + }, + { + .procname = "hostid", + .data = &spl_hostid, + .maxlen = sizeof (unsigned long), + .mode = 0644, + .proc_handler = &proc_dohostid, + }, + { + .procname = "kmem", + .mode = 0555, + .child = spl_kmem_table, + }, + { + .procname = "kstat", + .mode = 0555, + .child = spl_kstat_table, + }, + {}, +}; + +static struct ctl_table spl_dir[] = { + { + .procname = "spl", + .mode = 0555, + .child = spl_table, + }, + {} +}; + +static struct ctl_table spl_root[] = { + { +#ifdef HAVE_CTL_NAME + .ctl_name = CTL_KERN, +#endif + .procname = "kernel", + .mode = 0555, + .child = spl_dir, + }, + {} +}; + +int +spl_proc_init(void) +{ + int rc = 0; + + spl_header = register_sysctl_table(spl_root); + if (spl_header == NULL) + return (-EUNATCH); + + proc_spl = proc_mkdir("spl", NULL); + if (proc_spl == NULL) { + rc = -EUNATCH; + goto out; + } + + proc_spl_taskq_all = proc_create_data("taskq-all", 0444, proc_spl, + &proc_taskq_all_operations, NULL); + if (proc_spl_taskq_all == NULL) { + rc = -EUNATCH; + goto out; + } + + proc_spl_taskq = proc_create_data("taskq", 0444, proc_spl, + &proc_taskq_operations, NULL); + if (proc_spl_taskq == NULL) { + rc = -EUNATCH; + goto out; + } + + proc_spl_kmem = proc_mkdir("kmem", proc_spl); + if (proc_spl_kmem == NULL) { + rc = -EUNATCH; + goto out; + } + + proc_spl_kmem_slab = proc_create_data("slab", 0444, proc_spl_kmem, + &proc_slab_operations, NULL); + if (proc_spl_kmem_slab == NULL) { + rc = -EUNATCH; + goto out; + } + + proc_spl_kstat = proc_mkdir("kstat", proc_spl); + if (proc_spl_kstat == NULL) { + rc = -EUNATCH; + goto out; + } +out: + if (rc) { + remove_proc_entry("kstat", proc_spl); + remove_proc_entry("slab", proc_spl_kmem); + remove_proc_entry("kmem", proc_spl); + remove_proc_entry("taskq-all", proc_spl); + remove_proc_entry("taskq", proc_spl); + remove_proc_entry("spl", NULL); + unregister_sysctl_table(spl_header); + } + + return (rc); +} + +void +spl_proc_fini(void) +{ + remove_proc_entry("kstat", proc_spl); + remove_proc_entry("slab", proc_spl_kmem); + remove_proc_entry("kmem", proc_spl); + remove_proc_entry("taskq-all", proc_spl); + remove_proc_entry("taskq", proc_spl); + remove_proc_entry("spl", NULL); + + ASSERT(spl_header != NULL); + unregister_sysctl_table(spl_header); +} diff --git a/module/os/linux/spl/spl-procfs-list.c b/module/os/linux/spl/spl-procfs-list.c new file mode 100644 index 000000000..f6a00da5c --- /dev/null +++ b/module/os/linux/spl/spl-procfs-list.c @@ -0,0 +1,257 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2018 by Delphix. All rights reserved. + */ + +#include <sys/list.h> +#include <sys/mutex.h> +#include <sys/procfs_list.h> +#include <linux/proc_fs.h> + +/* + * A procfs_list is a wrapper around a linked list which implements the seq_file + * interface, allowing the contents of the list to be exposed through procfs. + * The kernel already has some utilities to help implement the seq_file + * interface for linked lists (seq_list_*), but they aren't appropriate for use + * with lists that have many entries, because seq_list_start walks the list at + * the start of each read syscall to find where it left off, so reading a file + * ends up being quadratic in the number of entries in the list. + * + * This implementation avoids this penalty by maintaining a separate cursor into + * the list per instance of the file that is open. It also maintains some extra + * information in each node of the list to prevent reads of entries that have + * been dropped from the list. + * + * Callers should only add elements to the list using procfs_list_add, which + * adds an element to the tail of the list. Other operations can be performed + * directly on the wrapped list using the normal list manipulation functions, + * but elements should only be removed from the head of the list. + */ + +#define NODE_ID(procfs_list, obj) \ + (((procfs_list_node_t *)(((char *)obj) + \ + (procfs_list)->pl_node_offset))->pln_id) + +typedef struct procfs_list_cursor { + procfs_list_t *procfs_list; /* List into which this cursor points */ + void *cached_node; /* Most recently accessed node */ + loff_t cached_pos; /* Position of cached_node */ +} procfs_list_cursor_t; + +static int +procfs_list_seq_show(struct seq_file *f, void *p) +{ + procfs_list_cursor_t *cursor = f->private; + procfs_list_t *procfs_list = cursor->procfs_list; + + ASSERT(MUTEX_HELD(&procfs_list->pl_lock)); + if (p == SEQ_START_TOKEN) { + if (procfs_list->pl_show_header != NULL) + return (procfs_list->pl_show_header(f)); + else + return (0); + } + return (procfs_list->pl_show(f, p)); +} + +static void * +procfs_list_next_node(procfs_list_cursor_t *cursor, loff_t *pos) +{ + void *next_node; + procfs_list_t *procfs_list = cursor->procfs_list; + + if (cursor->cached_node == SEQ_START_TOKEN) + next_node = list_head(&procfs_list->pl_list); + else + next_node = list_next(&procfs_list->pl_list, + cursor->cached_node); + + if (next_node != NULL) { + cursor->cached_node = next_node; + cursor->cached_pos = NODE_ID(procfs_list, cursor->cached_node); + *pos = cursor->cached_pos; + } + return (next_node); +} + +static void * +procfs_list_seq_start(struct seq_file *f, loff_t *pos) +{ + procfs_list_cursor_t *cursor = f->private; + procfs_list_t *procfs_list = cursor->procfs_list; + + mutex_enter(&procfs_list->pl_lock); + + if (*pos == 0) { + cursor->cached_node = SEQ_START_TOKEN; + cursor->cached_pos = 0; + return (SEQ_START_TOKEN); + } + + /* + * Check if our cached pointer has become stale, which happens if the + * the message where we left off has been dropped from the list since + * the last read syscall completed. 
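+ * For example, if this cursor last returned the node with id 5 but
+ * entries 0-9 have since been pruned from the head of the list, the
+ * oldest remaining id is greater than 5 and the read fails with EIO
+ * rather than silently resuming at an unrelated entry.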
+ */ + void *oldest_node = list_head(&procfs_list->pl_list); + if (cursor->cached_node != SEQ_START_TOKEN && (oldest_node == NULL || + NODE_ID(procfs_list, oldest_node) > cursor->cached_pos)) + return (ERR_PTR(-EIO)); + + /* + * If it isn't starting from the beginning of the file, the seq_file + * code will either pick up at the same position it visited last or the + * following one. + */ + if (*pos == cursor->cached_pos) { + return (cursor->cached_node); + } else { + ASSERT3U(*pos, ==, cursor->cached_pos + 1); + return (procfs_list_next_node(cursor, pos)); + } +} + +static void * +procfs_list_seq_next(struct seq_file *f, void *p, loff_t *pos) +{ + procfs_list_cursor_t *cursor = f->private; + ASSERT(MUTEX_HELD(&cursor->procfs_list->pl_lock)); + return (procfs_list_next_node(cursor, pos)); +} + +static void +procfs_list_seq_stop(struct seq_file *f, void *p) +{ + procfs_list_cursor_t *cursor = f->private; + procfs_list_t *procfs_list = cursor->procfs_list; + mutex_exit(&procfs_list->pl_lock); +} + +static struct seq_operations procfs_list_seq_ops = { + .show = procfs_list_seq_show, + .start = procfs_list_seq_start, + .next = procfs_list_seq_next, + .stop = procfs_list_seq_stop, +}; + +static int +procfs_list_open(struct inode *inode, struct file *filp) +{ + int rc = seq_open_private(filp, &procfs_list_seq_ops, + sizeof (procfs_list_cursor_t)); + if (rc != 0) + return (rc); + + struct seq_file *f = filp->private_data; + procfs_list_cursor_t *cursor = f->private; + cursor->procfs_list = PDE_DATA(inode); + cursor->cached_node = NULL; + cursor->cached_pos = 0; + + return (0); +} + +static ssize_t +procfs_list_write(struct file *filp, const char __user *buf, size_t len, + loff_t *ppos) +{ + struct seq_file *f = filp->private_data; + procfs_list_cursor_t *cursor = f->private; + procfs_list_t *procfs_list = cursor->procfs_list; + int rc; + + if (procfs_list->pl_clear != NULL && + (rc = procfs_list->pl_clear(procfs_list)) != 0) + return (-rc); + return (len); +} + +static struct file_operations procfs_list_operations = { + .owner = THIS_MODULE, + .open = procfs_list_open, + .write = procfs_list_write, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + +/* + * Initialize a procfs_list and create a file for it in the proc filesystem + * under the kstat namespace. 
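+ *
+ * Illustrative sketch of a consumer (the foo_* names are hypothetical):
+ *
+ *   typedef struct foo_entry {
+ *           procfs_list_node_t fe_node;
+ *           uint64_t fe_value;
+ *   } foo_entry_t;
+ *
+ *   procfs_list_install("foo_module", "foo_list", 0644, &foo_list,
+ *       foo_show, foo_show_header, foo_clear,
+ *       offsetof(foo_entry_t, fe_node));
+ *
+ * New entries are then appended with procfs_list_add() while holding
+ * foo_list.pl_lock.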
+ */ +void +procfs_list_install(const char *module, + const char *name, + mode_t mode, + procfs_list_t *procfs_list, + int (*show)(struct seq_file *f, void *p), + int (*show_header)(struct seq_file *f), + int (*clear)(procfs_list_t *procfs_list), + size_t procfs_list_node_off) +{ + mutex_init(&procfs_list->pl_lock, NULL, MUTEX_DEFAULT, NULL); + list_create(&procfs_list->pl_list, + procfs_list_node_off + sizeof (procfs_list_node_t), + procfs_list_node_off + offsetof(procfs_list_node_t, pln_link)); + procfs_list->pl_next_id = 1; /* Save id 0 for SEQ_START_TOKEN */ + procfs_list->pl_show = show; + procfs_list->pl_show_header = show_header; + procfs_list->pl_clear = clear; + procfs_list->pl_node_offset = procfs_list_node_off; + + kstat_proc_entry_init(&procfs_list->pl_kstat_entry, module, name); + kstat_proc_entry_install(&procfs_list->pl_kstat_entry, mode, + &procfs_list_operations, procfs_list); +} +EXPORT_SYMBOL(procfs_list_install); + +/* Remove the proc filesystem file corresponding to the given list */ +void +procfs_list_uninstall(procfs_list_t *procfs_list) +{ + kstat_proc_entry_delete(&procfs_list->pl_kstat_entry); +} +EXPORT_SYMBOL(procfs_list_uninstall); + +void +procfs_list_destroy(procfs_list_t *procfs_list) +{ + ASSERT(list_is_empty(&procfs_list->pl_list)); + list_destroy(&procfs_list->pl_list); + mutex_destroy(&procfs_list->pl_lock); +} +EXPORT_SYMBOL(procfs_list_destroy); + +/* + * Add a new node to the tail of the list. While the standard list manipulation + * functions can be use for all other operation, adding elements to the list + * should only be done using this helper so that the id of the new node is set + * correctly. + */ +void +procfs_list_add(procfs_list_t *procfs_list, void *p) +{ + ASSERT(MUTEX_HELD(&procfs_list->pl_lock)); + NODE_ID(procfs_list, p) = procfs_list->pl_next_id++; + list_insert_tail(&procfs_list->pl_list, p); +} +EXPORT_SYMBOL(procfs_list_add); diff --git a/module/os/linux/spl/spl-taskq.c b/module/os/linux/spl/spl-taskq.c new file mode 100644 index 000000000..90e1d0a4d --- /dev/null +++ b/module/os/linux/spl/spl-taskq.c @@ -0,0 +1,1292 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf <[email protected]>. + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see <http://zfsonlinux.org/>. + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see <http://www.gnu.org/licenses/>. + * + * Solaris Porting Layer (SPL) Task Queue Implementation. 
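+ *
+ * Typical consumer usage (illustrative only):
+ *
+ *   taskq_t *tq = taskq_create("my_taskq", 4, maxclsyspri,
+ *       50, INT_MAX, TASKQ_PREPOPULATE);
+ *   taskqid_t id = taskq_dispatch(tq, my_func, my_arg, TQ_SLEEP);
+ *   taskq_wait(tq);
+ *   taskq_destroy(tq);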
+ */ + +#include <sys/timer.h> +#include <sys/taskq.h> +#include <sys/kmem.h> +#include <sys/tsd.h> +#include <sys/simd.h> + +int spl_taskq_thread_bind = 0; +module_param(spl_taskq_thread_bind, int, 0644); +MODULE_PARM_DESC(spl_taskq_thread_bind, "Bind taskq thread to CPU by default"); + + +int spl_taskq_thread_dynamic = 1; +module_param(spl_taskq_thread_dynamic, int, 0644); +MODULE_PARM_DESC(spl_taskq_thread_dynamic, "Allow dynamic taskq threads"); + +int spl_taskq_thread_priority = 1; +module_param(spl_taskq_thread_priority, int, 0644); +MODULE_PARM_DESC(spl_taskq_thread_priority, + "Allow non-default priority for taskq threads"); + +int spl_taskq_thread_sequential = 4; +module_param(spl_taskq_thread_sequential, int, 0644); +MODULE_PARM_DESC(spl_taskq_thread_sequential, + "Create new taskq threads after N sequential tasks"); + +/* Global system-wide dynamic task queue available for all consumers */ +taskq_t *system_taskq; +EXPORT_SYMBOL(system_taskq); +/* Global dynamic task queue for long delay */ +taskq_t *system_delay_taskq; +EXPORT_SYMBOL(system_delay_taskq); + +/* Private dedicated taskq for creating new taskq threads on demand. */ +static taskq_t *dynamic_taskq; +static taskq_thread_t *taskq_thread_create(taskq_t *); + +/* List of all taskqs */ +LIST_HEAD(tq_list); +struct rw_semaphore tq_list_sem; +static uint_t taskq_tsd; + +static int +task_km_flags(uint_t flags) +{ + if (flags & TQ_NOSLEEP) + return (KM_NOSLEEP); + + if (flags & TQ_PUSHPAGE) + return (KM_PUSHPAGE); + + return (KM_SLEEP); +} + +/* + * taskq_find_by_name - Find the largest instance number of a named taskq. + */ +static int +taskq_find_by_name(const char *name) +{ + struct list_head *tql; + taskq_t *tq; + + list_for_each_prev(tql, &tq_list) { + tq = list_entry(tql, taskq_t, tq_taskqs); + if (strcmp(name, tq->tq_name) == 0) + return (tq->tq_instance); + } + return (-1); +} + +/* + * NOTE: Must be called with tq->tq_lock held, returns a list_t which + * is not attached to the free, work, or pending taskq lists. + */ +static taskq_ent_t * +task_alloc(taskq_t *tq, uint_t flags, unsigned long *irqflags) +{ + taskq_ent_t *t; + int count = 0; + + ASSERT(tq); +retry: + /* Acquire taskq_ent_t's from free list if available */ + if (!list_empty(&tq->tq_free_list) && !(flags & TQ_NEW)) { + t = list_entry(tq->tq_free_list.next, taskq_ent_t, tqent_list); + + ASSERT(!(t->tqent_flags & TQENT_FLAG_PREALLOC)); + ASSERT(!(t->tqent_flags & TQENT_FLAG_CANCEL)); + ASSERT(!timer_pending(&t->tqent_timer)); + + list_del_init(&t->tqent_list); + return (t); + } + + /* Free list is empty and memory allocations are prohibited */ + if (flags & TQ_NOALLOC) + return (NULL); + + /* Hit maximum taskq_ent_t pool size */ + if (tq->tq_nalloc >= tq->tq_maxalloc) { + if (flags & TQ_NOSLEEP) + return (NULL); + + /* + * Sleep periodically polling the free list for an available + * taskq_ent_t. Dispatching with TQ_SLEEP should always succeed + * but we cannot block forever waiting for an taskq_ent_t to + * show up in the free list, otherwise a deadlock can happen. + * + * Therefore, we need to allocate a new task even if the number + * of allocated tasks is above tq->tq_maxalloc, but we still + * end up delaying the task allocation by one second, thereby + * throttling the task dispatch rate. 
+ */ + spin_unlock_irqrestore(&tq->tq_lock, *irqflags); + schedule_timeout(HZ / 100); + spin_lock_irqsave_nested(&tq->tq_lock, *irqflags, + tq->tq_lock_class); + if (count < 100) { + count++; + goto retry; + } + } + + spin_unlock_irqrestore(&tq->tq_lock, *irqflags); + t = kmem_alloc(sizeof (taskq_ent_t), task_km_flags(flags)); + spin_lock_irqsave_nested(&tq->tq_lock, *irqflags, tq->tq_lock_class); + + if (t) { + taskq_init_ent(t); + tq->tq_nalloc++; + } + + return (t); +} + +/* + * NOTE: Must be called with tq->tq_lock held, expects the taskq_ent_t + * to already be removed from the free, work, or pending taskq lists. + */ +static void +task_free(taskq_t *tq, taskq_ent_t *t) +{ + ASSERT(tq); + ASSERT(t); + ASSERT(list_empty(&t->tqent_list)); + ASSERT(!timer_pending(&t->tqent_timer)); + + kmem_free(t, sizeof (taskq_ent_t)); + tq->tq_nalloc--; +} + +/* + * NOTE: Must be called with tq->tq_lock held, either destroys the + * taskq_ent_t if too many exist or moves it to the free list for later use. + */ +static void +task_done(taskq_t *tq, taskq_ent_t *t) +{ + ASSERT(tq); + ASSERT(t); + + /* Wake tasks blocked in taskq_wait_id() */ + wake_up_all(&t->tqent_waitq); + + list_del_init(&t->tqent_list); + + if (tq->tq_nalloc <= tq->tq_minalloc) { + t->tqent_id = TASKQID_INVALID; + t->tqent_func = NULL; + t->tqent_arg = NULL; + t->tqent_flags = 0; + + list_add_tail(&t->tqent_list, &tq->tq_free_list); + } else { + task_free(tq, t); + } +} + +/* + * When a delayed task timer expires remove it from the delay list and + * add it to the priority list in order for immediate processing. + */ +static void +task_expire_impl(taskq_ent_t *t) +{ + taskq_ent_t *w; + taskq_t *tq = t->tqent_taskq; + struct list_head *l; + unsigned long flags; + + spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class); + + if (t->tqent_flags & TQENT_FLAG_CANCEL) { + ASSERT(list_empty(&t->tqent_list)); + spin_unlock_irqrestore(&tq->tq_lock, flags); + return; + } + + t->tqent_birth = jiffies; + /* + * The priority list must be maintained in strict task id order + * from lowest to highest for lowest_id to be easily calculable. + */ + list_del(&t->tqent_list); + list_for_each_prev(l, &tq->tq_prio_list) { + w = list_entry(l, taskq_ent_t, tqent_list); + if (w->tqent_id < t->tqent_id) { + list_add(&t->tqent_list, l); + break; + } + } + if (l == &tq->tq_prio_list) + list_add(&t->tqent_list, &tq->tq_prio_list); + + spin_unlock_irqrestore(&tq->tq_lock, flags); + + wake_up(&tq->tq_work_waitq); +} + +static void +task_expire(spl_timer_list_t tl) +{ + struct timer_list *tmr = (struct timer_list *)tl; + taskq_ent_t *t = from_timer(t, tmr, tqent_timer); + task_expire_impl(t); +} + +/* + * Returns the lowest incomplete taskqid_t. The taskqid_t may + * be queued on the pending list, on the priority list, on the + * delay list, or on the work list currently being handled, but + * it is not 100% complete yet. 
+ */ +static taskqid_t +taskq_lowest_id(taskq_t *tq) +{ + taskqid_t lowest_id = tq->tq_next_id; + taskq_ent_t *t; + taskq_thread_t *tqt; + + ASSERT(tq); + + if (!list_empty(&tq->tq_pend_list)) { + t = list_entry(tq->tq_pend_list.next, taskq_ent_t, tqent_list); + lowest_id = MIN(lowest_id, t->tqent_id); + } + + if (!list_empty(&tq->tq_prio_list)) { + t = list_entry(tq->tq_prio_list.next, taskq_ent_t, tqent_list); + lowest_id = MIN(lowest_id, t->tqent_id); + } + + if (!list_empty(&tq->tq_delay_list)) { + t = list_entry(tq->tq_delay_list.next, taskq_ent_t, tqent_list); + lowest_id = MIN(lowest_id, t->tqent_id); + } + + if (!list_empty(&tq->tq_active_list)) { + tqt = list_entry(tq->tq_active_list.next, taskq_thread_t, + tqt_active_list); + ASSERT(tqt->tqt_id != TASKQID_INVALID); + lowest_id = MIN(lowest_id, tqt->tqt_id); + } + + return (lowest_id); +} + +/* + * Insert a task into a list keeping the list sorted by increasing taskqid. + */ +static void +taskq_insert_in_order(taskq_t *tq, taskq_thread_t *tqt) +{ + taskq_thread_t *w; + struct list_head *l; + + ASSERT(tq); + ASSERT(tqt); + + list_for_each_prev(l, &tq->tq_active_list) { + w = list_entry(l, taskq_thread_t, tqt_active_list); + if (w->tqt_id < tqt->tqt_id) { + list_add(&tqt->tqt_active_list, l); + break; + } + } + if (l == &tq->tq_active_list) + list_add(&tqt->tqt_active_list, &tq->tq_active_list); +} + +/* + * Find and return a task from the given list if it exists. The list + * must be in lowest to highest task id order. + */ +static taskq_ent_t * +taskq_find_list(taskq_t *tq, struct list_head *lh, taskqid_t id) +{ + struct list_head *l; + taskq_ent_t *t; + + list_for_each(l, lh) { + t = list_entry(l, taskq_ent_t, tqent_list); + + if (t->tqent_id == id) + return (t); + + if (t->tqent_id > id) + break; + } + + return (NULL); +} + +/* + * Find an already dispatched task given the task id regardless of what + * state it is in. If a task is still pending it will be returned. + * If a task is executing, then -EBUSY will be returned instead. + * If the task has already been run then NULL is returned. + */ +static taskq_ent_t * +taskq_find(taskq_t *tq, taskqid_t id) +{ + taskq_thread_t *tqt; + struct list_head *l; + taskq_ent_t *t; + + t = taskq_find_list(tq, &tq->tq_delay_list, id); + if (t) + return (t); + + t = taskq_find_list(tq, &tq->tq_prio_list, id); + if (t) + return (t); + + t = taskq_find_list(tq, &tq->tq_pend_list, id); + if (t) + return (t); + + list_for_each(l, &tq->tq_active_list) { + tqt = list_entry(l, taskq_thread_t, tqt_active_list); + if (tqt->tqt_id == id) { + /* + * Instead of returning tqt_task, we just return a non + * NULL value to prevent misuse, since tqt_task only + * has two valid fields. + */ + return (ERR_PTR(-EBUSY)); + } + } + + return (NULL); +} + +/* + * Theory for the taskq_wait_id(), taskq_wait_outstanding(), and + * taskq_wait() functions below. + * + * Taskq waiting is accomplished by tracking the lowest outstanding task + * id and the next available task id. As tasks are dispatched they are + * added to the tail of the pending, priority, or delay lists. As worker + * threads become available the tasks are removed from the heads of these + * lists and linked to the worker threads. This ensures the lists are + * kept sorted by lowest to highest task id. + * + * Therefore the lowest outstanding task id can be quickly determined by + * checking the head item from all of these lists. This value is stored + * with the taskq as the lowest id. 
It only needs to be recalculated when
+ * either the task with the current lowest id completes or is canceled.
+ *
+ * By blocking until the lowest task id exceeds the passed task id the
+ * taskq_wait_outstanding() function can be easily implemented. Similarly,
+ * by blocking until the lowest task id matches the next task id taskq_wait()
+ * can be implemented.
+ *
+ * Callers should be aware that when there are multiple worker threads it
+ * is possible for larger task ids to complete before smaller ones. Also
+ * when the taskq contains delay tasks with small task ids callers may
+ * block for a considerable length of time waiting for them to expire and
+ * execute.
+ */
+static int
+taskq_wait_id_check(taskq_t *tq, taskqid_t id)
+{
+ int rc;
+ unsigned long flags;
+
+ spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class);
+ rc = (taskq_find(tq, id) == NULL);
+ spin_unlock_irqrestore(&tq->tq_lock, flags);
+
+ return (rc);
+}
+
+/*
+ * The taskq_wait_id() function blocks until the passed task id completes.
+ * This does not guarantee that all lower task ids have completed.
+ */
+void
+taskq_wait_id(taskq_t *tq, taskqid_t id)
+{
+ wait_event(tq->tq_wait_waitq, taskq_wait_id_check(tq, id));
+}
+EXPORT_SYMBOL(taskq_wait_id);
+
+static int
+taskq_wait_outstanding_check(taskq_t *tq, taskqid_t id)
+{
+ int rc;
+ unsigned long flags;
+
+ spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class);
+ rc = (id < tq->tq_lowest_id);
+ spin_unlock_irqrestore(&tq->tq_lock, flags);
+
+ return (rc);
+}
+
+/*
+ * The taskq_wait_outstanding() function will block until all tasks with a
+ * lower taskqid than the passed 'id' have been completed. Note that all
+ * task ids are assigned monotonically at dispatch time. Zero may be
+ * passed for the id to indicate that all tasks dispatched up to this point,
+ * but not after, should be waited for.
+ */
+void
+taskq_wait_outstanding(taskq_t *tq, taskqid_t id)
+{
+ id = id ? id : tq->tq_next_id - 1;
+ wait_event(tq->tq_wait_waitq, taskq_wait_outstanding_check(tq, id));
+}
+EXPORT_SYMBOL(taskq_wait_outstanding);
+
+static int
+taskq_wait_check(taskq_t *tq)
+{
+ int rc;
+ unsigned long flags;
+
+ spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class);
+ rc = (tq->tq_lowest_id == tq->tq_next_id);
+ spin_unlock_irqrestore(&tq->tq_lock, flags);
+
+ return (rc);
+}
+
+/*
+ * The taskq_wait() function will block until the taskq is empty.
+ * This means that if a taskq re-dispatches work to itself taskq_wait()
+ * callers will block indefinitely.
+ */
+void
+taskq_wait(taskq_t *tq)
+{
+ wait_event(tq->tq_wait_waitq, taskq_wait_check(tq));
+}
+EXPORT_SYMBOL(taskq_wait);
+
+int
+taskq_member(taskq_t *tq, kthread_t *t)
+{
+ return (tq == (taskq_t *)tsd_get_by_thread(taskq_tsd, t));
+}
+EXPORT_SYMBOL(taskq_member);
+
+/*
+ * Cancel an already dispatched task given the task id. Still pending tasks
+ * will be immediately canceled, and if the task is active the function will
+ * block until it completes. Preallocated tasks which are canceled must be
+ * freed by the caller.
+ */
+int
+taskq_cancel_id(taskq_t *tq, taskqid_t id)
+{
+ taskq_ent_t *t;
+ int rc = ENOENT;
+ unsigned long flags;
+
+ ASSERT(tq);
+
+ spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class);
+ t = taskq_find(tq, id);
+ if (t && t != ERR_PTR(-EBUSY)) {
+ list_del_init(&t->tqent_list);
+ t->tqent_flags |= TQENT_FLAG_CANCEL;
+
+ /*
+ * When canceling the lowest outstanding task id we
+ * must recalculate the new lowest outstanding id.
+ */ + if (tq->tq_lowest_id == t->tqent_id) { + tq->tq_lowest_id = taskq_lowest_id(tq); + ASSERT3S(tq->tq_lowest_id, >, t->tqent_id); + } + + /* + * The task_expire() function takes the tq->tq_lock so drop + * drop the lock before synchronously cancelling the timer. + */ + if (timer_pending(&t->tqent_timer)) { + spin_unlock_irqrestore(&tq->tq_lock, flags); + del_timer_sync(&t->tqent_timer); + spin_lock_irqsave_nested(&tq->tq_lock, flags, + tq->tq_lock_class); + } + + if (!(t->tqent_flags & TQENT_FLAG_PREALLOC)) + task_done(tq, t); + + rc = 0; + } + spin_unlock_irqrestore(&tq->tq_lock, flags); + + if (t == ERR_PTR(-EBUSY)) { + taskq_wait_id(tq, id); + rc = EBUSY; + } + + return (rc); +} +EXPORT_SYMBOL(taskq_cancel_id); + +static int taskq_thread_spawn(taskq_t *tq); + +taskqid_t +taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t flags) +{ + taskq_ent_t *t; + taskqid_t rc = TASKQID_INVALID; + unsigned long irqflags; + + ASSERT(tq); + ASSERT(func); + + spin_lock_irqsave_nested(&tq->tq_lock, irqflags, tq->tq_lock_class); + + /* Taskq being destroyed and all tasks drained */ + if (!(tq->tq_flags & TASKQ_ACTIVE)) + goto out; + + /* Do not queue the task unless there is idle thread for it */ + ASSERT(tq->tq_nactive <= tq->tq_nthreads); + if ((flags & TQ_NOQUEUE) && (tq->tq_nactive == tq->tq_nthreads)) { + /* Dynamic taskq may be able to spawn another thread */ + if (!(tq->tq_flags & TASKQ_DYNAMIC) || + taskq_thread_spawn(tq) == 0) + goto out; + } + + if ((t = task_alloc(tq, flags, &irqflags)) == NULL) + goto out; + + spin_lock(&t->tqent_lock); + + /* Queue to the front of the list to enforce TQ_NOQUEUE semantics */ + if (flags & TQ_NOQUEUE) + list_add(&t->tqent_list, &tq->tq_prio_list); + /* Queue to the priority list instead of the pending list */ + else if (flags & TQ_FRONT) + list_add_tail(&t->tqent_list, &tq->tq_prio_list); + else + list_add_tail(&t->tqent_list, &tq->tq_pend_list); + + t->tqent_id = rc = tq->tq_next_id; + tq->tq_next_id++; + t->tqent_func = func; + t->tqent_arg = arg; + t->tqent_taskq = tq; + t->tqent_timer.function = NULL; + t->tqent_timer.expires = 0; + t->tqent_birth = jiffies; + + ASSERT(!(t->tqent_flags & TQENT_FLAG_PREALLOC)); + + spin_unlock(&t->tqent_lock); + + wake_up(&tq->tq_work_waitq); +out: + /* Spawn additional taskq threads if required. */ + if (!(flags & TQ_NOQUEUE) && tq->tq_nactive == tq->tq_nthreads) + (void) taskq_thread_spawn(tq); + + spin_unlock_irqrestore(&tq->tq_lock, irqflags); + return (rc); +} +EXPORT_SYMBOL(taskq_dispatch); + +taskqid_t +taskq_dispatch_delay(taskq_t *tq, task_func_t func, void *arg, + uint_t flags, clock_t expire_time) +{ + taskqid_t rc = TASKQID_INVALID; + taskq_ent_t *t; + unsigned long irqflags; + + ASSERT(tq); + ASSERT(func); + + spin_lock_irqsave_nested(&tq->tq_lock, irqflags, tq->tq_lock_class); + + /* Taskq being destroyed and all tasks drained */ + if (!(tq->tq_flags & TASKQ_ACTIVE)) + goto out; + + if ((t = task_alloc(tq, flags, &irqflags)) == NULL) + goto out; + + spin_lock(&t->tqent_lock); + + /* Queue to the delay list for subsequent execution */ + list_add_tail(&t->tqent_list, &tq->tq_delay_list); + + t->tqent_id = rc = tq->tq_next_id; + tq->tq_next_id++; + t->tqent_func = func; + t->tqent_arg = arg; + t->tqent_taskq = tq; + t->tqent_timer.function = task_expire; + t->tqent_timer.expires = (unsigned long)expire_time; + add_timer(&t->tqent_timer); + + ASSERT(!(t->tqent_flags & TQENT_FLAG_PREALLOC)); + + spin_unlock(&t->tqent_lock); +out: + /* Spawn additional taskq threads if required. 
*/
+ if (tq->tq_nactive == tq->tq_nthreads)
+ (void) taskq_thread_spawn(tq);
+ spin_unlock_irqrestore(&tq->tq_lock, irqflags);
+ return (rc);
+}
+EXPORT_SYMBOL(taskq_dispatch_delay);
+
+void
+taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint_t flags,
+ taskq_ent_t *t)
+{
+ unsigned long irqflags;
+ ASSERT(tq);
+ ASSERT(func);
+
+ spin_lock_irqsave_nested(&tq->tq_lock, irqflags,
+ tq->tq_lock_class);
+
+ /* Taskq being destroyed and all tasks drained */
+ if (!(tq->tq_flags & TASKQ_ACTIVE)) {
+ t->tqent_id = TASKQID_INVALID;
+ goto out;
+ }
+
+ if ((flags & TQ_NOQUEUE) && (tq->tq_nactive == tq->tq_nthreads)) {
+ /* Dynamic taskq may be able to spawn another thread */
+ if (!(tq->tq_flags & TASKQ_DYNAMIC) ||
+ taskq_thread_spawn(tq) == 0)
+ goto out2;
+ flags |= TQ_FRONT;
+ }
+
+ spin_lock(&t->tqent_lock);
+
+ /*
+ * Make sure the entry is not on some other taskq; it is important to
+ * ASSERT() under lock
+ */
+ ASSERT(taskq_empty_ent(t));
+
+ /*
+ * Mark it as a prealloc'd task. This is important
+ * to ensure that we don't free it later.
+ */
+ t->tqent_flags |= TQENT_FLAG_PREALLOC;
+
+ /* Queue to the priority list instead of the pending list */
+ if (flags & TQ_FRONT)
+ list_add_tail(&t->tqent_list, &tq->tq_prio_list);
+ else
+ list_add_tail(&t->tqent_list, &tq->tq_pend_list);
+
+ t->tqent_id = tq->tq_next_id;
+ tq->tq_next_id++;
+ t->tqent_func = func;
+ t->tqent_arg = arg;
+ t->tqent_taskq = tq;
+ t->tqent_birth = jiffies;
+
+ spin_unlock(&t->tqent_lock);
+
+ wake_up(&tq->tq_work_waitq);
+out:
+ /* Spawn additional taskq threads if required. */
+ if (tq->tq_nactive == tq->tq_nthreads)
+ (void) taskq_thread_spawn(tq);
+out2:
+ spin_unlock_irqrestore(&tq->tq_lock, irqflags);
+}
+EXPORT_SYMBOL(taskq_dispatch_ent);
+
+int
+taskq_empty_ent(taskq_ent_t *t)
+{
+ return (list_empty(&t->tqent_list));
+}
+EXPORT_SYMBOL(taskq_empty_ent);
+
+void
+taskq_init_ent(taskq_ent_t *t)
+{
+ spin_lock_init(&t->tqent_lock);
+ init_waitqueue_head(&t->tqent_waitq);
+ timer_setup(&t->tqent_timer, NULL, 0);
+ INIT_LIST_HEAD(&t->tqent_list);
+ t->tqent_id = 0;
+ t->tqent_func = NULL;
+ t->tqent_arg = NULL;
+ t->tqent_flags = 0;
+ t->tqent_taskq = NULL;
+}
+EXPORT_SYMBOL(taskq_init_ent);
+
+/*
+ * Return the next pending task, preference is given to tasks on the
+ * priority list which were dispatched with TQ_FRONT.
+ */
+static taskq_ent_t *
+taskq_next_ent(taskq_t *tq)
+{
+ struct list_head *list;
+
+ if (!list_empty(&tq->tq_prio_list))
+ list = &tq->tq_prio_list;
+ else if (!list_empty(&tq->tq_pend_list))
+ list = &tq->tq_pend_list;
+ else
+ return (NULL);
+
+ return (list_entry(list->next, taskq_ent_t, tqent_list));
+}
+
+/*
+ * Spawns a new thread for the specified taskq.
+ */
+static void
+taskq_thread_spawn_task(void *arg)
+{
+ taskq_t *tq = (taskq_t *)arg;
+ unsigned long flags;
+
+ if (taskq_thread_create(tq) == NULL) {
+ /* restore spawning count if failed */
+ spin_lock_irqsave_nested(&tq->tq_lock, flags,
+ tq->tq_lock_class);
+ tq->tq_nspawn--;
+ spin_unlock_irqrestore(&tq->tq_lock, flags);
+ }
+}
+
+/*
+ * Spawn additional threads for dynamic taskqs (TASKQ_DYNAMIC) when the
+ * current number of threads is insufficient to handle the pending tasks.
+ * These new threads must be created by the dedicated dynamic_taskq to
+ * avoid deadlocks between thread creation and memory reclaim. The
+ * system_taskq which is also a dynamic taskq cannot be safely used for
+ * this.
+ */ +static int +taskq_thread_spawn(taskq_t *tq) +{ + int spawning = 0; + + if (!(tq->tq_flags & TASKQ_DYNAMIC)) + return (0); + + if ((tq->tq_nthreads + tq->tq_nspawn < tq->tq_maxthreads) && + (tq->tq_flags & TASKQ_ACTIVE)) { + spawning = (++tq->tq_nspawn); + taskq_dispatch(dynamic_taskq, taskq_thread_spawn_task, + tq, TQ_NOSLEEP); + } + + return (spawning); +} + +/* + * Threads in a dynamic taskq should only exit once it has been completely + * drained and no other threads are actively servicing tasks. This prevents + * threads from being created and destroyed more than is required. + * + * The first thread is the thread list is treated as the primary thread. + * There is nothing special about the primary thread but in order to avoid + * all the taskq pids from changing we opt to make it long running. + */ +static int +taskq_thread_should_stop(taskq_t *tq, taskq_thread_t *tqt) +{ + if (!(tq->tq_flags & TASKQ_DYNAMIC)) + return (0); + + if (list_first_entry(&(tq->tq_thread_list), taskq_thread_t, + tqt_thread_list) == tqt) + return (0); + + return + ((tq->tq_nspawn == 0) && /* No threads are being spawned */ + (tq->tq_nactive == 0) && /* No threads are handling tasks */ + (tq->tq_nthreads > 1) && /* More than 1 thread is running */ + (!taskq_next_ent(tq)) && /* There are no pending tasks */ + (spl_taskq_thread_dynamic)); /* Dynamic taskqs are allowed */ +} + +static int +taskq_thread(void *args) +{ + DECLARE_WAITQUEUE(wait, current); + sigset_t blocked; + taskq_thread_t *tqt = args; + taskq_t *tq; + taskq_ent_t *t; + int seq_tasks = 0; + unsigned long flags; + taskq_ent_t dup_task = {}; + + ASSERT(tqt); + ASSERT(tqt->tqt_tq); + tq = tqt->tqt_tq; + current->flags |= PF_NOFREEZE; + + (void) spl_fstrans_mark(); + + sigfillset(&blocked); + sigprocmask(SIG_BLOCK, &blocked, NULL); + flush_signals(current); + kfpu_initialize(); + + tsd_set(taskq_tsd, tq); + spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class); + /* + * If we are dynamically spawned, decrease spawning count. Note that + * we could be created during taskq_create, in which case we shouldn't + * do the decrement. But it's fine because taskq_create will reset + * tq_nspawn later. + */ + if (tq->tq_flags & TASKQ_DYNAMIC) + tq->tq_nspawn--; + + /* Immediately exit if more threads than allowed were created. */ + if (tq->tq_nthreads >= tq->tq_maxthreads) + goto error; + + tq->tq_nthreads++; + list_add_tail(&tqt->tqt_thread_list, &tq->tq_thread_list); + wake_up(&tq->tq_wait_waitq); + set_current_state(TASK_INTERRUPTIBLE); + + while (!kthread_should_stop()) { + + if (list_empty(&tq->tq_pend_list) && + list_empty(&tq->tq_prio_list)) { + + if (taskq_thread_should_stop(tq, tqt)) { + wake_up_all(&tq->tq_wait_waitq); + break; + } + + add_wait_queue_exclusive(&tq->tq_work_waitq, &wait); + spin_unlock_irqrestore(&tq->tq_lock, flags); + + schedule(); + seq_tasks = 0; + + spin_lock_irqsave_nested(&tq->tq_lock, flags, + tq->tq_lock_class); + remove_wait_queue(&tq->tq_work_waitq, &wait); + } else { + __set_current_state(TASK_RUNNING); + } + + if ((t = taskq_next_ent(tq)) != NULL) { + list_del_init(&t->tqent_list); + + /* + * A TQENT_FLAG_PREALLOC task may be reused or freed + * during the task function call. Store tqent_id and + * tqent_flags here. + * + * Also use an on stack taskq_ent_t for tqt_task + * assignment in this case. We only populate the two + * fields used by the only user in taskq proc file. 
+ */ + tqt->tqt_id = t->tqent_id; + tqt->tqt_flags = t->tqent_flags; + + if (t->tqent_flags & TQENT_FLAG_PREALLOC) { + dup_task.tqent_func = t->tqent_func; + dup_task.tqent_arg = t->tqent_arg; + t = &dup_task; + } + tqt->tqt_task = t; + + taskq_insert_in_order(tq, tqt); + tq->tq_nactive++; + spin_unlock_irqrestore(&tq->tq_lock, flags); + + /* Perform the requested task */ + t->tqent_func(t->tqent_arg); + + spin_lock_irqsave_nested(&tq->tq_lock, flags, + tq->tq_lock_class); + tq->tq_nactive--; + list_del_init(&tqt->tqt_active_list); + tqt->tqt_task = NULL; + + /* For prealloc'd tasks, we don't free anything. */ + if (!(tqt->tqt_flags & TQENT_FLAG_PREALLOC)) + task_done(tq, t); + + /* + * When the current lowest outstanding taskqid is + * done calculate the new lowest outstanding id + */ + if (tq->tq_lowest_id == tqt->tqt_id) { + tq->tq_lowest_id = taskq_lowest_id(tq); + ASSERT3S(tq->tq_lowest_id, >, tqt->tqt_id); + } + + /* Spawn additional taskq threads if required. */ + if ((++seq_tasks) > spl_taskq_thread_sequential && + taskq_thread_spawn(tq)) + seq_tasks = 0; + + tqt->tqt_id = TASKQID_INVALID; + tqt->tqt_flags = 0; + wake_up_all(&tq->tq_wait_waitq); + } else { + if (taskq_thread_should_stop(tq, tqt)) + break; + } + + set_current_state(TASK_INTERRUPTIBLE); + + } + + __set_current_state(TASK_RUNNING); + tq->tq_nthreads--; + list_del_init(&tqt->tqt_thread_list); +error: + kmem_free(tqt, sizeof (taskq_thread_t)); + spin_unlock_irqrestore(&tq->tq_lock, flags); + + tsd_set(taskq_tsd, NULL); + + return (0); +} + +static taskq_thread_t * +taskq_thread_create(taskq_t *tq) +{ + static int last_used_cpu = 0; + taskq_thread_t *tqt; + + tqt = kmem_alloc(sizeof (*tqt), KM_PUSHPAGE); + INIT_LIST_HEAD(&tqt->tqt_thread_list); + INIT_LIST_HEAD(&tqt->tqt_active_list); + tqt->tqt_tq = tq; + tqt->tqt_id = TASKQID_INVALID; + + tqt->tqt_thread = spl_kthread_create(taskq_thread, tqt, + "%s", tq->tq_name); + if (tqt->tqt_thread == NULL) { + kmem_free(tqt, sizeof (taskq_thread_t)); + return (NULL); + } + + if (spl_taskq_thread_bind) { + last_used_cpu = (last_used_cpu + 1) % num_online_cpus(); + kthread_bind(tqt->tqt_thread, last_used_cpu); + } + + if (spl_taskq_thread_priority) + set_user_nice(tqt->tqt_thread, PRIO_TO_NICE(tq->tq_pri)); + + wake_up_process(tqt->tqt_thread); + + return (tqt); +} + +taskq_t * +taskq_create(const char *name, int nthreads, pri_t pri, + int minalloc, int maxalloc, uint_t flags) +{ + taskq_t *tq; + taskq_thread_t *tqt; + int count = 0, rc = 0, i; + unsigned long irqflags; + + ASSERT(name != NULL); + ASSERT(minalloc >= 0); + ASSERT(maxalloc <= INT_MAX); + ASSERT(!(flags & (TASKQ_CPR_SAFE))); /* Unsupported */ + + /* Scale the number of threads using nthreads as a percentage */ + if (flags & TASKQ_THREADS_CPU_PCT) { + ASSERT(nthreads <= 100); + ASSERT(nthreads >= 0); + nthreads = MIN(nthreads, 100); + nthreads = MAX(nthreads, 0); + nthreads = MAX((num_online_cpus() * nthreads) / 100, 1); + } + + tq = kmem_alloc(sizeof (*tq), KM_PUSHPAGE); + if (tq == NULL) + return (NULL); + + spin_lock_init(&tq->tq_lock); + INIT_LIST_HEAD(&tq->tq_thread_list); + INIT_LIST_HEAD(&tq->tq_active_list); + tq->tq_name = strdup(name); + tq->tq_nactive = 0; + tq->tq_nthreads = 0; + tq->tq_nspawn = 0; + tq->tq_maxthreads = nthreads; + tq->tq_pri = pri; + tq->tq_minalloc = minalloc; + tq->tq_maxalloc = maxalloc; + tq->tq_nalloc = 0; + tq->tq_flags = (flags | TASKQ_ACTIVE); + tq->tq_next_id = TASKQID_INITIAL; + tq->tq_lowest_id = TASKQID_INITIAL; + INIT_LIST_HEAD(&tq->tq_free_list); + 
INIT_LIST_HEAD(&tq->tq_pend_list); + INIT_LIST_HEAD(&tq->tq_prio_list); + INIT_LIST_HEAD(&tq->tq_delay_list); + init_waitqueue_head(&tq->tq_work_waitq); + init_waitqueue_head(&tq->tq_wait_waitq); + tq->tq_lock_class = TQ_LOCK_GENERAL; + INIT_LIST_HEAD(&tq->tq_taskqs); + + if (flags & TASKQ_PREPOPULATE) { + spin_lock_irqsave_nested(&tq->tq_lock, irqflags, + tq->tq_lock_class); + + for (i = 0; i < minalloc; i++) + task_done(tq, task_alloc(tq, TQ_PUSHPAGE | TQ_NEW, + &irqflags)); + + spin_unlock_irqrestore(&tq->tq_lock, irqflags); + } + + if ((flags & TASKQ_DYNAMIC) && spl_taskq_thread_dynamic) + nthreads = 1; + + for (i = 0; i < nthreads; i++) { + tqt = taskq_thread_create(tq); + if (tqt == NULL) + rc = 1; + else + count++; + } + + /* Wait for all threads to be started before potential destroy */ + wait_event(tq->tq_wait_waitq, tq->tq_nthreads == count); + /* + * taskq_thread might have touched nspawn, but we don't want them to + * because they're not dynamically spawned. So we reset it to 0 + */ + tq->tq_nspawn = 0; + + if (rc) { + taskq_destroy(tq); + tq = NULL; + } else { + down_write(&tq_list_sem); + tq->tq_instance = taskq_find_by_name(name) + 1; + list_add_tail(&tq->tq_taskqs, &tq_list); + up_write(&tq_list_sem); + } + + return (tq); +} +EXPORT_SYMBOL(taskq_create); + +void +taskq_destroy(taskq_t *tq) +{ + struct task_struct *thread; + taskq_thread_t *tqt; + taskq_ent_t *t; + unsigned long flags; + + ASSERT(tq); + spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class); + tq->tq_flags &= ~TASKQ_ACTIVE; + spin_unlock_irqrestore(&tq->tq_lock, flags); + + /* + * When TASKQ_ACTIVE is clear new tasks may not be added nor may + * new worker threads be spawned for dynamic taskq. + */ + if (dynamic_taskq != NULL) + taskq_wait_outstanding(dynamic_taskq, 0); + + taskq_wait(tq); + + /* remove taskq from global list used by the kstats */ + down_write(&tq_list_sem); + list_del(&tq->tq_taskqs); + up_write(&tq_list_sem); + + spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class); + /* wait for spawning threads to insert themselves to the list */ + while (tq->tq_nspawn) { + spin_unlock_irqrestore(&tq->tq_lock, flags); + schedule_timeout_interruptible(1); + spin_lock_irqsave_nested(&tq->tq_lock, flags, + tq->tq_lock_class); + } + + /* + * Signal each thread to exit and block until it does. Each thread + * is responsible for removing itself from the list and freeing its + * taskq_thread_t. This allows for idle threads to opt to remove + * themselves from the taskq. They can be recreated as needed. 
+ */ + while (!list_empty(&tq->tq_thread_list)) { + tqt = list_entry(tq->tq_thread_list.next, + taskq_thread_t, tqt_thread_list); + thread = tqt->tqt_thread; + spin_unlock_irqrestore(&tq->tq_lock, flags); + + kthread_stop(thread); + + spin_lock_irqsave_nested(&tq->tq_lock, flags, + tq->tq_lock_class); + } + + while (!list_empty(&tq->tq_free_list)) { + t = list_entry(tq->tq_free_list.next, taskq_ent_t, tqent_list); + + ASSERT(!(t->tqent_flags & TQENT_FLAG_PREALLOC)); + + list_del_init(&t->tqent_list); + task_free(tq, t); + } + + ASSERT0(tq->tq_nthreads); + ASSERT0(tq->tq_nalloc); + ASSERT0(tq->tq_nspawn); + ASSERT(list_empty(&tq->tq_thread_list)); + ASSERT(list_empty(&tq->tq_active_list)); + ASSERT(list_empty(&tq->tq_free_list)); + ASSERT(list_empty(&tq->tq_pend_list)); + ASSERT(list_empty(&tq->tq_prio_list)); + ASSERT(list_empty(&tq->tq_delay_list)); + + spin_unlock_irqrestore(&tq->tq_lock, flags); + + strfree(tq->tq_name); + kmem_free(tq, sizeof (taskq_t)); +} +EXPORT_SYMBOL(taskq_destroy); + + +static unsigned int spl_taskq_kick = 0; + +/* + * 2.6.36 API Change + * module_param_cb is introduced to take kernel_param_ops and + * module_param_call is marked as obsolete. Also set and get operations + * were changed to take a 'const struct kernel_param *'. + */ +static int +#ifdef module_param_cb +param_set_taskq_kick(const char *val, const struct kernel_param *kp) +#else +param_set_taskq_kick(const char *val, struct kernel_param *kp) +#endif +{ + int ret; + taskq_t *tq; + taskq_ent_t *t; + unsigned long flags; + + ret = param_set_uint(val, kp); + if (ret < 0 || !spl_taskq_kick) + return (ret); + /* reset value */ + spl_taskq_kick = 0; + + down_read(&tq_list_sem); + list_for_each_entry(tq, &tq_list, tq_taskqs) { + spin_lock_irqsave_nested(&tq->tq_lock, flags, + tq->tq_lock_class); + /* Check if the first pending is older than 5 seconds */ + t = taskq_next_ent(tq); + if (t && time_after(jiffies, t->tqent_birth + 5*HZ)) { + (void) taskq_thread_spawn(tq); + printk(KERN_INFO "spl: Kicked taskq %s/%d\n", + tq->tq_name, tq->tq_instance); + } + spin_unlock_irqrestore(&tq->tq_lock, flags); + } + up_read(&tq_list_sem); + return (ret); +} + +#ifdef module_param_cb +static const struct kernel_param_ops param_ops_taskq_kick = { + .set = param_set_taskq_kick, + .get = param_get_uint, +}; +module_param_cb(spl_taskq_kick, ¶m_ops_taskq_kick, &spl_taskq_kick, 0644); +#else +module_param_call(spl_taskq_kick, param_set_taskq_kick, param_get_uint, + &spl_taskq_kick, 0644); +#endif +MODULE_PARM_DESC(spl_taskq_kick, + "Write nonzero to kick stuck taskqs to spawn more threads"); + +int +spl_taskq_init(void) +{ + init_rwsem(&tq_list_sem); + tsd_create(&taskq_tsd, NULL); + + system_taskq = taskq_create("spl_system_taskq", MAX(boot_ncpus, 64), + maxclsyspri, boot_ncpus, INT_MAX, TASKQ_PREPOPULATE|TASKQ_DYNAMIC); + if (system_taskq == NULL) + return (1); + + system_delay_taskq = taskq_create("spl_delay_taskq", MAX(boot_ncpus, 4), + maxclsyspri, boot_ncpus, INT_MAX, TASKQ_PREPOPULATE|TASKQ_DYNAMIC); + if (system_delay_taskq == NULL) { + taskq_destroy(system_taskq); + return (1); + } + + dynamic_taskq = taskq_create("spl_dynamic_taskq", 1, + maxclsyspri, boot_ncpus, INT_MAX, TASKQ_PREPOPULATE); + if (dynamic_taskq == NULL) { + taskq_destroy(system_taskq); + taskq_destroy(system_delay_taskq); + return (1); + } + + /* + * This is used to annotate tq_lock, so + * taskq_dispatch -> taskq_thread_spawn -> taskq_dispatch + * does not trigger a lockdep warning re: possible recursive locking + */ + dynamic_taskq->tq_lock_class = 
TQ_LOCK_DYNAMIC; + + return (0); +} + +void +spl_taskq_fini(void) +{ + taskq_destroy(dynamic_taskq); + dynamic_taskq = NULL; + + taskq_destroy(system_delay_taskq); + system_delay_taskq = NULL; + + taskq_destroy(system_taskq); + system_taskq = NULL; + + tsd_destroy(&taskq_tsd); +} diff --git a/module/os/linux/spl/spl-thread.c b/module/os/linux/spl/spl-thread.c new file mode 100644 index 000000000..29de9252a --- /dev/null +++ b/module/os/linux/spl/spl-thread.c @@ -0,0 +1,163 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf <[email protected]>. + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see <http://zfsonlinux.org/>. + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see <http://www.gnu.org/licenses/>. + * + * Solaris Porting Layer (SPL) Thread Implementation. + */ + +#include <sys/thread.h> +#include <sys/kmem.h> +#include <sys/tsd.h> +#include <sys/simd.h> + +/* + * Thread interfaces + */ +typedef struct thread_priv_s { + unsigned long tp_magic; /* Magic */ + int tp_name_size; /* Name size */ + char *tp_name; /* Name (without _thread suffix) */ + void (*tp_func)(void *); /* Registered function */ + void *tp_args; /* Args to be passed to function */ + size_t tp_len; /* Len to be passed to function */ + int tp_state; /* State to start thread at */ + pri_t tp_pri; /* Priority to start threat at */ +} thread_priv_t; + +static int +thread_generic_wrapper(void *arg) +{ + thread_priv_t *tp = (thread_priv_t *)arg; + void (*func)(void *); + void *args; + + ASSERT(tp->tp_magic == TP_MAGIC); + func = tp->tp_func; + args = tp->tp_args; + set_current_state(tp->tp_state); + set_user_nice((kthread_t *)current, PRIO_TO_NICE(tp->tp_pri)); + kfpu_initialize(); + kmem_free(tp->tp_name, tp->tp_name_size); + kmem_free(tp, sizeof (thread_priv_t)); + + if (func) + func(args); + + return (0); +} + +void +__thread_exit(void) +{ + tsd_exit(); + complete_and_exit(NULL, 0); + /* Unreachable */ +} +EXPORT_SYMBOL(__thread_exit); + +/* + * thread_create() may block forever if it cannot create a thread or + * allocate memory. This is preferable to returning a NULL which Solaris + * style callers likely never check for... since it can't fail. 
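+ *
+ * A rough calling sketch; my_worker is a hypothetical void (*)(void *)
+ * function and thread_create() is the usual wrapper macro defined in the
+ * headers rather than in this file (the proc pointer is ignored by this
+ * implementation):
+ *
+ *   kthread_t *t = thread_create(NULL, 0, my_worker, arg, 0,
+ *       &p0, TS_RUN, minclsyspri);
+ *
+ * The new thread runs my_worker(arg) and is expected to finish by calling
+ * thread_exit().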
+ */ +kthread_t * +__thread_create(caddr_t stk, size_t stksize, thread_func_t func, + const char *name, void *args, size_t len, proc_t *pp, int state, pri_t pri) +{ + thread_priv_t *tp; + struct task_struct *tsk; + char *p; + + /* Option pp is simply ignored */ + /* Variable stack size unsupported */ + ASSERT(stk == NULL); + + tp = kmem_alloc(sizeof (thread_priv_t), KM_PUSHPAGE); + if (tp == NULL) + return (NULL); + + tp->tp_magic = TP_MAGIC; + tp->tp_name_size = strlen(name) + 1; + + tp->tp_name = kmem_alloc(tp->tp_name_size, KM_PUSHPAGE); + if (tp->tp_name == NULL) { + kmem_free(tp, sizeof (thread_priv_t)); + return (NULL); + } + + strncpy(tp->tp_name, name, tp->tp_name_size); + + /* + * Strip trailing "_thread" from passed name which will be the func + * name since the exposed API has no parameter for passing a name. + */ + p = strstr(tp->tp_name, "_thread"); + if (p) + p[0] = '\0'; + + tp->tp_func = func; + tp->tp_args = args; + tp->tp_len = len; + tp->tp_state = state; + tp->tp_pri = pri; + + tsk = spl_kthread_create(thread_generic_wrapper, (void *)tp, + "%s", tp->tp_name); + if (IS_ERR(tsk)) + return (NULL); + + wake_up_process(tsk); + return ((kthread_t *)tsk); +} +EXPORT_SYMBOL(__thread_create); + +/* + * spl_kthread_create - Wrapper providing pre-3.13 semantics for + * kthread_create() in which it is not killable and less likely + * to return -ENOMEM. + */ +struct task_struct * +spl_kthread_create(int (*func)(void *), void *data, const char namefmt[], ...) +{ + struct task_struct *tsk; + va_list args; + char name[TASK_COMM_LEN]; + + va_start(args, namefmt); + vsnprintf(name, sizeof (name), namefmt, args); + va_end(args); + do { + tsk = kthread_create(func, data, "%s", name); + if (IS_ERR(tsk)) { + if (signal_pending(current)) { + clear_thread_flag(TIF_SIGPENDING); + continue; + } + if (PTR_ERR(tsk) == -ENOMEM) + continue; + return (NULL); + } else { + return (tsk); + } + } while (1); +} +EXPORT_SYMBOL(spl_kthread_create); diff --git a/module/os/linux/spl/spl-tsd.c b/module/os/linux/spl/spl-tsd.c new file mode 100644 index 000000000..14342d5a6 --- /dev/null +++ b/module/os/linux/spl/spl-tsd.c @@ -0,0 +1,720 @@ +/* + * Copyright (C) 2010 Lawrence Livermore National Security, LLC. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf <[email protected]>. + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see <http://zfsonlinux.org/>. + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see <http://www.gnu.org/licenses/>. + * + * + * Solaris Porting Layer (SPL) Thread Specific Data Implementation. + * + * Thread specific data has implemented using a hash table, this avoids + * the need to add a member to the task structure and allows maximum + * portability between kernels. This implementation has been optimized + * to keep the tsd_set() and tsd_get() times as small as possible. 
+ * + * The majority of the entries in the hash table are for specific tsd + * entries. These entries are hashed by the product of their key and + * pid because by design the key and pid are guaranteed to be unique. + * Their product also has the desirable properly that it will be uniformly + * distributed over the hash bins providing neither the pid nor key is zero. + * Under linux the zero pid is always the init process and thus won't be + * used, and this implementation is careful to never to assign a zero key. + * By default the hash table is sized to 512 bins which is expected to + * be sufficient for light to moderate usage of thread specific data. + * + * The hash table contains two additional type of entries. They first + * type is entry is called a 'key' entry and it is added to the hash during + * tsd_create(). It is used to store the address of the destructor function + * and it is used as an anchor point. All tsd entries which use the same + * key will be linked to this entry. This is used during tsd_destroy() to + * quickly call the destructor function for all tsd associated with the key. + * The 'key' entry may be looked up with tsd_hash_search() by passing the + * key you wish to lookup and DTOR_PID constant as the pid. + * + * The second type of entry is called a 'pid' entry and it is added to the + * hash the first time a process set a key. The 'pid' entry is also used + * as an anchor and all tsd for the process will be linked to it. This + * list is using during tsd_exit() to ensure all registered destructors + * are run for the process. The 'pid' entry may be looked up with + * tsd_hash_search() by passing the PID_KEY constant as the key, and + * the process pid. Note that tsd_exit() is called by thread_exit() + * so if your using the Solaris thread API you should not need to call + * tsd_exit() directly. + * + */ + +#include <sys/kmem.h> +#include <sys/thread.h> +#include <sys/tsd.h> +#include <linux/hash.h> + +typedef struct tsd_hash_bin { + spinlock_t hb_lock; + struct hlist_head hb_head; +} tsd_hash_bin_t; + +typedef struct tsd_hash_table { + spinlock_t ht_lock; + uint_t ht_bits; + uint_t ht_key; + tsd_hash_bin_t *ht_bins; +} tsd_hash_table_t; + +typedef struct tsd_hash_entry { + uint_t he_key; + pid_t he_pid; + dtor_func_t he_dtor; + void *he_value; + struct hlist_node he_list; + struct list_head he_key_list; + struct list_head he_pid_list; +} tsd_hash_entry_t; + +static tsd_hash_table_t *tsd_hash_table = NULL; + + +/* + * tsd_hash_search - searches hash table for tsd_hash_entry + * @table: hash table + * @key: search key + * @pid: search pid + */ +static tsd_hash_entry_t * +tsd_hash_search(tsd_hash_table_t *table, uint_t key, pid_t pid) +{ + struct hlist_node *node; + tsd_hash_entry_t *entry; + tsd_hash_bin_t *bin; + ulong_t hash; + + hash = hash_long((ulong_t)key * (ulong_t)pid, table->ht_bits); + bin = &table->ht_bins[hash]; + spin_lock(&bin->hb_lock); + hlist_for_each(node, &bin->hb_head) { + entry = list_entry(node, tsd_hash_entry_t, he_list); + if ((entry->he_key == key) && (entry->he_pid == pid)) { + spin_unlock(&bin->hb_lock); + return (entry); + } + } + + spin_unlock(&bin->hb_lock); + return (NULL); +} + +/* + * tsd_hash_dtor - call the destructor and free all entries on the list + * @work: list of hash entries + * + * For a list of entries which have all already been removed from the + * hash call their registered destructor then free the associated memory. 
+ */ +static void +tsd_hash_dtor(struct hlist_head *work) +{ + tsd_hash_entry_t *entry; + + while (!hlist_empty(work)) { + entry = hlist_entry(work->first, tsd_hash_entry_t, he_list); + hlist_del(&entry->he_list); + + if (entry->he_dtor && entry->he_pid != DTOR_PID) + entry->he_dtor(entry->he_value); + + kmem_free(entry, sizeof (tsd_hash_entry_t)); + } +} + +/* + * tsd_hash_add - adds an entry to hash table + * @table: hash table + * @key: search key + * @pid: search pid + * + * The caller is responsible for ensuring the unique key/pid do not + * already exist in the hash table. This possible because all entries + * are thread specific thus a concurrent thread will never attempt to + * add this key/pid. Because multiple bins must be checked to add + * links to the dtor and pid entries the entire table is locked. + */ +static int +tsd_hash_add(tsd_hash_table_t *table, uint_t key, pid_t pid, void *value) +{ + tsd_hash_entry_t *entry, *dtor_entry, *pid_entry; + tsd_hash_bin_t *bin; + ulong_t hash; + int rc = 0; + + ASSERT3P(tsd_hash_search(table, key, pid), ==, NULL); + + /* New entry allocate structure, set value, and add to hash */ + entry = kmem_alloc(sizeof (tsd_hash_entry_t), KM_PUSHPAGE); + if (entry == NULL) + return (ENOMEM); + + entry->he_key = key; + entry->he_pid = pid; + entry->he_value = value; + INIT_HLIST_NODE(&entry->he_list); + INIT_LIST_HEAD(&entry->he_key_list); + INIT_LIST_HEAD(&entry->he_pid_list); + + spin_lock(&table->ht_lock); + + /* Destructor entry must exist for all valid keys */ + dtor_entry = tsd_hash_search(table, entry->he_key, DTOR_PID); + ASSERT3P(dtor_entry, !=, NULL); + entry->he_dtor = dtor_entry->he_dtor; + + /* Process entry must exist for all valid processes */ + pid_entry = tsd_hash_search(table, PID_KEY, entry->he_pid); + ASSERT3P(pid_entry, !=, NULL); + + hash = hash_long((ulong_t)key * (ulong_t)pid, table->ht_bits); + bin = &table->ht_bins[hash]; + spin_lock(&bin->hb_lock); + + /* Add to the hash, key, and pid lists */ + hlist_add_head(&entry->he_list, &bin->hb_head); + list_add(&entry->he_key_list, &dtor_entry->he_key_list); + list_add(&entry->he_pid_list, &pid_entry->he_pid_list); + + spin_unlock(&bin->hb_lock); + spin_unlock(&table->ht_lock); + + return (rc); +} + +/* + * tsd_hash_add_key - adds a destructor entry to the hash table + * @table: hash table + * @keyp: search key + * @dtor: key destructor + * + * For every unique key there is a single entry in the hash which is used + * as anchor. All other thread specific entries for this key are linked + * to this anchor via the 'he_key_list' list head. On return they keyp + * will be set to the next available key for the hash table. 
+ */ +static int +tsd_hash_add_key(tsd_hash_table_t *table, uint_t *keyp, dtor_func_t dtor) +{ + tsd_hash_entry_t *tmp_entry, *entry; + tsd_hash_bin_t *bin; + ulong_t hash; + int keys_checked = 0; + + ASSERT3P(table, !=, NULL); + + /* Allocate entry to be used as a destructor for this key */ + entry = kmem_alloc(sizeof (tsd_hash_entry_t), KM_PUSHPAGE); + if (entry == NULL) + return (ENOMEM); + + /* Determine next available key value */ + spin_lock(&table->ht_lock); + do { + /* Limited to TSD_KEYS_MAX concurrent unique keys */ + if (table->ht_key++ > TSD_KEYS_MAX) + table->ht_key = 1; + + /* Ensure failure when all TSD_KEYS_MAX keys are in use */ + if (keys_checked++ >= TSD_KEYS_MAX) { + spin_unlock(&table->ht_lock); + return (ENOENT); + } + + tmp_entry = tsd_hash_search(table, table->ht_key, DTOR_PID); + } while (tmp_entry); + + /* Add destructor entry in to hash table */ + entry->he_key = *keyp = table->ht_key; + entry->he_pid = DTOR_PID; + entry->he_dtor = dtor; + entry->he_value = NULL; + INIT_HLIST_NODE(&entry->he_list); + INIT_LIST_HEAD(&entry->he_key_list); + INIT_LIST_HEAD(&entry->he_pid_list); + + hash = hash_long((ulong_t)*keyp * (ulong_t)DTOR_PID, table->ht_bits); + bin = &table->ht_bins[hash]; + spin_lock(&bin->hb_lock); + + hlist_add_head(&entry->he_list, &bin->hb_head); + + spin_unlock(&bin->hb_lock); + spin_unlock(&table->ht_lock); + + return (0); +} + +/* + * tsd_hash_add_pid - adds a process entry to the hash table + * @table: hash table + * @pid: search pid + * + * For every process there is a single entry in the hash which is used + * as anchor. All other thread specific entries for this process are + * linked to this anchor via the 'he_pid_list' list head. + */ +static int +tsd_hash_add_pid(tsd_hash_table_t *table, pid_t pid) +{ + tsd_hash_entry_t *entry; + tsd_hash_bin_t *bin; + ulong_t hash; + + /* Allocate entry to be used as the process reference */ + entry = kmem_alloc(sizeof (tsd_hash_entry_t), KM_PUSHPAGE); + if (entry == NULL) + return (ENOMEM); + + spin_lock(&table->ht_lock); + entry->he_key = PID_KEY; + entry->he_pid = pid; + entry->he_dtor = NULL; + entry->he_value = NULL; + INIT_HLIST_NODE(&entry->he_list); + INIT_LIST_HEAD(&entry->he_key_list); + INIT_LIST_HEAD(&entry->he_pid_list); + + hash = hash_long((ulong_t)PID_KEY * (ulong_t)pid, table->ht_bits); + bin = &table->ht_bins[hash]; + spin_lock(&bin->hb_lock); + + hlist_add_head(&entry->he_list, &bin->hb_head); + + spin_unlock(&bin->hb_lock); + spin_unlock(&table->ht_lock); + + return (0); +} + +/* + * tsd_hash_del - delete an entry from hash table, key, and pid lists + * @table: hash table + * @key: search key + * @pid: search pid + */ +static void +tsd_hash_del(tsd_hash_table_t *table, tsd_hash_entry_t *entry) +{ + hlist_del(&entry->he_list); + list_del_init(&entry->he_key_list); + list_del_init(&entry->he_pid_list); +} + +/* + * tsd_hash_table_init - allocate a hash table + * @bits: hash table size + * + * A hash table with 2^bits bins will be created, it may not be resized + * after the fact and must be free'd with tsd_hash_table_fini(). 
+ */ +static tsd_hash_table_t * +tsd_hash_table_init(uint_t bits) +{ + tsd_hash_table_t *table; + int hash, size = (1 << bits); + + table = kmem_zalloc(sizeof (tsd_hash_table_t), KM_SLEEP); + if (table == NULL) + return (NULL); + + table->ht_bins = kmem_zalloc(sizeof (tsd_hash_bin_t) * size, KM_SLEEP); + if (table->ht_bins == NULL) { + kmem_free(table, sizeof (tsd_hash_table_t)); + return (NULL); + } + + for (hash = 0; hash < size; hash++) { + spin_lock_init(&table->ht_bins[hash].hb_lock); + INIT_HLIST_HEAD(&table->ht_bins[hash].hb_head); + } + + spin_lock_init(&table->ht_lock); + table->ht_bits = bits; + table->ht_key = 1; + + return (table); +} + +/* + * tsd_hash_table_fini - free a hash table + * @table: hash table + * + * Free a hash table allocated by tsd_hash_table_init(). If the hash + * table is not empty this function will call the proper destructor for + * all remaining entries before freeing the memory used by those entries. + */ +static void +tsd_hash_table_fini(tsd_hash_table_t *table) +{ + HLIST_HEAD(work); + tsd_hash_bin_t *bin; + tsd_hash_entry_t *entry; + int size, i; + + ASSERT3P(table, !=, NULL); + spin_lock(&table->ht_lock); + for (i = 0, size = (1 << table->ht_bits); i < size; i++) { + bin = &table->ht_bins[i]; + spin_lock(&bin->hb_lock); + while (!hlist_empty(&bin->hb_head)) { + entry = hlist_entry(bin->hb_head.first, + tsd_hash_entry_t, he_list); + tsd_hash_del(table, entry); + hlist_add_head(&entry->he_list, &work); + } + spin_unlock(&bin->hb_lock); + } + spin_unlock(&table->ht_lock); + + tsd_hash_dtor(&work); + kmem_free(table->ht_bins, sizeof (tsd_hash_bin_t)*(1<<table->ht_bits)); + kmem_free(table, sizeof (tsd_hash_table_t)); +} + +/* + * tsd_remove_entry - remove a tsd entry for this thread + * @entry: entry to remove + * + * Remove the thread specific data @entry for this thread. + * If this is the last entry for this thread, also remove the PID entry. + */ +static void +tsd_remove_entry(tsd_hash_entry_t *entry) +{ + HLIST_HEAD(work); + tsd_hash_table_t *table; + tsd_hash_entry_t *pid_entry; + tsd_hash_bin_t *pid_entry_bin, *entry_bin; + ulong_t hash; + + table = tsd_hash_table; + ASSERT3P(table, !=, NULL); + ASSERT3P(entry, !=, NULL); + + spin_lock(&table->ht_lock); + + hash = hash_long((ulong_t)entry->he_key * + (ulong_t)entry->he_pid, table->ht_bits); + entry_bin = &table->ht_bins[hash]; + + /* save the possible pid_entry */ + pid_entry = list_entry(entry->he_pid_list.next, tsd_hash_entry_t, + he_pid_list); + + /* remove entry */ + spin_lock(&entry_bin->hb_lock); + tsd_hash_del(table, entry); + hlist_add_head(&entry->he_list, &work); + spin_unlock(&entry_bin->hb_lock); + + /* if pid_entry is indeed pid_entry, then remove it if it's empty */ + if (pid_entry->he_key == PID_KEY && + list_empty(&pid_entry->he_pid_list)) { + hash = hash_long((ulong_t)pid_entry->he_key * + (ulong_t)pid_entry->he_pid, table->ht_bits); + pid_entry_bin = &table->ht_bins[hash]; + + spin_lock(&pid_entry_bin->hb_lock); + tsd_hash_del(table, pid_entry); + hlist_add_head(&pid_entry->he_list, &work); + spin_unlock(&pid_entry_bin->hb_lock); + } + + spin_unlock(&table->ht_lock); + + tsd_hash_dtor(&work); +} + +/* + * tsd_set - set thread specific data + * @key: lookup key + * @value: value to set + * + * Caller must prevent racing tsd_create() or tsd_destroy(), protected + * from racing tsd_get() or tsd_set() because it is thread specific. + * This function has been optimized to be fast for the update case. 
+ * When setting the tsd initially it will be slower due to additional + * required locking and potential memory allocations. + */ +int +tsd_set(uint_t key, void *value) +{ + tsd_hash_table_t *table; + tsd_hash_entry_t *entry; + pid_t pid; + int rc; + /* mark remove if value is NULL */ + boolean_t remove = (value == NULL); + + table = tsd_hash_table; + pid = curthread->pid; + ASSERT3P(table, !=, NULL); + + if ((key == 0) || (key > TSD_KEYS_MAX)) + return (EINVAL); + + /* Entry already exists in hash table update value */ + entry = tsd_hash_search(table, key, pid); + if (entry) { + entry->he_value = value; + /* remove the entry */ + if (remove) + tsd_remove_entry(entry); + return (0); + } + + /* don't create entry if value is NULL */ + if (remove) + return (0); + + /* Add a process entry to the hash if not yet exists */ + entry = tsd_hash_search(table, PID_KEY, pid); + if (entry == NULL) { + rc = tsd_hash_add_pid(table, pid); + if (rc) + return (rc); + } + + rc = tsd_hash_add(table, key, pid, value); + return (rc); +} +EXPORT_SYMBOL(tsd_set); + +/* + * tsd_get - get thread specific data + * @key: lookup key + * + * Caller must prevent racing tsd_create() or tsd_destroy(). This + * implementation is designed to be fast and scalable, it does not + * lock the entire table only a single hash bin. + */ +void * +tsd_get(uint_t key) +{ + tsd_hash_entry_t *entry; + + ASSERT3P(tsd_hash_table, !=, NULL); + + if ((key == 0) || (key > TSD_KEYS_MAX)) + return (NULL); + + entry = tsd_hash_search(tsd_hash_table, key, curthread->pid); + if (entry == NULL) + return (NULL); + + return (entry->he_value); +} +EXPORT_SYMBOL(tsd_get); + +/* + * tsd_get_by_thread - get thread specific data for specified thread + * @key: lookup key + * @thread: thread to lookup + * + * Caller must prevent racing tsd_create() or tsd_destroy(). This + * implementation is designed to be fast and scalable, it does not + * lock the entire table only a single hash bin. + */ +void * +tsd_get_by_thread(uint_t key, kthread_t *thread) +{ + tsd_hash_entry_t *entry; + + ASSERT3P(tsd_hash_table, !=, NULL); + + if ((key == 0) || (key > TSD_KEYS_MAX)) + return (NULL); + + entry = tsd_hash_search(tsd_hash_table, key, thread->pid); + if (entry == NULL) + return (NULL); + + return (entry->he_value); +} +EXPORT_SYMBOL(tsd_get_by_thread); + +/* + * tsd_create - create thread specific data key + * @keyp: lookup key address + * @dtor: destructor called during tsd_destroy() or tsd_exit() + * + * Provided key must be set to 0 or it assumed to be already in use. + * The dtor is allowed to be NULL in which case no additional cleanup + * for the data is performed during tsd_destroy() or tsd_exit(). + * + * Caller must prevent racing tsd_set() or tsd_get(), this function is + * safe from racing tsd_create(), tsd_destroy(), and tsd_exit(). + */ +void +tsd_create(uint_t *keyp, dtor_func_t dtor) +{ + ASSERT3P(keyp, !=, NULL); + if (*keyp) + return; + + (void) tsd_hash_add_key(tsd_hash_table, keyp, dtor); +} +EXPORT_SYMBOL(tsd_create); + +/* + * tsd_destroy - destroy thread specific data + * @keyp: lookup key address + * + * Destroys the thread specific data on all threads which use this key. + * + * Caller must prevent racing tsd_set() or tsd_get(), this function is + * safe from racing tsd_create(), tsd_destroy(), and tsd_exit(). 
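+ *
+ * Taken together with tsd_create(), tsd_set() and tsd_get(), a typical
+ * lifecycle looks roughly like this (my_key, my_dtor and value are
+ * hypothetical):
+ *
+ *   static uint_t my_key = 0;
+ *
+ *   tsd_create(&my_key, my_dtor);    - once, e.g. at module init
+ *   tsd_set(my_key, value);          - in each interested thread
+ *   value = tsd_get(my_key);         - in the same thread
+ *   tsd_set(my_key, NULL);           - drops this thread's entry
+ *   tsd_destroy(&my_key);            - once, at module exit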
+ */ +void +tsd_destroy(uint_t *keyp) +{ + HLIST_HEAD(work); + tsd_hash_table_t *table; + tsd_hash_entry_t *dtor_entry, *entry; + tsd_hash_bin_t *dtor_entry_bin, *entry_bin; + ulong_t hash; + + table = tsd_hash_table; + ASSERT3P(table, !=, NULL); + + spin_lock(&table->ht_lock); + dtor_entry = tsd_hash_search(table, *keyp, DTOR_PID); + if (dtor_entry == NULL) { + spin_unlock(&table->ht_lock); + return; + } + + /* + * All threads which use this key must be linked off of the + * DTOR_PID entry. They are removed from the hash table and + * linked in to a private working list to be destroyed. + */ + while (!list_empty(&dtor_entry->he_key_list)) { + entry = list_entry(dtor_entry->he_key_list.next, + tsd_hash_entry_t, he_key_list); + ASSERT3U(dtor_entry->he_key, ==, entry->he_key); + ASSERT3P(dtor_entry->he_dtor, ==, entry->he_dtor); + + hash = hash_long((ulong_t)entry->he_key * + (ulong_t)entry->he_pid, table->ht_bits); + entry_bin = &table->ht_bins[hash]; + + spin_lock(&entry_bin->hb_lock); + tsd_hash_del(table, entry); + hlist_add_head(&entry->he_list, &work); + spin_unlock(&entry_bin->hb_lock); + } + + hash = hash_long((ulong_t)dtor_entry->he_key * + (ulong_t)dtor_entry->he_pid, table->ht_bits); + dtor_entry_bin = &table->ht_bins[hash]; + + spin_lock(&dtor_entry_bin->hb_lock); + tsd_hash_del(table, dtor_entry); + hlist_add_head(&dtor_entry->he_list, &work); + spin_unlock(&dtor_entry_bin->hb_lock); + spin_unlock(&table->ht_lock); + + tsd_hash_dtor(&work); + *keyp = 0; +} +EXPORT_SYMBOL(tsd_destroy); + +/* + * tsd_exit - destroys all thread specific data for this thread + * + * Destroys all the thread specific data for this thread. + * + * Caller must prevent racing tsd_set() or tsd_get(), this function is + * safe from racing tsd_create(), tsd_destroy(), and tsd_exit(). + */ +void +tsd_exit(void) +{ + HLIST_HEAD(work); + tsd_hash_table_t *table; + tsd_hash_entry_t *pid_entry, *entry; + tsd_hash_bin_t *pid_entry_bin, *entry_bin; + ulong_t hash; + + table = tsd_hash_table; + ASSERT3P(table, !=, NULL); + + spin_lock(&table->ht_lock); + pid_entry = tsd_hash_search(table, PID_KEY, curthread->pid); + if (pid_entry == NULL) { + spin_unlock(&table->ht_lock); + return; + } + + /* + * All keys associated with this pid must be linked off of the + * PID_KEY entry. They are removed from the hash table and + * linked in to a private working list to be destroyed. 
+ */ + + while (!list_empty(&pid_entry->he_pid_list)) { + entry = list_entry(pid_entry->he_pid_list.next, + tsd_hash_entry_t, he_pid_list); + ASSERT3U(pid_entry->he_pid, ==, entry->he_pid); + + hash = hash_long((ulong_t)entry->he_key * + (ulong_t)entry->he_pid, table->ht_bits); + entry_bin = &table->ht_bins[hash]; + + spin_lock(&entry_bin->hb_lock); + tsd_hash_del(table, entry); + hlist_add_head(&entry->he_list, &work); + spin_unlock(&entry_bin->hb_lock); + } + + hash = hash_long((ulong_t)pid_entry->he_key * + (ulong_t)pid_entry->he_pid, table->ht_bits); + pid_entry_bin = &table->ht_bins[hash]; + + spin_lock(&pid_entry_bin->hb_lock); + tsd_hash_del(table, pid_entry); + hlist_add_head(&pid_entry->he_list, &work); + spin_unlock(&pid_entry_bin->hb_lock); + spin_unlock(&table->ht_lock); + + tsd_hash_dtor(&work); +} +EXPORT_SYMBOL(tsd_exit); + +int +spl_tsd_init(void) +{ + tsd_hash_table = tsd_hash_table_init(TSD_HASH_TABLE_BITS_DEFAULT); + if (tsd_hash_table == NULL) + return (1); + + return (0); +} + +void +spl_tsd_fini(void) +{ + tsd_hash_table_fini(tsd_hash_table); + tsd_hash_table = NULL; +} diff --git a/module/os/linux/spl/spl-vmem.c b/module/os/linux/spl/spl-vmem.c new file mode 100644 index 000000000..e1a84a911 --- /dev/null +++ b/module/os/linux/spl/spl-vmem.c @@ -0,0 +1,135 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf <[email protected]>. + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see <http://zfsonlinux.org/>. + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <sys/debug.h> +#include <sys/vmem.h> +#include <sys/kmem_cache.h> +#include <sys/shrinker.h> +#include <linux/module.h> + +vmem_t *heap_arena = NULL; +EXPORT_SYMBOL(heap_arena); + +vmem_t *zio_alloc_arena = NULL; +EXPORT_SYMBOL(zio_alloc_arena); + +vmem_t *zio_arena = NULL; +EXPORT_SYMBOL(zio_arena); + +#define VMEM_FLOOR_SIZE (4 * 1024 * 1024) /* 4MB floor */ + +/* + * Return approximate virtual memory usage based on these assumptions: + * + * 1) The major SPL consumer of virtual memory is the kmem cache. + * 2) Memory allocated with vmem_alloc() is short lived and can be ignored. + * 3) Allow a 4MB floor as a generous pad given normal consumption. + * 4) The spl_kmem_cache_sem only contends with cache create/destroy. 
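+ *
+ * With those assumptions, a caller passing both VMEM_ALLOC and VMEM_FREE
+ * is given the total virtual address space (VMALLOC_TOTAL), VMEM_ALLOC
+ * alone returns the 4MB floor plus the space used by vmem-backed kmem
+ * caches (capped at VMALLOC_TOTAL), and VMEM_FREE alone returns the
+ * remainder.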
+ */ +size_t +vmem_size(vmem_t *vmp, int typemask) +{ + spl_kmem_cache_t *skc; + size_t alloc = VMEM_FLOOR_SIZE; + + if ((typemask & VMEM_ALLOC) && (typemask & VMEM_FREE)) + return (VMALLOC_TOTAL); + + + down_read(&spl_kmem_cache_sem); + list_for_each_entry(skc, &spl_kmem_cache_list, skc_list) { + if (skc->skc_flags & KMC_VMEM) + alloc += skc->skc_slab_size * skc->skc_slab_total; + } + up_read(&spl_kmem_cache_sem); + + if (typemask & VMEM_ALLOC) + return (MIN(alloc, VMALLOC_TOTAL)); + else if (typemask & VMEM_FREE) + return (MAX(VMALLOC_TOTAL - alloc, 0)); + else + return (0); +} +EXPORT_SYMBOL(vmem_size); + +/* + * Public vmem_alloc(), vmem_zalloc() and vmem_free() interfaces. + */ +void * +spl_vmem_alloc(size_t size, int flags, const char *func, int line) +{ + ASSERT0(flags & ~KM_PUBLIC_MASK); + + flags |= KM_VMEM; + +#if !defined(DEBUG_KMEM) + return (spl_kmem_alloc_impl(size, flags, NUMA_NO_NODE)); +#elif !defined(DEBUG_KMEM_TRACKING) + return (spl_kmem_alloc_debug(size, flags, NUMA_NO_NODE)); +#else + return (spl_kmem_alloc_track(size, flags, func, line, NUMA_NO_NODE)); +#endif +} +EXPORT_SYMBOL(spl_vmem_alloc); + +void * +spl_vmem_zalloc(size_t size, int flags, const char *func, int line) +{ + ASSERT0(flags & ~KM_PUBLIC_MASK); + + flags |= (KM_VMEM | KM_ZERO); + +#if !defined(DEBUG_KMEM) + return (spl_kmem_alloc_impl(size, flags, NUMA_NO_NODE)); +#elif !defined(DEBUG_KMEM_TRACKING) + return (spl_kmem_alloc_debug(size, flags, NUMA_NO_NODE)); +#else + return (spl_kmem_alloc_track(size, flags, func, line, NUMA_NO_NODE)); +#endif +} +EXPORT_SYMBOL(spl_vmem_zalloc); + +void +spl_vmem_free(const void *buf, size_t size) +{ +#if !defined(DEBUG_KMEM) + return (spl_kmem_free_impl(buf, size)); +#elif !defined(DEBUG_KMEM_TRACKING) + return (spl_kmem_free_debug(buf, size)); +#else + return (spl_kmem_free_track(buf, size)); +#endif +} +EXPORT_SYMBOL(spl_vmem_free); + +int +spl_vmem_init(void) +{ + return (0); +} + +void +spl_vmem_fini(void) +{ +} diff --git a/module/os/linux/spl/spl-vnode.c b/module/os/linux/spl/spl-vnode.c new file mode 100644 index 000000000..d9056c964 --- /dev/null +++ b/module/os/linux/spl/spl-vnode.c @@ -0,0 +1,719 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf <[email protected]>. + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see <http://zfsonlinux.org/>. + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see <http://www.gnu.org/licenses/>. + * + * Solaris Porting Layer (SPL) Vnode Implementation. 
+ */ + +#include <sys/cred.h> +#include <sys/vnode.h> +#include <sys/kmem_cache.h> +#include <linux/falloc.h> +#include <linux/fs.h> +#include <linux/uaccess.h> +#ifdef HAVE_FDTABLE_HEADER +#include <linux/fdtable.h> +#endif + +vnode_t *rootdir = (vnode_t *)0xabcd1234; +EXPORT_SYMBOL(rootdir); + +static spl_kmem_cache_t *vn_cache; +static spl_kmem_cache_t *vn_file_cache; + +static spinlock_t vn_file_lock; +static LIST_HEAD(vn_file_list); + +static int +spl_filp_fallocate(struct file *fp, int mode, loff_t offset, loff_t len) +{ + int error = -EOPNOTSUPP; + +#ifdef HAVE_FILE_FALLOCATE + if (fp->f_op->fallocate) + error = fp->f_op->fallocate(fp, mode, offset, len); +#else +#ifdef HAVE_INODE_FALLOCATE + if (fp->f_dentry && fp->f_dentry->d_inode && + fp->f_dentry->d_inode->i_op->fallocate) + error = fp->f_dentry->d_inode->i_op->fallocate( + fp->f_dentry->d_inode, mode, offset, len); +#endif /* HAVE_INODE_FALLOCATE */ +#endif /* HAVE_FILE_FALLOCATE */ + + return (error); +} + +static int +spl_filp_fsync(struct file *fp, int sync) +{ +#ifdef HAVE_2ARGS_VFS_FSYNC + return (vfs_fsync(fp, sync)); +#else + return (vfs_fsync(fp, (fp)->f_dentry, sync)); +#endif /* HAVE_2ARGS_VFS_FSYNC */ +} + +static ssize_t +spl_kernel_write(struct file *file, const void *buf, size_t count, loff_t *pos) +{ +#if defined(HAVE_KERNEL_WRITE_PPOS) + return (kernel_write(file, buf, count, pos)); +#else + mm_segment_t saved_fs; + ssize_t ret; + + saved_fs = get_fs(); + set_fs(KERNEL_DS); + + ret = vfs_write(file, (__force const char __user *)buf, count, pos); + + set_fs(saved_fs); + + return (ret); +#endif +} + +static ssize_t +spl_kernel_read(struct file *file, void *buf, size_t count, loff_t *pos) +{ +#if defined(HAVE_KERNEL_READ_PPOS) + return (kernel_read(file, buf, count, pos)); +#else + mm_segment_t saved_fs; + ssize_t ret; + + saved_fs = get_fs(); + set_fs(KERNEL_DS); + + ret = vfs_read(file, (void __user *)buf, count, pos); + + set_fs(saved_fs); + + return (ret); +#endif +} + +vtype_t +vn_mode_to_vtype(mode_t mode) +{ + if (S_ISREG(mode)) + return (VREG); + + if (S_ISDIR(mode)) + return (VDIR); + + if (S_ISCHR(mode)) + return (VCHR); + + if (S_ISBLK(mode)) + return (VBLK); + + if (S_ISFIFO(mode)) + return (VFIFO); + + if (S_ISLNK(mode)) + return (VLNK); + + if (S_ISSOCK(mode)) + return (VSOCK); + + return (VNON); +} /* vn_mode_to_vtype() */ +EXPORT_SYMBOL(vn_mode_to_vtype); + +mode_t +vn_vtype_to_mode(vtype_t vtype) +{ + if (vtype == VREG) + return (S_IFREG); + + if (vtype == VDIR) + return (S_IFDIR); + + if (vtype == VCHR) + return (S_IFCHR); + + if (vtype == VBLK) + return (S_IFBLK); + + if (vtype == VFIFO) + return (S_IFIFO); + + if (vtype == VLNK) + return (S_IFLNK); + + if (vtype == VSOCK) + return (S_IFSOCK); + + return (VNON); +} /* vn_vtype_to_mode() */ +EXPORT_SYMBOL(vn_vtype_to_mode); + +vnode_t * +vn_alloc(int flag) +{ + vnode_t *vp; + + vp = kmem_cache_alloc(vn_cache, flag); + if (vp != NULL) { + vp->v_file = NULL; + vp->v_type = 0; + } + + return (vp); +} /* vn_alloc() */ +EXPORT_SYMBOL(vn_alloc); + +void +vn_free(vnode_t *vp) +{ + kmem_cache_free(vn_cache, vp); +} /* vn_free() */ +EXPORT_SYMBOL(vn_free); + +int +vn_open(const char *path, uio_seg_t seg, int flags, int mode, vnode_t **vpp, + int x1, void *x2) +{ + struct file *fp; + struct kstat stat; + int rc, saved_umask = 0; + gfp_t saved_gfp; + vnode_t *vp; + + ASSERT(flags & (FWRITE | FREAD)); + ASSERT(seg == UIO_SYSSPACE); + ASSERT(vpp); + *vpp = NULL; + + if (!(flags & FCREAT) && (flags & FWRITE)) + flags |= FEXCL; + + /* + * Note for filp_open() 
the two low bits must be remapped to mean: + * 01 - read-only -> 00 read-only + * 10 - write-only -> 01 write-only + * 11 - read-write -> 10 read-write + */ + flags--; + + if (flags & FCREAT) + saved_umask = xchg(¤t->fs->umask, 0); + + fp = filp_open(path, flags, mode); + + if (flags & FCREAT) + (void) xchg(¤t->fs->umask, saved_umask); + + if (IS_ERR(fp)) + return (-PTR_ERR(fp)); + +#if defined(HAVE_4ARGS_VFS_GETATTR) + rc = vfs_getattr(&fp->f_path, &stat, STATX_TYPE, AT_STATX_SYNC_AS_STAT); +#elif defined(HAVE_2ARGS_VFS_GETATTR) + rc = vfs_getattr(&fp->f_path, &stat); +#else + rc = vfs_getattr(fp->f_path.mnt, fp->f_dentry, &stat); +#endif + if (rc) { + filp_close(fp, 0); + return (-rc); + } + + vp = vn_alloc(KM_SLEEP); + if (!vp) { + filp_close(fp, 0); + return (ENOMEM); + } + + saved_gfp = mapping_gfp_mask(fp->f_mapping); + mapping_set_gfp_mask(fp->f_mapping, saved_gfp & ~(__GFP_IO|__GFP_FS)); + + mutex_enter(&vp->v_lock); + vp->v_type = vn_mode_to_vtype(stat.mode); + vp->v_file = fp; + vp->v_gfp_mask = saved_gfp; + *vpp = vp; + mutex_exit(&vp->v_lock); + + return (0); +} /* vn_open() */ +EXPORT_SYMBOL(vn_open); + +int +vn_openat(const char *path, uio_seg_t seg, int flags, int mode, + vnode_t **vpp, int x1, void *x2, vnode_t *vp, int fd) +{ + char *realpath; + int len, rc; + + ASSERT(vp == rootdir); + + len = strlen(path) + 2; + realpath = kmalloc(len, kmem_flags_convert(KM_SLEEP)); + if (!realpath) + return (ENOMEM); + + (void) snprintf(realpath, len, "/%s", path); + rc = vn_open(realpath, seg, flags, mode, vpp, x1, x2); + kfree(realpath); + + return (rc); +} /* vn_openat() */ +EXPORT_SYMBOL(vn_openat); + +int +vn_rdwr(uio_rw_t uio, vnode_t *vp, void *addr, ssize_t len, offset_t off, + uio_seg_t seg, int ioflag, rlim64_t x2, void *x3, ssize_t *residp) +{ + struct file *fp = vp->v_file; + loff_t offset = off; + int rc; + + ASSERT(uio == UIO_WRITE || uio == UIO_READ); + ASSERT(seg == UIO_SYSSPACE); + ASSERT((ioflag & ~FAPPEND) == 0); + + if (ioflag & FAPPEND) + offset = fp->f_pos; + + if (uio & UIO_WRITE) + rc = spl_kernel_write(fp, addr, len, &offset); + else + rc = spl_kernel_read(fp, addr, len, &offset); + + fp->f_pos = offset; + + if (rc < 0) + return (-rc); + + if (residp) { + *residp = len - rc; + } else { + if (rc != len) + return (EIO); + } + + return (0); +} /* vn_rdwr() */ +EXPORT_SYMBOL(vn_rdwr); + +int +vn_close(vnode_t *vp, int flags, int x1, int x2, void *x3, void *x4) +{ + int rc; + + ASSERT(vp); + ASSERT(vp->v_file); + + mapping_set_gfp_mask(vp->v_file->f_mapping, vp->v_gfp_mask); + rc = filp_close(vp->v_file, 0); + vn_free(vp); + + return (-rc); +} /* vn_close() */ +EXPORT_SYMBOL(vn_close); + +/* + * vn_seek() does not actually seek it only performs bounds checking on the + * proposed seek. We perform minimal checking and allow vn_rdwr() to catch + * anything more serious. + */ +int +vn_seek(vnode_t *vp, offset_t ooff, offset_t *noffp, void *ct) +{ + return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? 
EINVAL : 0); +} +EXPORT_SYMBOL(vn_seek); + +int +vn_getattr(vnode_t *vp, vattr_t *vap, int flags, void *x3, void *x4) +{ + struct file *fp; + struct kstat stat; + int rc; + + ASSERT(vp); + ASSERT(vp->v_file); + ASSERT(vap); + + fp = vp->v_file; + +#if defined(HAVE_4ARGS_VFS_GETATTR) + rc = vfs_getattr(&fp->f_path, &stat, STATX_BASIC_STATS, + AT_STATX_SYNC_AS_STAT); +#elif defined(HAVE_2ARGS_VFS_GETATTR) + rc = vfs_getattr(&fp->f_path, &stat); +#else + rc = vfs_getattr(fp->f_path.mnt, fp->f_dentry, &stat); +#endif + if (rc) + return (-rc); + + vap->va_type = vn_mode_to_vtype(stat.mode); + vap->va_mode = stat.mode; + vap->va_uid = KUID_TO_SUID(stat.uid); + vap->va_gid = KGID_TO_SGID(stat.gid); + vap->va_fsid = 0; + vap->va_nodeid = stat.ino; + vap->va_nlink = stat.nlink; + vap->va_size = stat.size; + vap->va_blksize = stat.blksize; + vap->va_atime = stat.atime; + vap->va_mtime = stat.mtime; + vap->va_ctime = stat.ctime; + vap->va_rdev = stat.rdev; + vap->va_nblocks = stat.blocks; + + return (0); +} +EXPORT_SYMBOL(vn_getattr); + +int +vn_fsync(vnode_t *vp, int flags, void *x3, void *x4) +{ + int datasync = 0; + int error; + int fstrans; + + ASSERT(vp); + ASSERT(vp->v_file); + + if (flags & FDSYNC) + datasync = 1; + + /* + * May enter XFS which generates a warning when PF_FSTRANS is set. + * To avoid this the flag is cleared over vfs_sync() and then reset. + */ + fstrans = __spl_pf_fstrans_check(); + if (fstrans) + current->flags &= ~(__SPL_PF_FSTRANS); + + error = -spl_filp_fsync(vp->v_file, datasync); + if (fstrans) + current->flags |= __SPL_PF_FSTRANS; + + return (error); +} /* vn_fsync() */ +EXPORT_SYMBOL(vn_fsync); + +int vn_space(vnode_t *vp, int cmd, struct flock *bfp, int flag, + offset_t offset, void *x6, void *x7) +{ + int error = EOPNOTSUPP; +#ifdef FALLOC_FL_PUNCH_HOLE + int fstrans; +#endif + + if (cmd != F_FREESP || bfp->l_whence != SEEK_SET) + return (EOPNOTSUPP); + + ASSERT(vp); + ASSERT(vp->v_file); + ASSERT(bfp->l_start >= 0 && bfp->l_len > 0); + +#ifdef FALLOC_FL_PUNCH_HOLE + /* + * May enter XFS which generates a warning when PF_FSTRANS is set. + * To avoid this the flag is cleared over vfs_sync() and then reset. + */ + fstrans = __spl_pf_fstrans_check(); + if (fstrans) + current->flags &= ~(__SPL_PF_FSTRANS); + + /* + * When supported by the underlying file system preferentially + * use the fallocate() callback to preallocate the space. + */ + error = -spl_filp_fallocate(vp->v_file, + FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, + bfp->l_start, bfp->l_len); + + if (fstrans) + current->flags |= __SPL_PF_FSTRANS; + + if (error == 0) + return (0); +#endif + +#ifdef HAVE_INODE_TRUNCATE_RANGE + if (vp->v_file->f_dentry && vp->v_file->f_dentry->d_inode && + vp->v_file->f_dentry->d_inode->i_op && + vp->v_file->f_dentry->d_inode->i_op->truncate_range) { + off_t end = bfp->l_start + bfp->l_len; + /* + * Judging from the code in shmem_truncate_range(), + * it seems the kernel expects the end offset to be + * inclusive and aligned to the end of a page. 
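+ *
+ * For example, with 4KB pages a request of l_start = 0 and l_len = 8192
+ * becomes truncate_range(inode, 0, 8191): the exclusive end (8192) is
+ * already page aligned so it is simply decremented to make it inclusive,
+ * while an unaligned end is first rounded down to the preceding page
+ * boundary.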
+ */ + if (end % PAGE_SIZE != 0) { + end &= ~(off_t)(PAGE_SIZE - 1); + if (end <= bfp->l_start) + return (0); + } + --end; + + vp->v_file->f_dentry->d_inode->i_op->truncate_range( + vp->v_file->f_dentry->d_inode, bfp->l_start, end); + + return (0); + } +#endif + + return (error); +} +EXPORT_SYMBOL(vn_space); + +/* Function must be called while holding the vn_file_lock */ +static file_t * +file_find(int fd, struct task_struct *task) +{ + file_t *fp; + + list_for_each_entry(fp, &vn_file_list, f_list) { + if (fd == fp->f_fd && fp->f_task == task) { + ASSERT(atomic_read(&fp->f_ref) != 0); + return (fp); + } + } + + return (NULL); +} /* file_find() */ + +file_t * +vn_getf(int fd) +{ + struct kstat stat; + struct file *lfp; + file_t *fp; + vnode_t *vp; + int rc = 0; + + if (fd < 0) + return (NULL); + + /* Already open just take an extra reference */ + spin_lock(&vn_file_lock); + + fp = file_find(fd, current); + if (fp) { + lfp = fget(fd); + fput(fp->f_file); + /* + * areleasef() can cause us to see a stale reference when + * userspace has reused a file descriptor before areleasef() + * has run. fput() the stale reference and replace it. We + * retain the original reference count such that the concurrent + * areleasef() will decrement its reference and terminate. + */ + if (lfp != fp->f_file) { + fp->f_file = lfp; + fp->f_vnode->v_file = lfp; + } + atomic_inc(&fp->f_ref); + spin_unlock(&vn_file_lock); + return (fp); + } + + spin_unlock(&vn_file_lock); + + /* File was not yet opened create the object and setup */ + fp = kmem_cache_alloc(vn_file_cache, KM_SLEEP); + if (fp == NULL) + goto out; + + mutex_enter(&fp->f_lock); + + fp->f_fd = fd; + fp->f_task = current; + fp->f_offset = 0; + atomic_inc(&fp->f_ref); + + lfp = fget(fd); + if (lfp == NULL) + goto out_mutex; + + vp = vn_alloc(KM_SLEEP); + if (vp == NULL) + goto out_fget; + +#if defined(HAVE_4ARGS_VFS_GETATTR) + rc = vfs_getattr(&lfp->f_path, &stat, STATX_TYPE, + AT_STATX_SYNC_AS_STAT); +#elif defined(HAVE_2ARGS_VFS_GETATTR) + rc = vfs_getattr(&lfp->f_path, &stat); +#else + rc = vfs_getattr(lfp->f_path.mnt, lfp->f_dentry, &stat); +#endif + if (rc) + goto out_vnode; + + mutex_enter(&vp->v_lock); + vp->v_type = vn_mode_to_vtype(stat.mode); + vp->v_file = lfp; + mutex_exit(&vp->v_lock); + + fp->f_vnode = vp; + fp->f_file = lfp; + + /* Put it on the tracking list */ + spin_lock(&vn_file_lock); + list_add(&fp->f_list, &vn_file_list); + spin_unlock(&vn_file_lock); + + mutex_exit(&fp->f_lock); + return (fp); + +out_vnode: + vn_free(vp); +out_fget: + fput(lfp); +out_mutex: + mutex_exit(&fp->f_lock); + kmem_cache_free(vn_file_cache, fp); +out: + return (NULL); +} /* getf() */ +EXPORT_SYMBOL(getf); + +static void releasef_locked(file_t *fp) +{ + ASSERT(fp->f_file); + ASSERT(fp->f_vnode); + + /* Unlinked from list, no refs, safe to free outside mutex */ + fput(fp->f_file); + vn_free(fp->f_vnode); + + kmem_cache_free(vn_file_cache, fp); +} + +void +vn_releasef(int fd) +{ + areleasef(fd, P_FINFO(current)); +} +EXPORT_SYMBOL(releasef); + +void +vn_areleasef(int fd, uf_info_t *fip) +{ + file_t *fp; + struct task_struct *task = (struct task_struct *)fip; + + if (fd < 0) + return; + + spin_lock(&vn_file_lock); + fp = file_find(fd, task); + if (fp) { + atomic_dec(&fp->f_ref); + if (atomic_read(&fp->f_ref) > 0) { + spin_unlock(&vn_file_lock); + return; + } + + list_del(&fp->f_list); + releasef_locked(fp); + } + spin_unlock(&vn_file_lock); +} /* releasef() */ +EXPORT_SYMBOL(areleasef); + +static int +vn_cache_constructor(void *buf, void *cdrarg, int kmflags) +{ 
+ struct vnode *vp = buf; + + mutex_init(&vp->v_lock, NULL, MUTEX_DEFAULT, NULL); + + return (0); +} /* vn_cache_constructor() */ + +static void +vn_cache_destructor(void *buf, void *cdrarg) +{ + struct vnode *vp = buf; + + mutex_destroy(&vp->v_lock); +} /* vn_cache_destructor() */ + +static int +vn_file_cache_constructor(void *buf, void *cdrarg, int kmflags) +{ + file_t *fp = buf; + + atomic_set(&fp->f_ref, 0); + mutex_init(&fp->f_lock, NULL, MUTEX_DEFAULT, NULL); + INIT_LIST_HEAD(&fp->f_list); + + return (0); +} /* vn_file_cache_constructor() */ + +static void +vn_file_cache_destructor(void *buf, void *cdrarg) +{ + file_t *fp = buf; + + mutex_destroy(&fp->f_lock); +} /* vn_file_cache_destructor() */ + +int +spl_vn_init(void) +{ + spin_lock_init(&vn_file_lock); + + vn_cache = kmem_cache_create("spl_vn_cache", + sizeof (struct vnode), 64, vn_cache_constructor, + vn_cache_destructor, NULL, NULL, NULL, 0); + + vn_file_cache = kmem_cache_create("spl_vn_file_cache", + sizeof (file_t), 64, vn_file_cache_constructor, + vn_file_cache_destructor, NULL, NULL, NULL, 0); + + return (0); +} /* spl_vn_init() */ + +void +spl_vn_fini(void) +{ + file_t *fp, *next_fp; + int leaked = 0; + + spin_lock(&vn_file_lock); + + list_for_each_entry_safe(fp, next_fp, &vn_file_list, f_list) { + list_del(&fp->f_list); + releasef_locked(fp); + leaked++; + } + + spin_unlock(&vn_file_lock); + + if (leaked > 0) + printk(KERN_WARNING "WARNING: %d vnode files leaked\n", leaked); + + kmem_cache_destroy(vn_file_cache); + kmem_cache_destroy(vn_cache); +} /* spl_vn_fini() */ diff --git a/module/os/linux/spl/spl-xdr.c b/module/os/linux/spl/spl-xdr.c new file mode 100644 index 000000000..1dd31ffc1 --- /dev/null +++ b/module/os/linux/spl/spl-xdr.c @@ -0,0 +1,513 @@ +/* + * Copyright (c) 2008-2010 Sun Microsystems, Inc. + * Written by Ricardo Correia <[email protected]> + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see <http://zfsonlinux.org/>. + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see <http://www.gnu.org/licenses/>. + * + * Solaris Porting Layer (SPL) XDR Implementation. + */ + +#include <linux/string.h> +#include <sys/kmem.h> +#include <sys/debug.h> +#include <sys/types.h> +#include <sys/sysmacros.h> +#include <rpc/xdr.h> + +/* + * SPL's XDR mem implementation. + * + * This is used by libnvpair to serialize/deserialize the name-value pair data + * structures into byte arrays in a well-defined and portable manner. + * + * These data structures are used by the DMU/ZFS to flexibly manipulate various + * information in memory and later serialize it/deserialize it to disk. + * Examples of usages include the pool configuration, lists of pool and dataset + * properties, etc. + * + * Reference documentation for the XDR representation and XDR operations can be + * found in RFC 1832 and xdr(3), respectively. 
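+ *
+ * Consumers normally reach this code through libnvpair, but at this level
+ * an encode/decode round trip of a single unsigned int (buffer size and
+ * variable names are illustrative) looks roughly like:
+ *
+ *   char buf[16];
+ *   XDR xdrs;
+ *   unsigned int in = 42, out;
+ *
+ *   xdrmem_create(&xdrs, buf, sizeof (buf), XDR_ENCODE);
+ *   (void) xdrs.x_ops->xdr_u_int(&xdrs, &in);
+ *
+ *   xdrmem_create(&xdrs, buf, sizeof (buf), XDR_DECODE);
+ *   (void) xdrs.x_ops->xdr_u_int(&xdrs, &out);     now out == 42
+ *
+ * Each operation returns TRUE on success and FALSE once the stream runs
+ * out of space or the data fails validation.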
+ * + * === Implementation shortcomings === + * + * It is assumed that the following C types have the following sizes: + * + * char/unsigned char: 1 byte + * short/unsigned short: 2 bytes + * int/unsigned int: 4 bytes + * longlong_t/u_longlong_t: 8 bytes + * + * The C standard allows these types to be larger (and in the case of ints, + * shorter), so if that is the case on some compiler/architecture, the build + * will fail (on purpose). + * + * If someone wants to fix the code to work properly on such environments, then: + * + * 1) Preconditions should be added to xdrmem_enc functions to make sure the + * caller doesn't pass arguments which exceed the expected range. + * 2) Functions which take signed integers should be changed to properly do + * sign extension. + * 3) For ints with less than 32 bits, well.. I suspect you'll have bigger + * problems than this implementation. + * + * It is also assumed that: + * + * 1) Chars have 8 bits. + * 2) We can always do 32-bit-aligned int memory accesses and byte-aligned + * memcpy, memset and memcmp. + * 3) Arrays passed to xdr_array() are packed and the compiler/architecture + * supports element-sized-aligned memory accesses. + * 4) Negative integers are natively stored in two's complement binary + * representation. + * + * No checks are done for the 4 assumptions above, though. + * + * === Caller expectations === + * + * Existing documentation does not describe the semantics of XDR operations very + * well. Therefore, some assumptions about failure semantics will be made and + * will be described below: + * + * 1) If any encoding operation fails (e.g., due to lack of buffer space), the + * the stream should be considered valid only up to the encoding operation + * previous to the one that first failed. However, the stream size as returned + * by xdr_control() cannot be considered to be strictly correct (it may be + * bigger). + * + * Putting it another way, if there is an encoding failure it's undefined + * whether anything is added to the stream in that operation and therefore + * neither xdr_control() nor future encoding operations on the same stream can + * be relied upon to produce correct results. + * + * 2) If a decoding operation fails, it's undefined whether anything will be + * decoded into passed buffers/pointers during that operation, or what the + * values on those buffers will look like. + * + * Future decoding operations on the same stream will also have similar + * undefined behavior. + * + * 3) When the first decoding operation fails it is OK to trust the results of + * previous decoding operations on the same stream, as long as the caller + * expects a failure to be possible (e.g. due to end-of-stream). + * + * However, this is highly discouraged because the caller should know the + * stream size and should be coded to expect any decoding failure to be data + * corruption due to hardware, accidental or even malicious causes, which should + * be handled gracefully in all cases. + * + * In very rare situations where there are strong reasons to believe the data + * can be trusted to be valid and non-tampered with, then the caller may assume + * a decoding failure to be a bug (e.g. due to mismatched data types) and may + * fail non-gracefully. + * + * 4) Non-zero padding bytes will cause the decoding operation to fail. + * + * 5) Zero bytes on string types will also cause the decoding operation to fail. 
+ * + * 6) It is assumed that either the pointer to the stream buffer given by the + * caller is 32-bit aligned or the architecture supports non-32-bit-aligned int + * memory accesses. + * + * 7) The stream buffer and encoding/decoding buffers/ptrs should not overlap. + * + * 8) If a caller passes pointers to non-kernel memory (e.g., pointers to user + * space or MMIO space), the computer may explode. + */ + +static struct xdr_ops xdrmem_encode_ops; +static struct xdr_ops xdrmem_decode_ops; + +void +xdrmem_create(XDR *xdrs, const caddr_t addr, const uint_t size, + const enum xdr_op op) +{ + switch (op) { + case XDR_ENCODE: + xdrs->x_ops = &xdrmem_encode_ops; + break; + case XDR_DECODE: + xdrs->x_ops = &xdrmem_decode_ops; + break; + default: + xdrs->x_ops = NULL; /* Let the caller know we failed */ + return; + } + + xdrs->x_op = op; + xdrs->x_addr = addr; + xdrs->x_addr_end = addr + size; + + if (xdrs->x_addr_end < xdrs->x_addr) { + xdrs->x_ops = NULL; + } +} +EXPORT_SYMBOL(xdrmem_create); + +static bool_t +xdrmem_control(XDR *xdrs, int req, void *info) +{ + struct xdr_bytesrec *rec = (struct xdr_bytesrec *)info; + + if (req != XDR_GET_BYTES_AVAIL) + return (FALSE); + + rec->xc_is_last_record = TRUE; /* always TRUE in xdrmem streams */ + rec->xc_num_avail = xdrs->x_addr_end - xdrs->x_addr; + + return (TRUE); +} + +static bool_t +xdrmem_enc_bytes(XDR *xdrs, caddr_t cp, const uint_t cnt) +{ + uint_t size = roundup(cnt, 4); + uint_t pad; + + if (size < cnt) + return (FALSE); /* Integer overflow */ + + if (xdrs->x_addr > xdrs->x_addr_end) + return (FALSE); + + if (xdrs->x_addr_end - xdrs->x_addr < size) + return (FALSE); + + memcpy(xdrs->x_addr, cp, cnt); + + xdrs->x_addr += cnt; + + pad = size - cnt; + if (pad > 0) { + memset(xdrs->x_addr, 0, pad); + xdrs->x_addr += pad; + } + + return (TRUE); +} + +static bool_t +xdrmem_dec_bytes(XDR *xdrs, caddr_t cp, const uint_t cnt) +{ + static uint32_t zero = 0; + uint_t size = roundup(cnt, 4); + uint_t pad; + + if (size < cnt) + return (FALSE); /* Integer overflow */ + + if (xdrs->x_addr > xdrs->x_addr_end) + return (FALSE); + + if (xdrs->x_addr_end - xdrs->x_addr < size) + return (FALSE); + + memcpy(cp, xdrs->x_addr, cnt); + xdrs->x_addr += cnt; + + pad = size - cnt; + if (pad > 0) { + /* An inverted memchr() would be useful here... */ + if (memcmp(&zero, xdrs->x_addr, pad) != 0) + return (FALSE); + + xdrs->x_addr += pad; + } + + return (TRUE); +} + +static bool_t +xdrmem_enc_uint32(XDR *xdrs, uint32_t val) +{ + if (xdrs->x_addr + sizeof (uint32_t) > xdrs->x_addr_end) + return (FALSE); + + *((uint32_t *)xdrs->x_addr) = cpu_to_be32(val); + + xdrs->x_addr += sizeof (uint32_t); + + return (TRUE); +} + +static bool_t +xdrmem_dec_uint32(XDR *xdrs, uint32_t *val) +{ + if (xdrs->x_addr + sizeof (uint32_t) > xdrs->x_addr_end) + return (FALSE); + + *val = be32_to_cpu(*((uint32_t *)xdrs->x_addr)); + + xdrs->x_addr += sizeof (uint32_t); + + return (TRUE); +} + +static bool_t +xdrmem_enc_char(XDR *xdrs, char *cp) +{ + uint32_t val; + + BUILD_BUG_ON(sizeof (char) != 1); + val = *((unsigned char *) cp); + + return (xdrmem_enc_uint32(xdrs, val)); +} + +static bool_t +xdrmem_dec_char(XDR *xdrs, char *cp) +{ + uint32_t val; + + BUILD_BUG_ON(sizeof (char) != 1); + + if (!xdrmem_dec_uint32(xdrs, &val)) + return (FALSE); + + /* + * If any of the 3 other bytes are non-zero then val will be greater + * than 0xff and we fail because according to the RFC, this block does + * not have a char encoded in it. 
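+ *
+ * For example, the character 'A' (0x41) arrives as the four bytes
+ * 00 00 00 41 and decodes to 0x41, while a block such as 00 00 01 41
+ * yields val == 0x141 and is rejected.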
+ */ + if (val > 0xff) + return (FALSE); + + *((unsigned char *) cp) = val; + + return (TRUE); +} + +static bool_t +xdrmem_enc_ushort(XDR *xdrs, unsigned short *usp) +{ + BUILD_BUG_ON(sizeof (unsigned short) != 2); + + return (xdrmem_enc_uint32(xdrs, *usp)); +} + +static bool_t +xdrmem_dec_ushort(XDR *xdrs, unsigned short *usp) +{ + uint32_t val; + + BUILD_BUG_ON(sizeof (unsigned short) != 2); + + if (!xdrmem_dec_uint32(xdrs, &val)) + return (FALSE); + + /* + * Short ints are not in the RFC, but we assume similar logic as in + * xdrmem_dec_char(). + */ + if (val > 0xffff) + return (FALSE); + + *usp = val; + + return (TRUE); +} + +static bool_t +xdrmem_enc_uint(XDR *xdrs, unsigned *up) +{ + BUILD_BUG_ON(sizeof (unsigned) != 4); + + return (xdrmem_enc_uint32(xdrs, *up)); +} + +static bool_t +xdrmem_dec_uint(XDR *xdrs, unsigned *up) +{ + BUILD_BUG_ON(sizeof (unsigned) != 4); + + return (xdrmem_dec_uint32(xdrs, (uint32_t *)up)); +} + +static bool_t +xdrmem_enc_ulonglong(XDR *xdrs, u_longlong_t *ullp) +{ + BUILD_BUG_ON(sizeof (u_longlong_t) != 8); + + if (!xdrmem_enc_uint32(xdrs, *ullp >> 32)) + return (FALSE); + + return (xdrmem_enc_uint32(xdrs, *ullp & 0xffffffff)); +} + +static bool_t +xdrmem_dec_ulonglong(XDR *xdrs, u_longlong_t *ullp) +{ + uint32_t low, high; + + BUILD_BUG_ON(sizeof (u_longlong_t) != 8); + + if (!xdrmem_dec_uint32(xdrs, &high)) + return (FALSE); + if (!xdrmem_dec_uint32(xdrs, &low)) + return (FALSE); + + *ullp = ((u_longlong_t)high << 32) | low; + + return (TRUE); +} + +static bool_t +xdr_enc_array(XDR *xdrs, caddr_t *arrp, uint_t *sizep, const uint_t maxsize, + const uint_t elsize, const xdrproc_t elproc) +{ + uint_t i; + caddr_t addr = *arrp; + + if (*sizep > maxsize || *sizep > UINT_MAX / elsize) + return (FALSE); + + if (!xdrmem_enc_uint(xdrs, sizep)) + return (FALSE); + + for (i = 0; i < *sizep; i++) { + if (!elproc(xdrs, addr)) + return (FALSE); + addr += elsize; + } + + return (TRUE); +} + +static bool_t +xdr_dec_array(XDR *xdrs, caddr_t *arrp, uint_t *sizep, const uint_t maxsize, + const uint_t elsize, const xdrproc_t elproc) +{ + uint_t i, size; + bool_t alloc = FALSE; + caddr_t addr; + + if (!xdrmem_dec_uint(xdrs, sizep)) + return (FALSE); + + size = *sizep; + + if (size > maxsize || size > UINT_MAX / elsize) + return (FALSE); + + /* + * The Solaris man page says: "If *arrp is NULL when decoding, + * xdr_array() allocates memory and *arrp points to it". + */ + if (*arrp == NULL) { + BUILD_BUG_ON(sizeof (uint_t) > sizeof (size_t)); + + *arrp = kmem_alloc(size * elsize, KM_NOSLEEP); + if (*arrp == NULL) + return (FALSE); + + alloc = TRUE; + } + + addr = *arrp; + + for (i = 0; i < size; i++) { + if (!elproc(xdrs, addr)) { + if (alloc) + kmem_free(*arrp, size * elsize); + return (FALSE); + } + addr += elsize; + } + + return (TRUE); +} + +static bool_t +xdr_enc_string(XDR *xdrs, char **sp, const uint_t maxsize) +{ + size_t slen = strlen(*sp); + uint_t len; + + if (slen > maxsize) + return (FALSE); + + len = slen; + + if (!xdrmem_enc_uint(xdrs, &len)) + return (FALSE); + + return (xdrmem_enc_bytes(xdrs, *sp, len)); +} + +static bool_t +xdr_dec_string(XDR *xdrs, char **sp, const uint_t maxsize) +{ + uint_t size; + bool_t alloc = FALSE; + + if (!xdrmem_dec_uint(xdrs, &size)) + return (FALSE); + + if (size > maxsize || size > UINT_MAX - 1) + return (FALSE); + + /* + * Solaris man page: "If *sp is NULL when decoding, xdr_string() + * allocates memory and *sp points to it". 
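+ *
+ * For example, the string "abc" occupies eight bytes on the wire:
+ * 00 00 00 03 'a' 'b' 'c' 00 (a 4-byte length, the bytes themselves,
+ * and zero padding up to a 4-byte boundary). Decoding it with *sp == NULL
+ * allocates size + 1 bytes so the result can be returned NUL-terminated.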
+ */ + if (*sp == NULL) { + BUILD_BUG_ON(sizeof (uint_t) > sizeof (size_t)); + + *sp = kmem_alloc(size + 1, KM_NOSLEEP); + if (*sp == NULL) + return (FALSE); + + alloc = TRUE; + } + + if (!xdrmem_dec_bytes(xdrs, *sp, size)) + goto fail; + + if (memchr(*sp, 0, size) != NULL) + goto fail; + + (*sp)[size] = '\0'; + + return (TRUE); + +fail: + if (alloc) + kmem_free(*sp, size + 1); + + return (FALSE); +} + +static struct xdr_ops xdrmem_encode_ops = { + .xdr_control = xdrmem_control, + .xdr_char = xdrmem_enc_char, + .xdr_u_short = xdrmem_enc_ushort, + .xdr_u_int = xdrmem_enc_uint, + .xdr_u_longlong_t = xdrmem_enc_ulonglong, + .xdr_opaque = xdrmem_enc_bytes, + .xdr_string = xdr_enc_string, + .xdr_array = xdr_enc_array +}; + +static struct xdr_ops xdrmem_decode_ops = { + .xdr_control = xdrmem_control, + .xdr_char = xdrmem_dec_char, + .xdr_u_short = xdrmem_dec_ushort, + .xdr_u_int = xdrmem_dec_uint, + .xdr_u_longlong_t = xdrmem_dec_ulonglong, + .xdr_opaque = xdrmem_dec_bytes, + .xdr_string = xdr_dec_string, + .xdr_array = xdr_dec_array +}; diff --git a/module/os/linux/spl/spl-zlib.c b/module/os/linux/spl/spl-zlib.c new file mode 100644 index 000000000..62423343c --- /dev/null +++ b/module/os/linux/spl/spl-zlib.c @@ -0,0 +1,217 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf <[email protected]>. + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see <http://zfsonlinux.org/>. + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see <http://www.gnu.org/licenses/>. + * + * + * z_compress_level/z_uncompress are nearly identical copies of the + * compress2/uncompress functions provided by the official zlib package + * available at http://zlib.net/. The only changes made we to slightly + * adapt the functions called to match the linux kernel implementation + * of zlib. The full zlib license follows: + * + * zlib.h -- interface of the 'zlib' general purpose compression library + * version 1.2.5, April 19th, 2010 + * + * Copyright (C) 1995-2010 Jean-loup Gailly and Mark Adler + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. 
Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + * + * Jean-loup Gailly + * Mark Adler + */ + + +#include <sys/kmem.h> +#include <sys/kmem_cache.h> +#include <sys/zmod.h> + +static spl_kmem_cache_t *zlib_workspace_cache; + +/* + * A kmem_cache is used for the zlib workspaces to avoid having to vmalloc + * and vfree for every call. Using a kmem_cache also has the advantage + * that improves the odds that the memory used will be local to this cpu. + * To further improve things it might be wise to create a dedicated per-cpu + * workspace for use. This would take some additional care because we then + * must disable preemption around the critical section, and verify that + * zlib_deflate* and zlib_inflate* never internally call schedule(). + */ +static void * +zlib_workspace_alloc(int flags) +{ + return (kmem_cache_alloc(zlib_workspace_cache, flags & ~(__GFP_FS))); +} + +static void +zlib_workspace_free(void *workspace) +{ + kmem_cache_free(zlib_workspace_cache, workspace); +} + +/* + * Compresses the source buffer into the destination buffer. The level + * parameter has the same meaning as in deflateInit. sourceLen is the byte + * length of the source buffer. Upon entry, destLen is the total size of the + * destination buffer, which must be at least 0.1% larger than sourceLen plus + * 12 bytes. Upon exit, destLen is the actual size of the compressed buffer. + * + * compress2 returns Z_OK if success, Z_MEM_ERROR if there was not enough + * memory, Z_BUF_ERROR if there was not enough room in the output buffer, + * Z_STREAM_ERROR if the level parameter is invalid. + */ +int +z_compress_level(void *dest, size_t *destLen, const void *source, + size_t sourceLen, int level) +{ + z_stream stream; + int err; + + stream.next_in = (Byte *)source; + stream.avail_in = (uInt)sourceLen; + stream.next_out = dest; + stream.avail_out = (uInt)*destLen; + + if ((size_t)stream.avail_out != *destLen) + return (Z_BUF_ERROR); + + stream.workspace = zlib_workspace_alloc(KM_SLEEP); + if (!stream.workspace) + return (Z_MEM_ERROR); + + err = zlib_deflateInit(&stream, level); + if (err != Z_OK) { + zlib_workspace_free(stream.workspace); + return (err); + } + + err = zlib_deflate(&stream, Z_FINISH); + if (err != Z_STREAM_END) { + zlib_deflateEnd(&stream); + zlib_workspace_free(stream.workspace); + return (err == Z_OK ? Z_BUF_ERROR : err); + } + *destLen = stream.total_out; + + err = zlib_deflateEnd(&stream); + zlib_workspace_free(stream.workspace); + + return (err); +} +EXPORT_SYMBOL(z_compress_level); + +/* + * Decompresses the source buffer into the destination buffer. sourceLen is + * the byte length of the source buffer. Upon entry, destLen is the total + * size of the destination buffer, which must be large enough to hold the + * entire uncompressed data. (The size of the uncompressed data must have + * been saved previously by the compressor and transmitted to the decompressor + * by some mechanism outside the scope of this compression library.) + * Upon exit, destLen is the actual size of the compressed buffer. + * This function can be used to decompress a whole file at once if the + * input file is mmap'ed. + * + * uncompress returns Z_OK if success, Z_MEM_ERROR if there was not + * enough memory, Z_BUF_ERROR if there was not enough room in the output + * buffer, or Z_DATA_ERROR if the input data was corrupted. 
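+ *
+ * A typical round trip through both helpers (buffer sizes here are
+ * illustrative and must be chosen by the caller) looks roughly like:
+ *
+ *   size_t clen = src_len + (src_len >> 9) + 12;   at least 0.1% + 12 bytes
+ *   size_t dlen = src_len;
+ *
+ *   err = z_compress_level(cbuf, &clen, src, src_len, 6);
+ *   ...
+ *   err = z_uncompress(dst, &dlen, cbuf, clen);
+ *
+ * On success clen and dlen are updated to the compressed and uncompressed
+ * sizes actually produced.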
+ */ +int +z_uncompress(void *dest, size_t *destLen, const void *source, size_t sourceLen) +{ + z_stream stream; + int err; + + stream.next_in = (Byte *)source; + stream.avail_in = (uInt)sourceLen; + stream.next_out = dest; + stream.avail_out = (uInt)*destLen; + + if ((size_t)stream.avail_out != *destLen) + return (Z_BUF_ERROR); + + stream.workspace = zlib_workspace_alloc(KM_SLEEP); + if (!stream.workspace) + return (Z_MEM_ERROR); + + err = zlib_inflateInit(&stream); + if (err != Z_OK) { + zlib_workspace_free(stream.workspace); + return (err); + } + + err = zlib_inflate(&stream, Z_FINISH); + if (err != Z_STREAM_END) { + zlib_inflateEnd(&stream); + zlib_workspace_free(stream.workspace); + + if (err == Z_NEED_DICT || + (err == Z_BUF_ERROR && stream.avail_in == 0)) + return (Z_DATA_ERROR); + + return (err); + } + *destLen = stream.total_out; + + err = zlib_inflateEnd(&stream); + zlib_workspace_free(stream.workspace); + + return (err); +} +EXPORT_SYMBOL(z_uncompress); + +int +spl_zlib_init(void) +{ + int size; + + size = MAX(spl_zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL), + zlib_inflate_workspacesize()); + + zlib_workspace_cache = kmem_cache_create( + "spl_zlib_workspace_cache", + size, 0, NULL, NULL, NULL, NULL, NULL, + KMC_VMEM); + if (!zlib_workspace_cache) + return (1); + + return (0); +} + +void +spl_zlib_fini(void) +{ + kmem_cache_destroy(zlib_workspace_cache); + zlib_workspace_cache = NULL; +} diff --git a/module/os/linux/zfs/Makefile.in b/module/os/linux/zfs/Makefile.in new file mode 100644 index 000000000..84900bd2c --- /dev/null +++ b/module/os/linux/zfs/Makefile.in @@ -0,0 +1,34 @@ +# +# Linux specific sources included from module/zfs/Makefile.in +# + +# Suppress unused-value warnings in sparc64 architecture headers +ifeq ($(target_cpu),sparc64) +ccflags-y += -Wno-unused-value +endif + +ccflags-y += -I@abs_top_srcdir@/module/os/linux/zfs + +$(MODULE)-objs += ../os/linux/zfs/abd.o +$(MODULE)-objs += ../os/linux/zfs/policy.o +$(MODULE)-objs += ../os/linux/zfs/qat.o +$(MODULE)-objs += ../os/linux/zfs/qat_compress.o +$(MODULE)-objs += ../os/linux/zfs/qat_crypt.o +$(MODULE)-objs += ../os/linux/zfs/spa_stats.o +$(MODULE)-objs += ../os/linux/zfs/vdev_disk.o +$(MODULE)-objs += ../os/linux/zfs/vdev_file.o +$(MODULE)-objs += ../os/linux/zfs/zfs_acl.o +$(MODULE)-objs += ../os/linux/zfs/zfs_ctldir.o +$(MODULE)-objs += ../os/linux/zfs/zfs_debug.o +$(MODULE)-objs += ../os/linux/zfs/zfs_dir.o +$(MODULE)-objs += ../os/linux/zfs/zfs_sysfs.o +$(MODULE)-objs += ../os/linux/zfs/zfs_vfsops.o +$(MODULE)-objs += ../os/linux/zfs/zfs_vnops.o +$(MODULE)-objs += ../os/linux/zfs/zfs_znode.o +$(MODULE)-objs += ../os/linux/zfs/zio_crypt.o +$(MODULE)-objs += ../os/linux/zfs/zpl_ctldir.o +$(MODULE)-objs += ../os/linux/zfs/zpl_export.o +$(MODULE)-objs += ../os/linux/zfs/zpl_file.o +$(MODULE)-objs += ../os/linux/zfs/zpl_inode.o +$(MODULE)-objs += ../os/linux/zfs/zpl_super.o +$(MODULE)-objs += ../os/linux/zfs/zpl_xattr.o diff --git a/module/os/linux/zfs/abd.c b/module/os/linux/zfs/abd.c new file mode 100644 index 000000000..ac6b0b742 --- /dev/null +++ b/module/os/linux/zfs/abd.c @@ -0,0 +1,1638 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. 
+ * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2014 by Chunwei Chen. All rights reserved. + * Copyright (c) 2019 by Delphix. All rights reserved. + */ + +/* + * ARC buffer data (ABD). + * + * ABDs are an abstract data structure for the ARC which can use two + * different ways of storing the underlying data: + * + * (a) Linear buffer. In this case, all the data in the ABD is stored in one + * contiguous buffer in memory (from a zio_[data_]buf_* kmem cache). + * + * +-------------------+ + * | ABD (linear) | + * | abd_flags = ... | + * | abd_size = ... | +--------------------------------+ + * | abd_buf ------------->| raw buffer of size abd_size | + * +-------------------+ +--------------------------------+ + * no abd_chunks + * + * (b) Scattered buffer. In this case, the data in the ABD is split into + * equal-sized chunks (from the abd_chunk_cache kmem_cache), with pointers + * to the chunks recorded in an array at the end of the ABD structure. + * + * +-------------------+ + * | ABD (scattered) | + * | abd_flags = ... | + * | abd_size = ... | + * | abd_offset = 0 | +-----------+ + * | abd_chunks[0] ----------------------------->| chunk 0 | + * | abd_chunks[1] ---------------------+ +-----------+ + * | ... | | +-----------+ + * | abd_chunks[N-1] ---------+ +------->| chunk 1 | + * +-------------------+ | +-----------+ + * | ... + * | +-----------+ + * +----------------->| chunk N-1 | + * +-----------+ + * + * Linear buffers act exactly like normal buffers and are always mapped into the + * kernel's virtual memory space, while scattered ABD data chunks are allocated + * as physical pages and then mapped in only while they are actually being + * accessed through one of the abd_* library functions. Using scattered ABDs + * provides several benefits: + * + * (1) They avoid use of kmem_*, preventing performance problems where running + * kmem_reap on very large memory systems never finishes and causes + * constant TLB shootdowns. + * + * (2) Fragmentation is less of an issue since when we are at the limit of + * allocatable space, we won't have to search around for a long free + * hole in the VA space for large ARC allocations. Each chunk is mapped in + * individually, so even if we are using HIGHMEM (see next point) we + * wouldn't need to worry about finding a contiguous address range. + * + * (3) If we are not using HIGHMEM, then all physical memory is always + * mapped into the kernel's address space, so we also avoid the map / + * unmap costs on each ABD access. + * + * If we are not using HIGHMEM, scattered buffers which have only one chunk + * can be treated as linear buffers, because they are contiguous in the + * kernel's virtual address space. See abd_alloc_pages() for details. + * + * It is possible to make all ABDs linear by setting zfs_abd_scatter_enabled to + * B_FALSE. + * + * In addition to directly allocating a linear or scattered ABD, it is also + * possible to create an ABD by requesting the "sub-ABD" starting at an offset + * within an existing ABD. 
In linear buffers this is simple (set abd_buf of + * the new ABD to the starting point within the original raw buffer), but + * scattered ABDs are a little more complex. The new ABD makes a copy of the + * relevant abd_chunks pointers (but not the underlying data). However, to + * provide arbitrary rather than only chunk-aligned starting offsets, it also + * tracks an abd_offset field which represents the starting point of the data + * within the first chunk in abd_chunks. For both linear and scattered ABDs, + * creating an offset ABD marks the original ABD as the offset's parent, and the + * original ABD's abd_children refcount is incremented. This data allows us to + * ensure the root ABD isn't deleted before its children. + * + * Most consumers should never need to know what type of ABD they're using -- + * the ABD public API ensures that it's possible to transparently switch from + * using a linear ABD to a scattered one when doing so would be beneficial. + * + * If you need to use the data within an ABD directly, if you know it's linear + * (because you allocated it) you can use abd_to_buf() to access the underlying + * raw buffer. Otherwise, you should use one of the abd_borrow_buf* functions + * which will allocate a raw buffer if necessary. Use the abd_return_buf* + * functions to return any raw buffers that are no longer necessary when you're + * done using them. + * + * There are a variety of ABD APIs that implement basic buffer operations: + * compare, copy, read, write, and fill with zeroes. If you need a custom + * function which progressively accesses the whole ABD, use the abd_iterate_* + * functions. + */ + +#include <sys/abd.h> +#include <sys/param.h> +#include <sys/zio.h> +#include <sys/zfs_context.h> +#include <sys/zfs_znode.h> +#ifdef _KERNEL +#include <linux/scatterlist.h> +#include <linux/kmap_compat.h> +#else +#define MAX_ORDER 1 +#endif + +typedef struct abd_stats { + kstat_named_t abdstat_struct_size; + kstat_named_t abdstat_linear_cnt; + kstat_named_t abdstat_linear_data_size; + kstat_named_t abdstat_scatter_cnt; + kstat_named_t abdstat_scatter_data_size; + kstat_named_t abdstat_scatter_chunk_waste; + kstat_named_t abdstat_scatter_orders[MAX_ORDER]; + kstat_named_t abdstat_scatter_page_multi_chunk; + kstat_named_t abdstat_scatter_page_multi_zone; + kstat_named_t abdstat_scatter_page_alloc_retry; + kstat_named_t abdstat_scatter_sg_table_retry; +} abd_stats_t; + +static abd_stats_t abd_stats = { + /* Amount of memory occupied by all of the abd_t struct allocations */ + { "struct_size", KSTAT_DATA_UINT64 }, + /* + * The number of linear ABDs which are currently allocated, excluding + * ABDs which don't own their data (for instance the ones which were + * allocated through abd_get_offset() and abd_get_from_buf()). If an + * ABD takes ownership of its buf then it will become tracked. + */ + { "linear_cnt", KSTAT_DATA_UINT64 }, + /* Amount of data stored in all linear ABDs tracked by linear_cnt */ + { "linear_data_size", KSTAT_DATA_UINT64 }, + /* + * The number of scatter ABDs which are currently allocated, excluding + * ABDs which don't own their data (for instance the ones which were + * allocated through abd_get_offset()). + */ + { "scatter_cnt", KSTAT_DATA_UINT64 }, + /* Amount of data stored in all scatter ABDs tracked by scatter_cnt */ + { "scatter_data_size", KSTAT_DATA_UINT64 }, + /* + * The amount of space wasted at the end of the last chunk across all + * scatter ABDs tracked by scatter_cnt. 
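+ *
+ * For example, on a 4KB-page system a 5KB scatter allocation consumes
+ * two chunks (8KB) and contributes the trailing 3KB to this counter.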
+ */ + { "scatter_chunk_waste", KSTAT_DATA_UINT64 }, + /* + * The number of compound allocations of a given order. These + * allocations are spread over all currently allocated ABDs, and + * act as a measure of memory fragmentation. + */ + { { "scatter_order_N", KSTAT_DATA_UINT64 } }, + /* + * The number of scatter ABDs which contain multiple chunks. + * ABDs are preferentially allocated from the minimum number of + * contiguous multi-page chunks, a single chunk is optimal. + */ + { "scatter_page_multi_chunk", KSTAT_DATA_UINT64 }, + /* + * The number of scatter ABDs which are split across memory zones. + * ABDs are preferentially allocated using pages from a single zone. + */ + { "scatter_page_multi_zone", KSTAT_DATA_UINT64 }, + /* + * The total number of retries encountered when attempting to + * allocate the pages to populate the scatter ABD. + */ + { "scatter_page_alloc_retry", KSTAT_DATA_UINT64 }, + /* + * The total number of retries encountered when attempting to + * allocate the sg table for an ABD. + */ + { "scatter_sg_table_retry", KSTAT_DATA_UINT64 }, +}; + +#define ABDSTAT(stat) (abd_stats.stat.value.ui64) +#define ABDSTAT_INCR(stat, val) \ + atomic_add_64(&abd_stats.stat.value.ui64, (val)) +#define ABDSTAT_BUMP(stat) ABDSTAT_INCR(stat, 1) +#define ABDSTAT_BUMPDOWN(stat) ABDSTAT_INCR(stat, -1) + +#define ABD_SCATTER(abd) (abd->abd_u.abd_scatter) +#define ABD_BUF(abd) (abd->abd_u.abd_linear.abd_buf) +#define abd_for_each_sg(abd, sg, n, i) \ + for_each_sg(ABD_SCATTER(abd).abd_sgl, sg, n, i) + +/* see block comment above for description */ +int zfs_abd_scatter_enabled = B_TRUE; +unsigned zfs_abd_scatter_max_order = MAX_ORDER - 1; + +/* + * zfs_abd_scatter_min_size is the minimum allocation size to use scatter + * ABD's. Smaller allocations will use linear ABD's which uses + * zio_[data_]buf_alloc(). + * + * Scatter ABD's use at least one page each, so sub-page allocations waste + * some space when allocated as scatter (e.g. 2KB scatter allocation wastes + * half of each page). Using linear ABD's for small allocations means that + * they will be put on slabs which contain many allocations. This can + * improve memory efficiency, but it also makes it much harder for ARC + * evictions to actually free pages, because all the buffers on one slab need + * to be freed in order for the slab (and underlying pages) to be freed. + * Typically, 512B and 1KB kmem caches have 16 buffers per slab, so it's + * possible for them to actually waste more memory than scatter (one page per + * buf = wasting 3/4 or 7/8th; one buf per slab = wasting 15/16th). + * + * Spill blocks are typically 512B and are heavily used on systems running + * selinux with the default dnode size and the `xattr=sa` property set. + * + * By default we use linear allocations for 512B and 1KB, and scatter + * allocations for larger (1.5KB and up). 
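+ *
+ * The default below is therefore 512 * 3 = 1536 bytes: 512B and 1KB
+ * requests are satisfied with linear ABD's, while 1.5KB and larger
+ * requests use scatter ABD's.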
+ */ +int zfs_abd_scatter_min_size = 512 * 3; + +static kmem_cache_t *abd_cache = NULL; +static kstat_t *abd_ksp; + +static inline size_t +abd_chunkcnt_for_bytes(size_t size) +{ + return (P2ROUNDUP(size, PAGESIZE) / PAGESIZE); +} + +#ifdef _KERNEL +/* + * Mark zfs data pages so they can be excluded from kernel crash dumps + */ +#ifdef _LP64 +#define ABD_FILE_CACHE_PAGE 0x2F5ABDF11ECAC4E + +static inline void +abd_mark_zfs_page(struct page *page) +{ + get_page(page); + SetPagePrivate(page); + set_page_private(page, ABD_FILE_CACHE_PAGE); +} + +static inline void +abd_unmark_zfs_page(struct page *page) +{ + set_page_private(page, 0UL); + ClearPagePrivate(page); + put_page(page); +} +#else +#define abd_mark_zfs_page(page) +#define abd_unmark_zfs_page(page) +#endif /* _LP64 */ + +#ifndef CONFIG_HIGHMEM + +#ifndef __GFP_RECLAIM +#define __GFP_RECLAIM __GFP_WAIT +#endif + +/* + * The goal is to minimize fragmentation by preferentially populating ABDs + * with higher order compound pages from a single zone. Allocation size is + * progressively decreased until it can be satisfied without performing + * reclaim or compaction. When necessary this function will degenerate to + * allocating individual pages and allowing reclaim to satisfy allocations. + */ +static void +abd_alloc_pages(abd_t *abd, size_t size) +{ + struct list_head pages; + struct sg_table table; + struct scatterlist *sg; + struct page *page, *tmp_page = NULL; + gfp_t gfp = __GFP_NOWARN | GFP_NOIO; + gfp_t gfp_comp = (gfp | __GFP_NORETRY | __GFP_COMP) & ~__GFP_RECLAIM; + int max_order = MIN(zfs_abd_scatter_max_order, MAX_ORDER - 1); + int nr_pages = abd_chunkcnt_for_bytes(size); + int chunks = 0, zones = 0; + size_t remaining_size; + int nid = NUMA_NO_NODE; + int alloc_pages = 0; + + INIT_LIST_HEAD(&pages); + + while (alloc_pages < nr_pages) { + unsigned chunk_pages; + int order; + + order = MIN(highbit64(nr_pages - alloc_pages) - 1, max_order); + chunk_pages = (1U << order); + + page = alloc_pages_node(nid, order ? gfp_comp : gfp, order); + if (page == NULL) { + if (order == 0) { + ABDSTAT_BUMP(abdstat_scatter_page_alloc_retry); + schedule_timeout_interruptible(1); + } else { + max_order = MAX(0, order - 1); + } + continue; + } + + list_add_tail(&page->lru, &pages); + + if ((nid != NUMA_NO_NODE) && (page_to_nid(page) != nid)) + zones++; + + nid = page_to_nid(page); + ABDSTAT_BUMP(abdstat_scatter_orders[order]); + chunks++; + alloc_pages += chunk_pages; + } + + ASSERT3S(alloc_pages, ==, nr_pages); + + while (sg_alloc_table(&table, chunks, gfp)) { + ABDSTAT_BUMP(abdstat_scatter_sg_table_retry); + schedule_timeout_interruptible(1); + } + + sg = table.sgl; + remaining_size = size; + list_for_each_entry_safe(page, tmp_page, &pages, lru) { + size_t sg_size = MIN(PAGESIZE << compound_order(page), + remaining_size); + sg_set_page(sg, page, sg_size, 0); + abd_mark_zfs_page(page); + remaining_size -= sg_size; + + sg = sg_next(sg); + list_del(&page->lru); + } + + /* + * These conditions ensure that a possible transformation to a linear + * ABD would be valid. + */ + ASSERT(!PageHighMem(sg_page(table.sgl))); + ASSERT0(ABD_SCATTER(abd).abd_offset); + + if (table.nents == 1) { + /* + * Since there is only one entry, this ABD can be represented + * as a linear buffer. All single-page (4K) ABD's can be + * represented this way. Some multi-page ABD's can also be + * represented this way, if we were able to allocate a single + * "chunk" (higher-order "page" which represents a power-of-2 + * series of physically-contiguous pages). 
This is often the + * case for 2-page (8K) ABD's. + * + * Representing a single-entry scatter ABD as a linear ABD + * has the performance advantage of avoiding the copy (and + * allocation) in abd_borrow_buf_copy / abd_return_buf_copy. + * A performance increase of around 5% has been observed for + * ARC-cached reads (of small blocks which can take advantage + * of this). + * + * Note that this optimization is only possible because the + * pages are always mapped into the kernel's address space. + * This is not the case for highmem pages, so the + * optimization can not be made there. + */ + abd->abd_flags |= ABD_FLAG_LINEAR; + abd->abd_flags |= ABD_FLAG_LINEAR_PAGE; + abd->abd_u.abd_linear.abd_sgl = table.sgl; + abd->abd_u.abd_linear.abd_buf = + page_address(sg_page(table.sgl)); + } else if (table.nents > 1) { + ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk); + abd->abd_flags |= ABD_FLAG_MULTI_CHUNK; + + if (zones) { + ABDSTAT_BUMP(abdstat_scatter_page_multi_zone); + abd->abd_flags |= ABD_FLAG_MULTI_ZONE; + } + + ABD_SCATTER(abd).abd_sgl = table.sgl; + ABD_SCATTER(abd).abd_nents = table.nents; + } +} +#else +/* + * Allocate N individual pages to construct a scatter ABD. This function + * makes no attempt to request contiguous pages and requires the minimal + * number of kernel interfaces. It's designed for maximum compatibility. + */ +static void +abd_alloc_pages(abd_t *abd, size_t size) +{ + struct scatterlist *sg = NULL; + struct sg_table table; + struct page *page; + gfp_t gfp = __GFP_NOWARN | GFP_NOIO; + int nr_pages = abd_chunkcnt_for_bytes(size); + int i = 0; + + while (sg_alloc_table(&table, nr_pages, gfp)) { + ABDSTAT_BUMP(abdstat_scatter_sg_table_retry); + schedule_timeout_interruptible(1); + } + + ASSERT3U(table.nents, ==, nr_pages); + ABD_SCATTER(abd).abd_sgl = table.sgl; + ABD_SCATTER(abd).abd_nents = nr_pages; + + abd_for_each_sg(abd, sg, nr_pages, i) { + while ((page = __page_cache_alloc(gfp)) == NULL) { + ABDSTAT_BUMP(abdstat_scatter_page_alloc_retry); + schedule_timeout_interruptible(1); + } + + ABDSTAT_BUMP(abdstat_scatter_orders[0]); + sg_set_page(sg, page, PAGESIZE, 0); + abd_mark_zfs_page(page); + } + + if (nr_pages > 1) { + ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk); + abd->abd_flags |= ABD_FLAG_MULTI_CHUNK; + } +} +#endif /* !CONFIG_HIGHMEM */ + +static void +abd_free_pages(abd_t *abd) +{ + struct scatterlist *sg = NULL; + struct sg_table table; + struct page *page; + int nr_pages = ABD_SCATTER(abd).abd_nents; + int order, i = 0; + + if (abd->abd_flags & ABD_FLAG_MULTI_ZONE) + ABDSTAT_BUMPDOWN(abdstat_scatter_page_multi_zone); + + if (abd->abd_flags & ABD_FLAG_MULTI_CHUNK) + ABDSTAT_BUMPDOWN(abdstat_scatter_page_multi_chunk); + + abd_for_each_sg(abd, sg, nr_pages, i) { + page = sg_page(sg); + abd_unmark_zfs_page(page); + order = compound_order(page); + __free_pages(page, order); + ASSERT3U(sg->length, <=, PAGE_SIZE << order); + ABDSTAT_BUMPDOWN(abdstat_scatter_orders[order]); + } + + table.sgl = ABD_SCATTER(abd).abd_sgl; + table.nents = table.orig_nents = nr_pages; + sg_free_table(&table); +} + +#else /* _KERNEL */ + +#ifndef PAGE_SHIFT +#define PAGE_SHIFT (highbit64(PAGESIZE)-1) +#endif + +struct page; + +#define zfs_kmap_atomic(chunk, km) ((void *)chunk) +#define zfs_kunmap_atomic(addr, km) do { (void)(addr); } while (0) +#define local_irq_save(flags) do { (void)(flags); } while (0) +#define local_irq_restore(flags) do { (void)(flags); } while (0) +#define nth_page(pg, i) \ + ((struct page *)((void *)(pg) + (i) * PAGESIZE)) + +struct scatterlist { + struct page 
*page; + int length; + int end; +}; + +static void +sg_init_table(struct scatterlist *sg, int nr) +{ + memset(sg, 0, nr * sizeof (struct scatterlist)); + sg[nr - 1].end = 1; +} + +#define for_each_sg(sgl, sg, nr, i) \ + for ((i) = 0, (sg) = (sgl); (i) < (nr); (i)++, (sg) = sg_next(sg)) + +static inline void +sg_set_page(struct scatterlist *sg, struct page *page, unsigned int len, + unsigned int offset) +{ + /* currently we don't use offset */ + ASSERT(offset == 0); + sg->page = page; + sg->length = len; +} + +static inline struct page * +sg_page(struct scatterlist *sg) +{ + return (sg->page); +} + +static inline struct scatterlist * +sg_next(struct scatterlist *sg) +{ + if (sg->end) + return (NULL); + + return (sg + 1); +} + +static void +abd_alloc_pages(abd_t *abd, size_t size) +{ + unsigned nr_pages = abd_chunkcnt_for_bytes(size); + struct scatterlist *sg; + int i; + + ABD_SCATTER(abd).abd_sgl = vmem_alloc(nr_pages * + sizeof (struct scatterlist), KM_SLEEP); + sg_init_table(ABD_SCATTER(abd).abd_sgl, nr_pages); + + abd_for_each_sg(abd, sg, nr_pages, i) { + struct page *p = umem_alloc_aligned(PAGESIZE, 64, KM_SLEEP); + sg_set_page(sg, p, PAGESIZE, 0); + } + ABD_SCATTER(abd).abd_nents = nr_pages; +} + +static void +abd_free_pages(abd_t *abd) +{ + int i, n = ABD_SCATTER(abd).abd_nents; + struct scatterlist *sg; + + abd_for_each_sg(abd, sg, n, i) { + for (int j = 0; j < sg->length; j += PAGESIZE) { + struct page *p = nth_page(sg_page(sg), j >> PAGE_SHIFT); + umem_free(p, PAGESIZE); + } + } + + vmem_free(ABD_SCATTER(abd).abd_sgl, n * sizeof (struct scatterlist)); +} + +#endif /* _KERNEL */ + +void +abd_init(void) +{ + int i; + + abd_cache = kmem_cache_create("abd_t", sizeof (abd_t), + 0, NULL, NULL, NULL, NULL, NULL, 0); + + abd_ksp = kstat_create("zfs", 0, "abdstats", "misc", KSTAT_TYPE_NAMED, + sizeof (abd_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); + if (abd_ksp != NULL) { + abd_ksp->ks_data = &abd_stats; + kstat_install(abd_ksp); + + for (i = 0; i < MAX_ORDER; i++) { + snprintf(abd_stats.abdstat_scatter_orders[i].name, + KSTAT_STRLEN, "scatter_order_%d", i); + abd_stats.abdstat_scatter_orders[i].data_type = + KSTAT_DATA_UINT64; + } + } +} + +void +abd_fini(void) +{ + if (abd_ksp != NULL) { + kstat_delete(abd_ksp); + abd_ksp = NULL; + } + + if (abd_cache) { + kmem_cache_destroy(abd_cache); + abd_cache = NULL; + } +} + +static inline void +abd_verify(abd_t *abd) +{ + ASSERT3U(abd->abd_size, >, 0); + ASSERT3U(abd->abd_size, <=, SPA_MAXBLOCKSIZE); + ASSERT3U(abd->abd_flags, ==, abd->abd_flags & (ABD_FLAG_LINEAR | + ABD_FLAG_OWNER | ABD_FLAG_META | ABD_FLAG_MULTI_ZONE | + ABD_FLAG_MULTI_CHUNK | ABD_FLAG_LINEAR_PAGE)); + IMPLY(abd->abd_parent != NULL, !(abd->abd_flags & ABD_FLAG_OWNER)); + IMPLY(abd->abd_flags & ABD_FLAG_META, abd->abd_flags & ABD_FLAG_OWNER); + if (abd_is_linear(abd)) { + ASSERT3P(abd->abd_u.abd_linear.abd_buf, !=, NULL); + } else { + size_t n; + int i = 0; + struct scatterlist *sg = NULL; + + ASSERT3U(ABD_SCATTER(abd).abd_nents, >, 0); + ASSERT3U(ABD_SCATTER(abd).abd_offset, <, + ABD_SCATTER(abd).abd_sgl->length); + n = ABD_SCATTER(abd).abd_nents; + abd_for_each_sg(abd, sg, n, i) { + ASSERT3P(sg_page(sg), !=, NULL); + } + } +} + +static inline abd_t * +abd_alloc_struct(void) +{ + abd_t *abd = kmem_cache_alloc(abd_cache, KM_PUSHPAGE); + + ASSERT3P(abd, !=, NULL); + ABDSTAT_INCR(abdstat_struct_size, sizeof (abd_t)); + + return (abd); +} + +static inline void +abd_free_struct(abd_t *abd) +{ + kmem_cache_free(abd_cache, abd); + ABDSTAT_INCR(abdstat_struct_size, 
-(int)sizeof (abd_t)); +} + +/* + * Allocate an ABD, along with its own underlying data buffers. Use this if you + * don't care whether the ABD is linear or not. + */ +abd_t * +abd_alloc(size_t size, boolean_t is_metadata) +{ + /* see the comment above zfs_abd_scatter_min_size */ + if (!zfs_abd_scatter_enabled || size < zfs_abd_scatter_min_size) + return (abd_alloc_linear(size, is_metadata)); + + VERIFY3U(size, <=, SPA_MAXBLOCKSIZE); + + abd_t *abd = abd_alloc_struct(); + abd->abd_flags = ABD_FLAG_OWNER; + abd->abd_u.abd_scatter.abd_offset = 0; + abd_alloc_pages(abd, size); + + if (is_metadata) { + abd->abd_flags |= ABD_FLAG_META; + } + abd->abd_size = size; + abd->abd_parent = NULL; + zfs_refcount_create(&abd->abd_children); + + ABDSTAT_BUMP(abdstat_scatter_cnt); + ABDSTAT_INCR(abdstat_scatter_data_size, size); + ABDSTAT_INCR(abdstat_scatter_chunk_waste, + P2ROUNDUP(size, PAGESIZE) - size); + + return (abd); +} + +static void +abd_free_scatter(abd_t *abd) +{ + abd_free_pages(abd); + + zfs_refcount_destroy(&abd->abd_children); + ABDSTAT_BUMPDOWN(abdstat_scatter_cnt); + ABDSTAT_INCR(abdstat_scatter_data_size, -(int)abd->abd_size); + ABDSTAT_INCR(abdstat_scatter_chunk_waste, + (int)abd->abd_size - (int)P2ROUNDUP(abd->abd_size, PAGESIZE)); + + abd_free_struct(abd); +} + +/* + * Allocate an ABD that must be linear, along with its own underlying data + * buffer. Only use this when it would be very annoying to write your ABD + * consumer with a scattered ABD. + */ +abd_t * +abd_alloc_linear(size_t size, boolean_t is_metadata) +{ + abd_t *abd = abd_alloc_struct(); + + VERIFY3U(size, <=, SPA_MAXBLOCKSIZE); + + abd->abd_flags = ABD_FLAG_LINEAR | ABD_FLAG_OWNER; + if (is_metadata) { + abd->abd_flags |= ABD_FLAG_META; + } + abd->abd_size = size; + abd->abd_parent = NULL; + zfs_refcount_create(&abd->abd_children); + + if (is_metadata) { + abd->abd_u.abd_linear.abd_buf = zio_buf_alloc(size); + } else { + abd->abd_u.abd_linear.abd_buf = zio_data_buf_alloc(size); + } + + ABDSTAT_BUMP(abdstat_linear_cnt); + ABDSTAT_INCR(abdstat_linear_data_size, size); + + return (abd); +} + +static void +abd_free_linear(abd_t *abd) +{ + if (abd_is_linear_page(abd)) { + /* Transform it back into a scatter ABD for freeing */ + struct scatterlist *sg = abd->abd_u.abd_linear.abd_sgl; + abd->abd_flags &= ~ABD_FLAG_LINEAR; + abd->abd_flags &= ~ABD_FLAG_LINEAR_PAGE; + ABD_SCATTER(abd).abd_nents = 1; + ABD_SCATTER(abd).abd_offset = 0; + ABD_SCATTER(abd).abd_sgl = sg; + abd_free_scatter(abd); + return; + } + if (abd->abd_flags & ABD_FLAG_META) { + zio_buf_free(abd->abd_u.abd_linear.abd_buf, abd->abd_size); + } else { + zio_data_buf_free(abd->abd_u.abd_linear.abd_buf, abd->abd_size); + } + + zfs_refcount_destroy(&abd->abd_children); + ABDSTAT_BUMPDOWN(abdstat_linear_cnt); + ABDSTAT_INCR(abdstat_linear_data_size, -(int)abd->abd_size); + + abd_free_struct(abd); +} + +/* + * Free an ABD. Only use this on ABDs allocated with abd_alloc() or + * abd_alloc_linear(). + */ +void +abd_free(abd_t *abd) +{ + abd_verify(abd); + ASSERT3P(abd->abd_parent, ==, NULL); + ASSERT(abd->abd_flags & ABD_FLAG_OWNER); + if (abd_is_linear(abd)) + abd_free_linear(abd); + else + abd_free_scatter(abd); +} + +/* + * Allocate an ABD of the same format (same metadata flag, same scatterize + * setting) as another ABD. 
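Taken together, abd_alloc(), abd_alloc_linear(), and abd_free() above give the basic owned-ABD lifecycle. A minimal consumer sketch (hypothetical caller, assuming the abd_copy_from_buf() and abd_zero() convenience wrappers declared in abd.h):

    static void
    example_abd_lifecycle(const void *src, size_t size)
    {
            /* Scatter vs. linear is chosen automatically from the size. */
            abd_t *abd = abd_alloc(size, B_FALSE);

            /* Fill the ABD from a raw buffer, then zero it again. */
            abd_copy_from_buf(abd, src, size);
            abd_zero(abd, size);

            /* ABDs that own their buffers are released with abd_free(). */
            abd_free(abd);
    }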
+ */ +abd_t * +abd_alloc_sametype(abd_t *sabd, size_t size) +{ + boolean_t is_metadata = (sabd->abd_flags & ABD_FLAG_META) != 0; + if (abd_is_linear(sabd) && + !abd_is_linear_page(sabd)) { + return (abd_alloc_linear(size, is_metadata)); + } else { + return (abd_alloc(size, is_metadata)); + } +} + +/* + * If we're going to use this ABD for doing I/O using the block layer, the + * consumer of the ABD data doesn't care if it's scattered or not, and we don't + * plan to store this ABD in memory for a long period of time, we should + * allocate the ABD type that requires the least data copying to do the I/O. + * + * On Illumos this is linear ABDs, however if ldi_strategy() can ever issue I/Os + * using a scatter/gather list we should switch to that and replace this call + * with vanilla abd_alloc(). + * + * On Linux the optimal thing to do would be to use abd_get_offset() and + * construct a new ABD which shares the original pages thereby eliminating + * the copy. But for the moment a new linear ABD is allocated until this + * performance optimization can be implemented. + */ +abd_t * +abd_alloc_for_io(size_t size, boolean_t is_metadata) +{ + return (abd_alloc(size, is_metadata)); +} + +/* + * Allocate a new ABD to point to offset off of sabd. It shares the underlying + * buffer data with sabd. Use abd_put() to free. sabd must not be freed while + * any derived ABDs exist. + */ +static inline abd_t * +abd_get_offset_impl(abd_t *sabd, size_t off, size_t size) +{ + abd_t *abd; + + abd_verify(sabd); + ASSERT3U(off, <=, sabd->abd_size); + + if (abd_is_linear(sabd)) { + abd = abd_alloc_struct(); + + /* + * Even if this buf is filesystem metadata, we only track that + * if we own the underlying data buffer, which is not true in + * this case. Therefore, we don't ever use ABD_FLAG_META here. + */ + abd->abd_flags = ABD_FLAG_LINEAR; + + abd->abd_u.abd_linear.abd_buf = + (char *)sabd->abd_u.abd_linear.abd_buf + off; + } else { + int i = 0; + struct scatterlist *sg = NULL; + size_t new_offset = sabd->abd_u.abd_scatter.abd_offset + off; + + abd = abd_alloc_struct(); + + /* + * Even if this buf is filesystem metadata, we only track that + * if we own the underlying data buffer, which is not true in + * this case. Therefore, we don't ever use ABD_FLAG_META here. + */ + abd->abd_flags = 0; + + abd_for_each_sg(sabd, sg, ABD_SCATTER(sabd).abd_nents, i) { + if (new_offset < sg->length) + break; + new_offset -= sg->length; + } + + ABD_SCATTER(abd).abd_sgl = sg; + ABD_SCATTER(abd).abd_offset = new_offset; + ABD_SCATTER(abd).abd_nents = ABD_SCATTER(sabd).abd_nents - i; + } + + abd->abd_size = size; + abd->abd_parent = sabd; + zfs_refcount_create(&abd->abd_children); + (void) zfs_refcount_add_many(&sabd->abd_children, abd->abd_size, abd); + + return (abd); +} + +abd_t * +abd_get_offset(abd_t *sabd, size_t off) +{ + size_t size = sabd->abd_size > off ? sabd->abd_size - off : 0; + + VERIFY3U(size, >, 0); + + return (abd_get_offset_impl(sabd, off, size)); +} + +abd_t * +abd_get_offset_size(abd_t *sabd, size_t off, size_t size) +{ + ASSERT3U(off + size, <=, sabd->abd_size); + + return (abd_get_offset_impl(sabd, off, size)); +} + +/* + * Allocate a linear ABD structure for buf. You must free this with abd_put() + * since the resulting ABD doesn't own its own buffer. 
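For reference-style ABDs, a hypothetical caller might combine abd_get_offset_size() above with abd_get_from_buf() as follows (sketch only; both views must be released with abd_put() before their backing storage goes away):

    static void
    example_abd_views(abd_t *parent, void *raw, size_t rawsize)
    {
            /* A view of the second half of parent; shares its pages. */
            abd_t *tail = abd_get_offset_size(parent, parent->abd_size / 2,
                parent->abd_size - parent->abd_size / 2);

            /* Wrap a caller-owned raw buffer without copying it. */
            abd_t *wrapped = abd_get_from_buf(raw, rawsize);

            /* Neither view owns its data, so both use abd_put(). */
            abd_put(wrapped);
            abd_put(tail);
    }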
+ */ +abd_t * +abd_get_from_buf(void *buf, size_t size) +{ + abd_t *abd = abd_alloc_struct(); + + VERIFY3U(size, <=, SPA_MAXBLOCKSIZE); + + /* + * Even if this buf is filesystem metadata, we only track that if we + * own the underlying data buffer, which is not true in this case. + * Therefore, we don't ever use ABD_FLAG_META here. + */ + abd->abd_flags = ABD_FLAG_LINEAR; + abd->abd_size = size; + abd->abd_parent = NULL; + zfs_refcount_create(&abd->abd_children); + + abd->abd_u.abd_linear.abd_buf = buf; + + return (abd); +} + +/* + * Free an ABD allocated from abd_get_offset() or abd_get_from_buf(). Will not + * free the underlying scatterlist or buffer. + */ +void +abd_put(abd_t *abd) +{ + abd_verify(abd); + ASSERT(!(abd->abd_flags & ABD_FLAG_OWNER)); + + if (abd->abd_parent != NULL) { + (void) zfs_refcount_remove_many(&abd->abd_parent->abd_children, + abd->abd_size, abd); + } + + zfs_refcount_destroy(&abd->abd_children); + abd_free_struct(abd); +} + +/* + * Get the raw buffer associated with a linear ABD. + */ +void * +abd_to_buf(abd_t *abd) +{ + ASSERT(abd_is_linear(abd)); + abd_verify(abd); + return (abd->abd_u.abd_linear.abd_buf); +} + +/* + * Borrow a raw buffer from an ABD without copying the contents of the ABD + * into the buffer. If the ABD is scattered, this will allocate a raw buffer + * whose contents are undefined. To copy over the existing data in the ABD, use + * abd_borrow_buf_copy() instead. + */ +void * +abd_borrow_buf(abd_t *abd, size_t n) +{ + void *buf; + abd_verify(abd); + ASSERT3U(abd->abd_size, >=, n); + if (abd_is_linear(abd)) { + buf = abd_to_buf(abd); + } else { + buf = zio_buf_alloc(n); + } + (void) zfs_refcount_add_many(&abd->abd_children, n, buf); + + return (buf); +} + +void * +abd_borrow_buf_copy(abd_t *abd, size_t n) +{ + void *buf = abd_borrow_buf(abd, n); + if (!abd_is_linear(abd)) { + abd_copy_to_buf(buf, abd, n); + } + return (buf); +} + +/* + * Return a borrowed raw buffer to an ABD. If the ABD is scattered, this will + * not change the contents of the ABD and will ASSERT that you didn't modify + * the buffer since it was borrowed. If you want any changes you made to buf to + * be copied back to abd, use abd_return_buf_copy() instead. + */ +void +abd_return_buf(abd_t *abd, void *buf, size_t n) +{ + abd_verify(abd); + ASSERT3U(abd->abd_size, >=, n); + if (abd_is_linear(abd)) { + ASSERT3P(buf, ==, abd_to_buf(abd)); + } else { + ASSERT0(abd_cmp_buf(abd, buf, n)); + zio_buf_free(buf, n); + } + (void) zfs_refcount_remove_many(&abd->abd_children, n, buf); +} + +void +abd_return_buf_copy(abd_t *abd, void *buf, size_t n) +{ + if (!abd_is_linear(abd)) { + abd_copy_from_buf(abd, buf, n); + } + abd_return_buf(abd, buf, n); +} + +/* + * Give this ABD ownership of the buffer that it's storing. Can only be used on + * linear ABDs which were allocated via abd_get_from_buf(), or ones allocated + * with abd_alloc_linear() which subsequently released ownership of their buf + * with abd_release_ownership_of_buf(). + */ +void +abd_take_ownership_of_buf(abd_t *abd, boolean_t is_metadata) +{ + ASSERT(abd_is_linear(abd)); + ASSERT(!(abd->abd_flags & ABD_FLAG_OWNER)); + abd_verify(abd); + + abd->abd_flags |= ABD_FLAG_OWNER; + if (is_metadata) { + abd->abd_flags |= ABD_FLAG_META; + } + + ABDSTAT_BUMP(abdstat_linear_cnt); + ABDSTAT_INCR(abdstat_linear_data_size, abd->abd_size); +} + +void +abd_release_ownership_of_buf(abd_t *abd) +{ + ASSERT(abd_is_linear(abd)); + ASSERT(abd->abd_flags & ABD_FLAG_OWNER); + + /* + * abd_free() needs to handle LINEAR_PAGE ABD's specially. 
+ * Since that flag does not survive the + * abd_release_ownership_of_buf() -> abd_get_from_buf() -> + * abd_take_ownership_of_buf() sequence, we don't allow releasing + * these "linear but not zio_[data_]buf_alloc()'ed" ABD's. + */ + ASSERT(!abd_is_linear_page(abd)); + + abd_verify(abd); + + abd->abd_flags &= ~ABD_FLAG_OWNER; + /* Disable this flag since we no longer own the data buffer */ + abd->abd_flags &= ~ABD_FLAG_META; + + ABDSTAT_BUMPDOWN(abdstat_linear_cnt); + ABDSTAT_INCR(abdstat_linear_data_size, -(int)abd->abd_size); +} + +#ifndef HAVE_1ARG_KMAP_ATOMIC +#define NR_KM_TYPE (6) +#ifdef _KERNEL +int km_table[NR_KM_TYPE] = { + KM_USER0, + KM_USER1, + KM_BIO_SRC_IRQ, + KM_BIO_DST_IRQ, + KM_PTE0, + KM_PTE1, +}; +#endif +#endif + +struct abd_iter { + /* public interface */ + void *iter_mapaddr; /* addr corresponding to iter_pos */ + size_t iter_mapsize; /* length of data valid at mapaddr */ + + /* private */ + abd_t *iter_abd; /* ABD being iterated through */ + size_t iter_pos; + size_t iter_offset; /* offset in current sg/abd_buf, */ + /* abd_offset included */ + struct scatterlist *iter_sg; /* current sg */ +#ifndef HAVE_1ARG_KMAP_ATOMIC + int iter_km; /* KM_* for kmap_atomic */ +#endif +}; + +/* + * Initialize the abd_iter. + */ +static void +abd_iter_init(struct abd_iter *aiter, abd_t *abd, int km_type) +{ + abd_verify(abd); + aiter->iter_abd = abd; + aiter->iter_mapaddr = NULL; + aiter->iter_mapsize = 0; + aiter->iter_pos = 0; + if (abd_is_linear(abd)) { + aiter->iter_offset = 0; + aiter->iter_sg = NULL; + } else { + aiter->iter_offset = ABD_SCATTER(abd).abd_offset; + aiter->iter_sg = ABD_SCATTER(abd).abd_sgl; + } +#ifndef HAVE_1ARG_KMAP_ATOMIC + ASSERT3U(km_type, <, NR_KM_TYPE); + aiter->iter_km = km_type; +#endif +} + +/* + * Advance the iterator by a certain amount. Cannot be called when a chunk is + * in use. This can be safely called when the aiter has already exhausted, in + * which case this does nothing. + */ +static void +abd_iter_advance(struct abd_iter *aiter, size_t amount) +{ + ASSERT3P(aiter->iter_mapaddr, ==, NULL); + ASSERT0(aiter->iter_mapsize); + + /* There's nothing left to advance to, so do nothing */ + if (aiter->iter_pos == aiter->iter_abd->abd_size) + return; + + aiter->iter_pos += amount; + aiter->iter_offset += amount; + if (!abd_is_linear(aiter->iter_abd)) { + while (aiter->iter_offset >= aiter->iter_sg->length) { + aiter->iter_offset -= aiter->iter_sg->length; + aiter->iter_sg = sg_next(aiter->iter_sg); + if (aiter->iter_sg == NULL) { + ASSERT0(aiter->iter_offset); + break; + } + } + } +} + +/* + * Map the current chunk into aiter. This can be safely called when the aiter + * has already exhausted, in which case this does nothing. 
+ */ +static void +abd_iter_map(struct abd_iter *aiter) +{ + void *paddr; + size_t offset = 0; + + ASSERT3P(aiter->iter_mapaddr, ==, NULL); + ASSERT0(aiter->iter_mapsize); + + /* There's nothing left to iterate over, so do nothing */ + if (aiter->iter_pos == aiter->iter_abd->abd_size) + return; + + if (abd_is_linear(aiter->iter_abd)) { + ASSERT3U(aiter->iter_pos, ==, aiter->iter_offset); + offset = aiter->iter_offset; + aiter->iter_mapsize = aiter->iter_abd->abd_size - offset; + paddr = aiter->iter_abd->abd_u.abd_linear.abd_buf; + } else { + offset = aiter->iter_offset; + aiter->iter_mapsize = MIN(aiter->iter_sg->length - offset, + aiter->iter_abd->abd_size - aiter->iter_pos); + + paddr = zfs_kmap_atomic(sg_page(aiter->iter_sg), + km_table[aiter->iter_km]); + } + + aiter->iter_mapaddr = (char *)paddr + offset; +} + +/* + * Unmap the current chunk from aiter. This can be safely called when the aiter + * has already exhausted, in which case this does nothing. + */ +static void +abd_iter_unmap(struct abd_iter *aiter) +{ + /* There's nothing left to unmap, so do nothing */ + if (aiter->iter_pos == aiter->iter_abd->abd_size) + return; + + if (!abd_is_linear(aiter->iter_abd)) { + /* LINTED E_FUNC_SET_NOT_USED */ + zfs_kunmap_atomic(aiter->iter_mapaddr - aiter->iter_offset, + km_table[aiter->iter_km]); + } + + ASSERT3P(aiter->iter_mapaddr, !=, NULL); + ASSERT3U(aiter->iter_mapsize, >, 0); + + aiter->iter_mapaddr = NULL; + aiter->iter_mapsize = 0; +} + +int +abd_iterate_func(abd_t *abd, size_t off, size_t size, + abd_iter_func_t *func, void *private) +{ + int ret = 0; + struct abd_iter aiter; + + abd_verify(abd); + ASSERT3U(off + size, <=, abd->abd_size); + + abd_iter_init(&aiter, abd, 0); + abd_iter_advance(&aiter, off); + + while (size > 0) { + abd_iter_map(&aiter); + + size_t len = MIN(aiter.iter_mapsize, size); + ASSERT3U(len, >, 0); + + ret = func(aiter.iter_mapaddr, len, private); + + abd_iter_unmap(&aiter); + + if (ret != 0) + break; + + size -= len; + abd_iter_advance(&aiter, len); + } + + return (ret); +} + +struct buf_arg { + void *arg_buf; +}; + +static int +abd_copy_to_buf_off_cb(void *buf, size_t size, void *private) +{ + struct buf_arg *ba_ptr = private; + + (void) memcpy(ba_ptr->arg_buf, buf, size); + ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size; + + return (0); +} + +/* + * Copy abd to buf. (off is the offset in abd.) + */ +void +abd_copy_to_buf_off(void *buf, abd_t *abd, size_t off, size_t size) +{ + struct buf_arg ba_ptr = { buf }; + + (void) abd_iterate_func(abd, off, size, abd_copy_to_buf_off_cb, + &ba_ptr); +} + +static int +abd_cmp_buf_off_cb(void *buf, size_t size, void *private) +{ + int ret; + struct buf_arg *ba_ptr = private; + + ret = memcmp(buf, ba_ptr->arg_buf, size); + ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size; + + return (ret); +} + +/* + * Compare the contents of abd to buf. (off is the offset in abd.) + */ +int +abd_cmp_buf_off(abd_t *abd, const void *buf, size_t off, size_t size) +{ + struct buf_arg ba_ptr = { (void *) buf }; + + return (abd_iterate_func(abd, off, size, abd_cmp_buf_off_cb, &ba_ptr)); +} + +static int +abd_copy_from_buf_off_cb(void *buf, size_t size, void *private) +{ + struct buf_arg *ba_ptr = private; + + (void) memcpy(buf, ba_ptr->arg_buf, size); + ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size; + + return (0); +} + +/* + * Copy from buf to abd. (off is the offset in abd.) 
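The copy, compare, and zero helpers that follow are all thin wrappers around abd_iterate_func(). Outside consumers can use the same pattern with their own callback; a hypothetical sketch that counts non-zero bytes:

    static int
    example_count_nonzero_cb(void *buf, size_t size, void *private)
    {
            uint64_t *count = private;
            const uint8_t *p = buf;

            for (size_t i = 0; i < size; i++) {
                    if (p[i] != 0)
                            (*count)++;
            }
            /* A non-zero return value would stop the iteration early. */
            return (0);
    }

    static uint64_t
    example_count_nonzero(abd_t *abd)
    {
            uint64_t count = 0;

            (void) abd_iterate_func(abd, 0, abd->abd_size,
                example_count_nonzero_cb, &count);
            return (count);
    }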
+ */ +void +abd_copy_from_buf_off(abd_t *abd, const void *buf, size_t off, size_t size) +{ + struct buf_arg ba_ptr = { (void *) buf }; + + (void) abd_iterate_func(abd, off, size, abd_copy_from_buf_off_cb, + &ba_ptr); +} + +/*ARGSUSED*/ +static int +abd_zero_off_cb(void *buf, size_t size, void *private) +{ + (void) memset(buf, 0, size); + return (0); +} + +/* + * Zero out the abd from a particular offset to the end. + */ +void +abd_zero_off(abd_t *abd, size_t off, size_t size) +{ + (void) abd_iterate_func(abd, off, size, abd_zero_off_cb, NULL); +} + +/* + * Iterate over two ABDs and call func incrementally on the two ABDs' data in + * equal-sized chunks (passed to func as raw buffers). func could be called many + * times during this iteration. + */ +int +abd_iterate_func2(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff, + size_t size, abd_iter_func2_t *func, void *private) +{ + int ret = 0; + struct abd_iter daiter, saiter; + + abd_verify(dabd); + abd_verify(sabd); + + ASSERT3U(doff + size, <=, dabd->abd_size); + ASSERT3U(soff + size, <=, sabd->abd_size); + + abd_iter_init(&daiter, dabd, 0); + abd_iter_init(&saiter, sabd, 1); + abd_iter_advance(&daiter, doff); + abd_iter_advance(&saiter, soff); + + while (size > 0) { + abd_iter_map(&daiter); + abd_iter_map(&saiter); + + size_t dlen = MIN(daiter.iter_mapsize, size); + size_t slen = MIN(saiter.iter_mapsize, size); + size_t len = MIN(dlen, slen); + ASSERT(dlen > 0 || slen > 0); + + ret = func(daiter.iter_mapaddr, saiter.iter_mapaddr, len, + private); + + abd_iter_unmap(&saiter); + abd_iter_unmap(&daiter); + + if (ret != 0) + break; + + size -= len; + abd_iter_advance(&daiter, len); + abd_iter_advance(&saiter, len); + } + + return (ret); +} + +/*ARGSUSED*/ +static int +abd_copy_off_cb(void *dbuf, void *sbuf, size_t size, void *private) +{ + (void) memcpy(dbuf, sbuf, size); + return (0); +} + +/* + * Copy from sabd to dabd starting from soff and doff. + */ +void +abd_copy_off(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff, size_t size) +{ + (void) abd_iterate_func2(dabd, sabd, doff, soff, size, + abd_copy_off_cb, NULL); +} + +/*ARGSUSED*/ +static int +abd_cmp_cb(void *bufa, void *bufb, size_t size, void *private) +{ + return (memcmp(bufa, bufb, size)); +} + +/* + * Compares the contents of two ABDs. + */ +int +abd_cmp(abd_t *dabd, abd_t *sabd) +{ + ASSERT3U(dabd->abd_size, ==, sabd->abd_size); + return (abd_iterate_func2(dabd, sabd, 0, 0, dabd->abd_size, + abd_cmp_cb, NULL)); +} + +/* + * Iterate over code ABDs and a data ABD and call @func_raidz_gen. + * + * @cabds parity ABDs, must have equal size + * @dabd data ABD. 
Can be NULL (in this case @dsize = 0) + * @func_raidz_gen should be implemented so that its behaviour + * is the same when taking linear and when taking scatter + */ +void +abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd, + ssize_t csize, ssize_t dsize, const unsigned parity, + void (*func_raidz_gen)(void **, const void *, size_t, size_t)) +{ + int i; + ssize_t len, dlen; + struct abd_iter caiters[3]; + struct abd_iter daiter = {0}; + void *caddrs[3]; + unsigned long flags; + + ASSERT3U(parity, <=, 3); + + for (i = 0; i < parity; i++) + abd_iter_init(&caiters[i], cabds[i], i); + + if (dabd) + abd_iter_init(&daiter, dabd, i); + + ASSERT3S(dsize, >=, 0); + + local_irq_save(flags); + while (csize > 0) { + len = csize; + + if (dabd && dsize > 0) + abd_iter_map(&daiter); + + for (i = 0; i < parity; i++) { + abd_iter_map(&caiters[i]); + caddrs[i] = caiters[i].iter_mapaddr; + } + + switch (parity) { + case 3: + len = MIN(caiters[2].iter_mapsize, len); + /* falls through */ + case 2: + len = MIN(caiters[1].iter_mapsize, len); + /* falls through */ + case 1: + len = MIN(caiters[0].iter_mapsize, len); + } + + /* must be progressive */ + ASSERT3S(len, >, 0); + + if (dabd && dsize > 0) { + /* this needs precise iter.length */ + len = MIN(daiter.iter_mapsize, len); + dlen = len; + } else + dlen = 0; + + /* must be progressive */ + ASSERT3S(len, >, 0); + /* + * The iterated function likely will not do well if each + * segment except the last one is not multiple of 512 (raidz). + */ + ASSERT3U(((uint64_t)len & 511ULL), ==, 0); + + func_raidz_gen(caddrs, daiter.iter_mapaddr, len, dlen); + + for (i = parity-1; i >= 0; i--) { + abd_iter_unmap(&caiters[i]); + abd_iter_advance(&caiters[i], len); + } + + if (dabd && dsize > 0) { + abd_iter_unmap(&daiter); + abd_iter_advance(&daiter, dlen); + dsize -= dlen; + } + + csize -= len; + + ASSERT3S(dsize, >=, 0); + ASSERT3S(csize, >=, 0); + } + local_irq_restore(flags); +} + +/* + * Iterate over code ABDs and data reconstruction target ABDs and call + * @func_raidz_rec. Function maps at most 6 pages atomically. + * + * @cabds parity ABDs, must have equal size + * @tabds rec target ABDs, at most 3 + * @tsize size of data target columns + * @func_raidz_rec expects syndrome data in target columns. Function + * reconstructs data and overwrites target columns. 
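Both iterators only map buffers and enforce segment alignment; the parity arithmetic itself lives in the callback. To make the calling convention concrete, a minimal hypothetical func_raidz_gen-style callback for single (XOR) parity could look as follows (the production implementations live in the vdev_raidz math code):

    /*
     * c[0] is the mapped P parity column and d the mapped data column.
     * csize is the mapped parity length, dsize the data length
     * (dsize <= csize); missing data bytes are treated as zero, which
     * leaves XOR parity unchanged.
     */
    static void
    example_raidz_gen_p(void **c, const void *d, size_t csize, size_t dsize)
    {
            uint8_t *p = c[0];
            const uint8_t *data = d;

            for (size_t i = 0; i < dsize; i++)
                    p[i] ^= data[i];
            (void) csize;
    }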
+ */ +void +abd_raidz_rec_iterate(abd_t **cabds, abd_t **tabds, + ssize_t tsize, const unsigned parity, + void (*func_raidz_rec)(void **t, const size_t tsize, void **c, + const unsigned *mul), + const unsigned *mul) +{ + int i; + ssize_t len; + struct abd_iter citers[3]; + struct abd_iter xiters[3]; + void *caddrs[3], *xaddrs[3]; + unsigned long flags; + + ASSERT3U(parity, <=, 3); + + for (i = 0; i < parity; i++) { + abd_iter_init(&citers[i], cabds[i], 2*i); + abd_iter_init(&xiters[i], tabds[i], 2*i+1); + } + + local_irq_save(flags); + while (tsize > 0) { + + for (i = 0; i < parity; i++) { + abd_iter_map(&citers[i]); + abd_iter_map(&xiters[i]); + caddrs[i] = citers[i].iter_mapaddr; + xaddrs[i] = xiters[i].iter_mapaddr; + } + + len = tsize; + switch (parity) { + case 3: + len = MIN(xiters[2].iter_mapsize, len); + len = MIN(citers[2].iter_mapsize, len); + /* falls through */ + case 2: + len = MIN(xiters[1].iter_mapsize, len); + len = MIN(citers[1].iter_mapsize, len); + /* falls through */ + case 1: + len = MIN(xiters[0].iter_mapsize, len); + len = MIN(citers[0].iter_mapsize, len); + } + /* must be progressive */ + ASSERT3S(len, >, 0); + /* + * The iterated function likely will not do well if each + * segment except the last one is not multiple of 512 (raidz). + */ + ASSERT3U(((uint64_t)len & 511ULL), ==, 0); + + func_raidz_rec(xaddrs, len, caddrs, mul); + + for (i = parity-1; i >= 0; i--) { + abd_iter_unmap(&xiters[i]); + abd_iter_unmap(&citers[i]); + abd_iter_advance(&xiters[i], len); + abd_iter_advance(&citers[i], len); + } + + tsize -= len; + ASSERT3S(tsize, >=, 0); + } + local_irq_restore(flags); +} + +#if defined(_KERNEL) +/* + * bio_nr_pages for ABD. + * @off is the offset in @abd + */ +unsigned long +abd_nr_pages_off(abd_t *abd, unsigned int size, size_t off) +{ + unsigned long pos; + + if (abd_is_linear(abd)) + pos = (unsigned long)abd_to_buf(abd) + off; + else + pos = abd->abd_u.abd_scatter.abd_offset + off; + + return ((pos + size + PAGESIZE - 1) >> PAGE_SHIFT) - + (pos >> PAGE_SHIFT); +} + +/* + * bio_map for scatter ABD. 
+ * @off is the offset in @abd + * Remaining IO size is returned + */ +unsigned int +abd_scatter_bio_map_off(struct bio *bio, abd_t *abd, + unsigned int io_size, size_t off) +{ + int i; + struct abd_iter aiter; + + ASSERT(!abd_is_linear(abd)); + ASSERT3U(io_size, <=, abd->abd_size - off); + + abd_iter_init(&aiter, abd, 0); + abd_iter_advance(&aiter, off); + + for (i = 0; i < bio->bi_max_vecs; i++) { + struct page *pg; + size_t len, sgoff, pgoff; + struct scatterlist *sg; + + if (io_size <= 0) + break; + + sg = aiter.iter_sg; + sgoff = aiter.iter_offset; + pgoff = sgoff & (PAGESIZE - 1); + len = MIN(io_size, PAGESIZE - pgoff); + ASSERT(len > 0); + + pg = nth_page(sg_page(sg), sgoff >> PAGE_SHIFT); + if (bio_add_page(bio, pg, len, pgoff) != len) + break; + + io_size -= len; + abd_iter_advance(&aiter, len); + } + + return (io_size); +} + +/* Tunable Parameters */ +module_param(zfs_abd_scatter_enabled, int, 0644); +MODULE_PARM_DESC(zfs_abd_scatter_enabled, + "Toggle whether ABD allocations must be linear."); +module_param(zfs_abd_scatter_min_size, int, 0644); +MODULE_PARM_DESC(zfs_abd_scatter_min_size, + "Minimum size of scatter allocations."); +/* CSTYLED */ +module_param(zfs_abd_scatter_max_order, uint, 0644); +MODULE_PARM_DESC(zfs_abd_scatter_max_order, + "Maximum order allocation used for a scatter ABD."); +#endif diff --git a/module/os/linux/zfs/policy.c b/module/os/linux/zfs/policy.c new file mode 100644 index 000000000..7f9456a67 --- /dev/null +++ b/module/os/linux/zfs/policy.c @@ -0,0 +1,355 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2013, Joyent, Inc. All rights reserved. + * Copyright (C) 2016 Lawrence Livermore National Security, LLC. + * + * For Linux the vast majority of this enforcement is already handled via + * the standard Linux VFS permission checks. However certain administrative + * commands which bypass the standard mechanisms may need to make use of + * this functionality. + */ + +#include <sys/policy.h> +#include <linux/security.h> +#include <linux/vfs_compat.h> + +/* + * The passed credentials cannot be directly verified because Linux only + * provides and interface to check the *current* process credentials. In + * order to handle this the capable() test is only run when the passed + * credentials match the current process credentials or the kcred. In + * all other cases this function must fail and return the passed err. 
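In other words, a policy check made with anything other than the current process credentials (or kcred) always fails with the supplied error, regardless of capabilities. A hypothetical caller of one of the wrappers defined later in this file would therefore look like this (sketch only):

    static int
    example_admin_gate(const cred_t *cr)
    {
            /*
             * Falls through to capable(CAP_SYS_ADMIN) only when cr is the
             * current process credential or kcred; otherwise EACCES.
             */
            if (secpolicy_zfs(cr) != 0)
                    return (SET_ERROR(EACCES));
            return (0);
    }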
+ */ +static int +priv_policy_ns(const cred_t *cr, int capability, boolean_t all, int err, + struct user_namespace *ns) +{ + ASSERT3S(all, ==, B_FALSE); + + if (cr != CRED() && (cr != kcred)) + return (err); + +#if defined(CONFIG_USER_NS) && defined(HAVE_NS_CAPABLE) + if (!(ns ? ns_capable(ns, capability) : capable(capability))) +#else + if (!capable(capability)) +#endif + return (err); + + return (0); +} + +static int +priv_policy(const cred_t *cr, int capability, boolean_t all, int err) +{ + return (priv_policy_ns(cr, capability, all, err, NULL)); +} + +static int +priv_policy_user(const cred_t *cr, int capability, boolean_t all, int err) +{ + /* + * All priv_policy_user checks are preceded by kuid/kgid_has_mapping() + * checks. If we cannot do them, we shouldn't be using ns_capable() + * since we don't know whether the affected files are valid in our + * namespace. Note that kuid_has_mapping() came after cred->user_ns, so + * we shouldn't need to re-check for HAVE_CRED_USER_NS + */ +#if defined(CONFIG_USER_NS) && defined(HAVE_KUID_HAS_MAPPING) + return (priv_policy_ns(cr, capability, all, err, cr->user_ns)); +#else + return (priv_policy_ns(cr, capability, all, err, NULL)); +#endif +} + +/* + * Checks for operations that are either client-only or are used by + * both clients and servers. + */ +int +secpolicy_nfs(const cred_t *cr) +{ + return (priv_policy(cr, CAP_SYS_ADMIN, B_FALSE, EPERM)); +} + +/* + * Catch all system configuration. + */ +int +secpolicy_sys_config(const cred_t *cr, boolean_t checkonly) +{ + return (priv_policy(cr, CAP_SYS_ADMIN, B_FALSE, EPERM)); +} + +/* + * Like secpolicy_vnode_access() but we get the actual wanted mode and the + * current mode of the file, not the missing bits. + * + * Enforced in the Linux VFS. + */ +int +secpolicy_vnode_access2(const cred_t *cr, struct inode *ip, uid_t owner, + mode_t curmode, mode_t wantmode) +{ + return (0); +} + +/* + * This is a special routine for ZFS; it is used to determine whether + * any of the privileges in effect allow any form of access to the + * file. There's no reason to audit this or any reason to record + * this. More work is needed to do the "KPLD" stuff. + */ +int +secpolicy_vnode_any_access(const cred_t *cr, struct inode *ip, uid_t owner) +{ + if (crgetfsuid(cr) == owner) + return (0); + + if (zpl_inode_owner_or_capable(ip)) + return (0); + +#if defined(CONFIG_USER_NS) && defined(HAVE_KUID_HAS_MAPPING) + if (!kuid_has_mapping(cr->user_ns, SUID_TO_KUID(owner))) + return (EPERM); +#endif + + if (priv_policy_user(cr, CAP_DAC_OVERRIDE, B_FALSE, EPERM) == 0) + return (0); + + if (priv_policy_user(cr, CAP_DAC_READ_SEARCH, B_FALSE, EPERM) == 0) + return (0); + + return (EPERM); +} + +/* + * Determine if subject can chown owner of a file. + */ +int +secpolicy_vnode_chown(const cred_t *cr, uid_t owner) +{ + if (crgetfsuid(cr) == owner) + return (0); + +#if defined(CONFIG_USER_NS) && defined(HAVE_KUID_HAS_MAPPING) + if (!kuid_has_mapping(cr->user_ns, SUID_TO_KUID(owner))) + return (EPERM); +#endif + + return (priv_policy_user(cr, CAP_FOWNER, B_FALSE, EPERM)); +} + +/* + * Determine if subject can change group ownership of a file. + */ +int +secpolicy_vnode_create_gid(const cred_t *cr) +{ + return (priv_policy(cr, CAP_SETGID, B_FALSE, EPERM)); +} + +/* + * Policy determines whether we can remove an entry from a directory, + * regardless of permission bits. 
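A hypothetical use in the unlink path (sketch only; the real ZFS logic lives in the ACL code) falls back to this override after the ordinary owner and permission checks have failed:

    static int
    example_delete_override(const cred_t *cr, int acl_error)
    {
            /* CAP_FOWNER lets the caller remove the entry anyway. */
            if (acl_error != 0 && secpolicy_vnode_remove(cr) == 0)
                    return (0);
            return (acl_error);
    }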
+ */ +int +secpolicy_vnode_remove(const cred_t *cr) +{ + return (priv_policy(cr, CAP_FOWNER, B_FALSE, EPERM)); +} + +/* + * Determine that subject can modify the mode of a file. allzone privilege + * needed when modifying root owned object. + */ +int +secpolicy_vnode_setdac(const cred_t *cr, uid_t owner) +{ + if (crgetfsuid(cr) == owner) + return (0); + +#if defined(CONFIG_USER_NS) && defined(HAVE_KUID_HAS_MAPPING) + if (!kuid_has_mapping(cr->user_ns, SUID_TO_KUID(owner))) + return (EPERM); +#endif + + return (priv_policy_user(cr, CAP_FOWNER, B_FALSE, EPERM)); +} + +/* + * Are we allowed to retain the set-uid/set-gid bits when + * changing ownership or when writing to a file? + * "issuid" should be true when set-uid; only in that case + * root ownership is checked (setgid is assumed). + * + * Enforced in the Linux VFS. + */ +int +secpolicy_vnode_setid_retain(const cred_t *cr, boolean_t issuidroot) +{ + return (priv_policy_user(cr, CAP_FSETID, B_FALSE, EPERM)); +} + +/* + * Determine that subject can set the file setgid flag. + */ +int +secpolicy_vnode_setids_setgids(const cred_t *cr, gid_t gid) +{ +#if defined(CONFIG_USER_NS) && defined(HAVE_KUID_HAS_MAPPING) + if (!kgid_has_mapping(cr->user_ns, SGID_TO_KGID(gid))) + return (EPERM); +#endif + if (crgetfsgid(cr) != gid && !groupmember(gid, cr)) + return (priv_policy_user(cr, CAP_FSETID, B_FALSE, EPERM)); + + return (0); +} + +/* + * Determine if the subject can inject faults in the ZFS fault injection + * framework. Requires all privileges. + */ +int +secpolicy_zinject(const cred_t *cr) +{ + return (priv_policy(cr, CAP_SYS_ADMIN, B_FALSE, EACCES)); +} + +/* + * Determine if the subject has permission to manipulate ZFS datasets + * (not pools). Equivalent to the SYS_MOUNT privilege. + */ +int +secpolicy_zfs(const cred_t *cr) +{ + return (priv_policy(cr, CAP_SYS_ADMIN, B_FALSE, EACCES)); +} + +void +secpolicy_setid_clear(vattr_t *vap, cred_t *cr) +{ + if ((vap->va_mode & (S_ISUID | S_ISGID)) != 0 && + secpolicy_vnode_setid_retain(cr, + (vap->va_mode & S_ISUID) != 0 && + (vap->va_mask & AT_UID) != 0 && vap->va_uid == 0) != 0) { + vap->va_mask |= AT_MODE; + vap->va_mode &= ~(S_ISUID|S_ISGID); + } +} + +/* + * Determine that subject can set the file setid flags. + */ +static int +secpolicy_vnode_setid_modify(const cred_t *cr, uid_t owner) +{ + if (crgetfsuid(cr) == owner) + return (0); + +#if defined(CONFIG_USER_NS) && defined(HAVE_KUID_HAS_MAPPING) + if (!kuid_has_mapping(cr->user_ns, SUID_TO_KUID(owner))) + return (EPERM); +#endif + + return (priv_policy_user(cr, CAP_FSETID, B_FALSE, EPERM)); +} + +/* + * Determine that subject can make a file a "sticky". + * + * Enforced in the Linux VFS. + */ +static int +secpolicy_vnode_stky_modify(const cred_t *cr) +{ + return (0); +} + +int +secpolicy_setid_setsticky_clear(struct inode *ip, vattr_t *vap, + const vattr_t *ovap, cred_t *cr) +{ + int error; + + if ((vap->va_mode & S_ISUID) != 0 && + (error = secpolicy_vnode_setid_modify(cr, + ovap->va_uid)) != 0) { + return (error); + } + + /* + * Check privilege if attempting to set the + * sticky bit on a non-directory. + */ + if (!S_ISDIR(ip->i_mode) && (vap->va_mode & S_ISVTX) != 0 && + secpolicy_vnode_stky_modify(cr) != 0) { + vap->va_mode &= ~S_ISVTX; + } + + /* + * Check for privilege if attempting to set the + * group-id bit. 
+ */ + if ((vap->va_mode & S_ISGID) != 0 && + secpolicy_vnode_setids_setgids(cr, ovap->va_gid) != 0) { + vap->va_mode &= ~S_ISGID; + } + + return (0); +} + +/* + * Check privileges for setting xvattr attributes + */ +int +secpolicy_xvattr(xvattr_t *xvap, uid_t owner, cred_t *cr, vtype_t vtype) +{ + return (secpolicy_vnode_chown(cr, owner)); +} + +/* + * Check privileges for setattr attributes. + * + * Enforced in the Linux VFS. + */ +int +secpolicy_vnode_setattr(cred_t *cr, struct inode *ip, struct vattr *vap, + const struct vattr *ovap, int flags, + int unlocked_access(void *, int, cred_t *), void *node) +{ + return (0); +} + +/* + * Check privileges for links. + * + * Enforced in the Linux VFS. + */ +int +secpolicy_basic_link(const cred_t *cr) +{ + return (0); +} diff --git a/module/os/linux/zfs/qat.c b/module/os/linux/zfs/qat.c new file mode 100644 index 000000000..a6f024cb4 --- /dev/null +++ b/module/os/linux/zfs/qat.c @@ -0,0 +1,105 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#if defined(_KERNEL) && defined(HAVE_QAT) +#include <sys/zfs_context.h> +#include "qat.h" + +qat_stats_t qat_stats = { + { "comp_requests", KSTAT_DATA_UINT64 }, + { "comp_total_in_bytes", KSTAT_DATA_UINT64 }, + { "comp_total_out_bytes", KSTAT_DATA_UINT64 }, + { "decomp_requests", KSTAT_DATA_UINT64 }, + { "decomp_total_in_bytes", KSTAT_DATA_UINT64 }, + { "decomp_total_out_bytes", KSTAT_DATA_UINT64 }, + { "dc_fails", KSTAT_DATA_UINT64 }, + { "encrypt_requests", KSTAT_DATA_UINT64 }, + { "encrypt_total_in_bytes", KSTAT_DATA_UINT64 }, + { "encrypt_total_out_bytes", KSTAT_DATA_UINT64 }, + { "decrypt_requests", KSTAT_DATA_UINT64 }, + { "decrypt_total_in_bytes", KSTAT_DATA_UINT64 }, + { "decrypt_total_out_bytes", KSTAT_DATA_UINT64 }, + { "crypt_fails", KSTAT_DATA_UINT64 }, + { "cksum_requests", KSTAT_DATA_UINT64 }, + { "cksum_total_in_bytes", KSTAT_DATA_UINT64 }, + { "cksum_fails", KSTAT_DATA_UINT64 }, +}; + +static kstat_t *qat_ksp = NULL; + +CpaStatus +qat_mem_alloc_contig(void **pp_mem_addr, Cpa32U size_bytes) +{ + *pp_mem_addr = kmalloc(size_bytes, GFP_KERNEL); + if (*pp_mem_addr == NULL) + return (CPA_STATUS_RESOURCE); + return (CPA_STATUS_SUCCESS); +} + +void +qat_mem_free_contig(void **pp_mem_addr) +{ + if (*pp_mem_addr != NULL) { + kfree(*pp_mem_addr); + *pp_mem_addr = NULL; + } +} + +int +qat_init(void) +{ + qat_ksp = kstat_create("zfs", 0, "qat", "misc", + KSTAT_TYPE_NAMED, sizeof (qat_stats) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL); + if (qat_ksp != NULL) { + qat_ksp->ks_data = &qat_stats; + kstat_install(qat_ksp); + } + + /* + * Just set the disable flag when qat init failed, qat can be + * turned on again in post-process after zfs module is loaded, e.g.: + * echo 0 > 
/sys/module/zfs/parameters/zfs_qat_compress_disable + */ + if (qat_dc_init() != 0) + zfs_qat_compress_disable = 1; + + if (qat_cy_init() != 0) { + zfs_qat_checksum_disable = 1; + zfs_qat_encrypt_disable = 1; + } + + return (0); +} + +void +qat_fini(void) +{ + if (qat_ksp != NULL) { + kstat_delete(qat_ksp); + qat_ksp = NULL; + } + + qat_cy_fini(); + qat_dc_fini(); +} + +#endif diff --git a/module/os/linux/zfs/qat_compress.c b/module/os/linux/zfs/qat_compress.c new file mode 100644 index 000000000..4136b6555 --- /dev/null +++ b/module/os/linux/zfs/qat_compress.c @@ -0,0 +1,574 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#if defined(_KERNEL) && defined(HAVE_QAT) +#include <linux/slab.h> +#include <linux/vmalloc.h> +#include <linux/pagemap.h> +#include <linux/completion.h> +#include <sys/zfs_context.h> +#include <sys/byteorder.h> +#include <sys/zio.h> +#include "qat.h" + +/* + * Max instances in a QAT device, each instance is a channel to submit + * jobs to QAT hardware, this is only for pre-allocating instance and + * session arrays; the actual number of instances are defined in the + * QAT driver's configuration file. 
+ */ +#define QAT_DC_MAX_INSTANCES 48 + +/* + * ZLIB head and foot size + */ +#define ZLIB_HEAD_SZ 2 +#define ZLIB_FOOT_SZ 4 + +static CpaInstanceHandle dc_inst_handles[QAT_DC_MAX_INSTANCES]; +static CpaDcSessionHandle session_handles[QAT_DC_MAX_INSTANCES]; +static CpaBufferList **buffer_array[QAT_DC_MAX_INSTANCES]; +static Cpa16U num_inst = 0; +static Cpa32U inst_num = 0; +static boolean_t qat_dc_init_done = B_FALSE; +int zfs_qat_compress_disable = 0; + +boolean_t +qat_dc_use_accel(size_t s_len) +{ + return (!zfs_qat_compress_disable && + qat_dc_init_done && + s_len >= QAT_MIN_BUF_SIZE && + s_len <= QAT_MAX_BUF_SIZE); +} + +static void +qat_dc_callback(void *p_callback, CpaStatus status) +{ + if (p_callback != NULL) + complete((struct completion *)p_callback); +} + +static void +qat_dc_clean(void) +{ + Cpa16U buff_num = 0; + Cpa16U num_inter_buff_lists = 0; + + for (Cpa16U i = 0; i < num_inst; i++) { + cpaDcStopInstance(dc_inst_handles[i]); + QAT_PHYS_CONTIG_FREE(session_handles[i]); + /* free intermediate buffers */ + if (buffer_array[i] != NULL) { + cpaDcGetNumIntermediateBuffers( + dc_inst_handles[i], &num_inter_buff_lists); + for (buff_num = 0; buff_num < num_inter_buff_lists; + buff_num++) { + CpaBufferList *buffer_inter = + buffer_array[i][buff_num]; + if (buffer_inter->pBuffers) { + QAT_PHYS_CONTIG_FREE( + buffer_inter->pBuffers->pData); + QAT_PHYS_CONTIG_FREE( + buffer_inter->pBuffers); + } + QAT_PHYS_CONTIG_FREE( + buffer_inter->pPrivateMetaData); + QAT_PHYS_CONTIG_FREE(buffer_inter); + } + } + } + + num_inst = 0; + qat_dc_init_done = B_FALSE; +} + +int +qat_dc_init(void) +{ + CpaStatus status = CPA_STATUS_SUCCESS; + Cpa32U sess_size = 0; + Cpa32U ctx_size = 0; + Cpa16U num_inter_buff_lists = 0; + Cpa16U buff_num = 0; + Cpa32U buff_meta_size = 0; + CpaDcSessionSetupData sd = {0}; + + if (qat_dc_init_done) + return (0); + + status = cpaDcGetNumInstances(&num_inst); + if (status != CPA_STATUS_SUCCESS) + return (-1); + + /* if the user has configured no QAT compression units just return */ + if (num_inst == 0) + return (0); + + if (num_inst > QAT_DC_MAX_INSTANCES) + num_inst = QAT_DC_MAX_INSTANCES; + + status = cpaDcGetInstances(num_inst, &dc_inst_handles[0]); + if (status != CPA_STATUS_SUCCESS) + return (-1); + + for (Cpa16U i = 0; i < num_inst; i++) { + cpaDcSetAddressTranslation(dc_inst_handles[i], + (void*)virt_to_phys); + + status = cpaDcBufferListGetMetaSize(dc_inst_handles[i], + 1, &buff_meta_size); + + if (status == CPA_STATUS_SUCCESS) + status = cpaDcGetNumIntermediateBuffers( + dc_inst_handles[i], &num_inter_buff_lists); + + if (status == CPA_STATUS_SUCCESS && num_inter_buff_lists != 0) + status = QAT_PHYS_CONTIG_ALLOC(&buffer_array[i], + num_inter_buff_lists * + sizeof (CpaBufferList *)); + + for (buff_num = 0; buff_num < num_inter_buff_lists; + buff_num++) { + if (status == CPA_STATUS_SUCCESS) + status = QAT_PHYS_CONTIG_ALLOC( + &buffer_array[i][buff_num], + sizeof (CpaBufferList)); + + if (status == CPA_STATUS_SUCCESS) + status = QAT_PHYS_CONTIG_ALLOC( + &buffer_array[i][buff_num]-> + pPrivateMetaData, + buff_meta_size); + + if (status == CPA_STATUS_SUCCESS) + status = QAT_PHYS_CONTIG_ALLOC( + &buffer_array[i][buff_num]->pBuffers, + sizeof (CpaFlatBuffer)); + + if (status == CPA_STATUS_SUCCESS) { + /* + * implementation requires an intermediate + * buffer approximately twice the size of + * output buffer, which is 2x max buffer + * size here. 
+ */ + status = QAT_PHYS_CONTIG_ALLOC( + &buffer_array[i][buff_num]->pBuffers-> + pData, 2 * QAT_MAX_BUF_SIZE); + if (status != CPA_STATUS_SUCCESS) + goto fail; + + buffer_array[i][buff_num]->numBuffers = 1; + buffer_array[i][buff_num]->pBuffers-> + dataLenInBytes = 2 * QAT_MAX_BUF_SIZE; + } + } + + status = cpaDcStartInstance(dc_inst_handles[i], + num_inter_buff_lists, buffer_array[i]); + if (status != CPA_STATUS_SUCCESS) + goto fail; + + sd.compLevel = CPA_DC_L1; + sd.compType = CPA_DC_DEFLATE; + sd.huffType = CPA_DC_HT_FULL_DYNAMIC; + sd.sessDirection = CPA_DC_DIR_COMBINED; + sd.sessState = CPA_DC_STATELESS; + sd.deflateWindowSize = 7; + sd.checksum = CPA_DC_ADLER32; + status = cpaDcGetSessionSize(dc_inst_handles[i], + &sd, &sess_size, &ctx_size); + if (status != CPA_STATUS_SUCCESS) + goto fail; + + QAT_PHYS_CONTIG_ALLOC(&session_handles[i], sess_size); + if (session_handles[i] == NULL) + goto fail; + + status = cpaDcInitSession(dc_inst_handles[i], + session_handles[i], + &sd, NULL, qat_dc_callback); + if (status != CPA_STATUS_SUCCESS) + goto fail; + } + + qat_dc_init_done = B_TRUE; + return (0); +fail: + qat_dc_clean(); + return (-1); +} + +void +qat_dc_fini(void) +{ + if (!qat_dc_init_done) + return; + + qat_dc_clean(); +} + +/* + * The "add" parameter is an additional buffer which is passed + * to QAT as a scratch buffer alongside the destination buffer + * in case the "compressed" data ends up being larger than the + * original source data. This is necessary to prevent QAT from + * generating buffer overflow warnings for incompressible data. + */ +static int +qat_compress_impl(qat_compress_dir_t dir, char *src, int src_len, + char *dst, int dst_len, char *add, int add_len, size_t *c_len) +{ + CpaInstanceHandle dc_inst_handle; + CpaDcSessionHandle session_handle; + CpaBufferList *buf_list_src = NULL; + CpaBufferList *buf_list_dst = NULL; + CpaFlatBuffer *flat_buf_src = NULL; + CpaFlatBuffer *flat_buf_dst = NULL; + Cpa8U *buffer_meta_src = NULL; + Cpa8U *buffer_meta_dst = NULL; + Cpa32U buffer_meta_size = 0; + CpaDcRqResults dc_results; + CpaStatus status = CPA_STATUS_SUCCESS; + Cpa32U hdr_sz = 0; + Cpa32U compressed_sz; + Cpa32U num_src_buf = (src_len >> PAGE_SHIFT) + 2; + Cpa32U num_dst_buf = (dst_len >> PAGE_SHIFT) + 2; + Cpa32U num_add_buf = (add_len >> PAGE_SHIFT) + 2; + Cpa32U bytes_left; + Cpa32U dst_pages = 0; + Cpa32U adler32 = 0; + char *data; + struct page *page; + struct page **in_pages = NULL; + struct page **out_pages = NULL; + struct page **add_pages = NULL; + Cpa32U page_off = 0; + struct completion complete; + Cpa32U page_num = 0; + Cpa16U i; + + /* + * We increment num_src_buf and num_dst_buf by 2 to allow + * us to handle non page-aligned buffer addresses and buffers + * whose sizes are not divisible by PAGE_SIZE. 
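As a worked example of why two extra entries are needed (assuming 4 KiB pages): a 16 KiB source buffer that starts 100 bytes into a page spans ceil((100 + 16384) / 4096) = 5 pages, while src_len >> PAGE_SHIFT accounts for only 4; the two additional flat buffers cover the partial leading and trailing pages.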
+ */ + Cpa32U src_buffer_list_mem_size = sizeof (CpaBufferList) + + (num_src_buf * sizeof (CpaFlatBuffer)); + Cpa32U dst_buffer_list_mem_size = sizeof (CpaBufferList) + + ((num_dst_buf + num_add_buf) * sizeof (CpaFlatBuffer)); + + if (QAT_PHYS_CONTIG_ALLOC(&in_pages, + num_src_buf * sizeof (struct page *)) != CPA_STATUS_SUCCESS) + goto fail; + + if (QAT_PHYS_CONTIG_ALLOC(&out_pages, + num_dst_buf * sizeof (struct page *)) != CPA_STATUS_SUCCESS) + goto fail; + + if (QAT_PHYS_CONTIG_ALLOC(&add_pages, + num_add_buf * sizeof (struct page *)) != CPA_STATUS_SUCCESS) + goto fail; + + i = (Cpa32U)atomic_inc_32_nv(&inst_num) % num_inst; + dc_inst_handle = dc_inst_handles[i]; + session_handle = session_handles[i]; + + cpaDcBufferListGetMetaSize(dc_inst_handle, num_src_buf, + &buffer_meta_size); + if (QAT_PHYS_CONTIG_ALLOC(&buffer_meta_src, buffer_meta_size) != + CPA_STATUS_SUCCESS) + goto fail; + + cpaDcBufferListGetMetaSize(dc_inst_handle, num_dst_buf + num_add_buf, + &buffer_meta_size); + if (QAT_PHYS_CONTIG_ALLOC(&buffer_meta_dst, buffer_meta_size) != + CPA_STATUS_SUCCESS) + goto fail; + + /* build source buffer list */ + if (QAT_PHYS_CONTIG_ALLOC(&buf_list_src, src_buffer_list_mem_size) != + CPA_STATUS_SUCCESS) + goto fail; + + flat_buf_src = (CpaFlatBuffer *)(buf_list_src + 1); + + buf_list_src->pBuffers = flat_buf_src; /* always point to first one */ + + /* build destination buffer list */ + if (QAT_PHYS_CONTIG_ALLOC(&buf_list_dst, dst_buffer_list_mem_size) != + CPA_STATUS_SUCCESS) + goto fail; + + flat_buf_dst = (CpaFlatBuffer *)(buf_list_dst + 1); + + buf_list_dst->pBuffers = flat_buf_dst; /* always point to first one */ + + buf_list_src->numBuffers = 0; + buf_list_src->pPrivateMetaData = buffer_meta_src; + bytes_left = src_len; + data = src; + page_num = 0; + while (bytes_left > 0) { + page_off = ((long)data & ~PAGE_MASK); + page = qat_mem_to_page(data); + in_pages[page_num] = page; + flat_buf_src->pData = kmap(page) + page_off; + flat_buf_src->dataLenInBytes = + min((long)PAGE_SIZE - page_off, (long)bytes_left); + + bytes_left -= flat_buf_src->dataLenInBytes; + data += flat_buf_src->dataLenInBytes; + flat_buf_src++; + buf_list_src->numBuffers++; + page_num++; + } + + buf_list_dst->numBuffers = 0; + buf_list_dst->pPrivateMetaData = buffer_meta_dst; + bytes_left = dst_len; + data = dst; + page_num = 0; + while (bytes_left > 0) { + page_off = ((long)data & ~PAGE_MASK); + page = qat_mem_to_page(data); + flat_buf_dst->pData = kmap(page) + page_off; + out_pages[page_num] = page; + flat_buf_dst->dataLenInBytes = + min((long)PAGE_SIZE - page_off, (long)bytes_left); + + bytes_left -= flat_buf_dst->dataLenInBytes; + data += flat_buf_dst->dataLenInBytes; + flat_buf_dst++; + buf_list_dst->numBuffers++; + page_num++; + dst_pages++; + } + + /* map additional scratch pages into the destination buffer list */ + bytes_left = add_len; + data = add; + page_num = 0; + while (bytes_left > 0) { + page_off = ((long)data & ~PAGE_MASK); + page = qat_mem_to_page(data); + flat_buf_dst->pData = kmap(page) + page_off; + add_pages[page_num] = page; + flat_buf_dst->dataLenInBytes = + min((long)PAGE_SIZE - page_off, (long)bytes_left); + + bytes_left -= flat_buf_dst->dataLenInBytes; + data += flat_buf_dst->dataLenInBytes; + flat_buf_dst++; + buf_list_dst->numBuffers++; + page_num++; + } + + init_completion(&complete); + + if (dir == QAT_COMPRESS) { + QAT_STAT_BUMP(comp_requests); + QAT_STAT_INCR(comp_total_in_bytes, src_len); + + cpaDcGenerateHeader(session_handle, + buf_list_dst->pBuffers, &hdr_sz); + 
buf_list_dst->pBuffers->pData += hdr_sz; + buf_list_dst->pBuffers->dataLenInBytes -= hdr_sz; + status = cpaDcCompressData( + dc_inst_handle, session_handle, + buf_list_src, buf_list_dst, + &dc_results, CPA_DC_FLUSH_FINAL, + &complete); + if (status != CPA_STATUS_SUCCESS) { + goto fail; + } + + /* we now wait until the completion of the operation. */ + if (!wait_for_completion_interruptible_timeout(&complete, + QAT_TIMEOUT_MS)) { + status = CPA_STATUS_FAIL; + goto fail; + } + + if (dc_results.status != CPA_STATUS_SUCCESS) { + status = CPA_STATUS_FAIL; + goto fail; + } + + compressed_sz = dc_results.produced; + if (compressed_sz + hdr_sz + ZLIB_FOOT_SZ > dst_len) { + status = CPA_STATUS_INCOMPRESSIBLE; + goto fail; + } + + flat_buf_dst = (CpaFlatBuffer *)(buf_list_dst + 1); + /* move to the last page */ + flat_buf_dst += (compressed_sz + hdr_sz) >> PAGE_SHIFT; + + /* no space for gzip footer in the last page */ + if (((compressed_sz + hdr_sz) % PAGE_SIZE) + + ZLIB_FOOT_SZ > PAGE_SIZE) { + status = CPA_STATUS_INCOMPRESSIBLE; + goto fail; + } + + /* jump to the end of the buffer and append footer */ + flat_buf_dst->pData = + (char *)((unsigned long)flat_buf_dst->pData & PAGE_MASK) + + ((compressed_sz + hdr_sz) % PAGE_SIZE); + flat_buf_dst->dataLenInBytes = ZLIB_FOOT_SZ; + + dc_results.produced = 0; + status = cpaDcGenerateFooter(session_handle, + flat_buf_dst, &dc_results); + if (status != CPA_STATUS_SUCCESS) + goto fail; + + *c_len = compressed_sz + dc_results.produced + hdr_sz; + QAT_STAT_INCR(comp_total_out_bytes, *c_len); + } else { + ASSERT3U(dir, ==, QAT_DECOMPRESS); + QAT_STAT_BUMP(decomp_requests); + QAT_STAT_INCR(decomp_total_in_bytes, src_len); + + buf_list_src->pBuffers->pData += ZLIB_HEAD_SZ; + buf_list_src->pBuffers->dataLenInBytes -= ZLIB_HEAD_SZ; + status = cpaDcDecompressData(dc_inst_handle, session_handle, + buf_list_src, buf_list_dst, &dc_results, CPA_DC_FLUSH_FINAL, + &complete); + + if (CPA_STATUS_SUCCESS != status) { + status = CPA_STATUS_FAIL; + goto fail; + } + + /* we now wait until the completion of the operation. */ + if (!wait_for_completion_interruptible_timeout(&complete, + QAT_TIMEOUT_MS)) { + status = CPA_STATUS_FAIL; + goto fail; + } + + if (dc_results.status != CPA_STATUS_SUCCESS) { + status = CPA_STATUS_FAIL; + goto fail; + } + + /* verify adler checksum */ + adler32 = *(Cpa32U *)(src + dc_results.consumed + ZLIB_HEAD_SZ); + if (adler32 != BSWAP_32(dc_results.checksum)) { + status = CPA_STATUS_FAIL; + goto fail; + } + *c_len = dc_results.produced; + QAT_STAT_INCR(decomp_total_out_bytes, *c_len); + } + +fail: + if (status != CPA_STATUS_SUCCESS && status != CPA_STATUS_INCOMPRESSIBLE) + QAT_STAT_BUMP(dc_fails); + + if (in_pages) { + for (page_num = 0; + page_num < buf_list_src->numBuffers; + page_num++) { + kunmap(in_pages[page_num]); + } + QAT_PHYS_CONTIG_FREE(in_pages); + } + + if (out_pages) { + for (page_num = 0; page_num < dst_pages; page_num++) { + kunmap(out_pages[page_num]); + } + QAT_PHYS_CONTIG_FREE(out_pages); + } + + if (add_pages) { + for (page_num = 0; + page_num < buf_list_dst->numBuffers - dst_pages; + page_num++) { + kunmap(add_pages[page_num]); + } + QAT_PHYS_CONTIG_FREE(add_pages); + } + + QAT_PHYS_CONTIG_FREE(buffer_meta_src); + QAT_PHYS_CONTIG_FREE(buffer_meta_dst); + QAT_PHYS_CONTIG_FREE(buf_list_src); + QAT_PHYS_CONTIG_FREE(buf_list_dst); + + return (status); +} + +/* + * Entry point for QAT accelerated compression / decompression. 
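A hypothetical caller in a compression routine gates on qat_dc_use_accel() and falls back to the software path when the hardware declines or fails (sketch only; software_compress() is a placeholder name for the fallback):

    static size_t
    example_compress(char *src, char *dst, int s_len, int d_len)
    {
            size_t c_len;

            if (qat_dc_use_accel(s_len) &&
                qat_compress(QAT_COMPRESS, src, s_len, dst, d_len,
                &c_len) == CPA_STATUS_SUCCESS)
                    return (c_len);

            /* Hardware path unavailable or incompressible; use software. */
            return (software_compress(src, dst, s_len, d_len));
    }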
+ */ +int +qat_compress(qat_compress_dir_t dir, char *src, int src_len, + char *dst, int dst_len, size_t *c_len) +{ + int ret; + size_t add_len = 0; + void *add = NULL; + + if (dir == QAT_COMPRESS) { + add_len = dst_len; + add = zio_data_buf_alloc(add_len); + } + + ret = qat_compress_impl(dir, src, src_len, dst, + dst_len, add, add_len, c_len); + + if (dir == QAT_COMPRESS) + zio_data_buf_free(add, add_len); + + return (ret); +} + +static int +param_set_qat_compress(const char *val, zfs_kernel_param_t *kp) +{ + int ret; + int *pvalue = kp->arg; + ret = param_set_int(val, kp); + if (ret) + return (ret); + /* + * zfs_qat_compress_disable = 0: enable qat compress + * try to initialize qat instance if it has not been done + */ + if (*pvalue == 0 && !qat_dc_init_done) { + ret = qat_dc_init(); + if (ret != 0) { + zfs_qat_compress_disable = 1; + return (ret); + } + } + return (ret); +} + +module_param_call(zfs_qat_compress_disable, param_set_qat_compress, + param_get_int, &zfs_qat_compress_disable, 0644); +MODULE_PARM_DESC(zfs_qat_compress_disable, "Enable/Disable QAT compression"); + +#endif diff --git a/module/os/linux/zfs/qat_crypt.c b/module/os/linux/zfs/qat_crypt.c new file mode 100644 index 000000000..02e19d21d --- /dev/null +++ b/module/os/linux/zfs/qat_crypt.c @@ -0,0 +1,631 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * This file represents the QAT implementation of checksums and encryption. + * Internally, QAT shares the same cryptographic instances for both of these + * operations, so the code has been combined here. QAT data compression uses + * compression instances, so that code is separated into qat_compress.c + */ + +#if defined(_KERNEL) && defined(HAVE_QAT) +#include <linux/slab.h> +#include <linux/vmalloc.h> +#include <linux/pagemap.h> +#include <linux/completion.h> +#include <sys/zfs_context.h> +#include <sys/zio_crypt.h> +#include "lac/cpa_cy_im.h" +#include "lac/cpa_cy_common.h" +#include "qat.h" + +/* + * Max instances in a QAT device, each instance is a channel to submit + * jobs to QAT hardware, this is only for pre-allocating instances + * and session arrays; the actual number of instances are defined in + * the QAT driver's configure file. 
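Although the handle and session arrays below are sized for this compile-time maximum, only the instances the driver actually reports (clamped to that maximum) are started, and each request then picks one of them round-robin. A condensed sketch of that selection, using the inst_num, num_inst and cy_inst_handles statics defined just below (the helper name itself is made up for illustration):

	static CpaInstanceHandle
	qat_cy_pick_instance(void)
	{
		/* same per-request selection used by qat_crypt()/qat_checksum() */
		Cpa16U i = (Cpa32U)atomic_inc_32_nv(&inst_num) % num_inst;

		return (cy_inst_handles[i]);
	}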
+ */ +#define QAT_CRYPT_MAX_INSTANCES 48 + +#define MAX_PAGE_NUM 1024 + +static Cpa32U inst_num = 0; +static Cpa16U num_inst = 0; +static CpaInstanceHandle cy_inst_handles[QAT_CRYPT_MAX_INSTANCES]; +static boolean_t qat_cy_init_done = B_FALSE; +int zfs_qat_encrypt_disable = 0; +int zfs_qat_checksum_disable = 0; + +typedef struct cy_callback { + CpaBoolean verify_result; + struct completion complete; +} cy_callback_t; + +static void +symcallback(void *p_callback, CpaStatus status, const CpaCySymOp operation, + void *op_data, CpaBufferList *buf_list_dst, CpaBoolean verify) +{ + cy_callback_t *cb = p_callback; + + if (cb != NULL) { + /* indicate that the function has been called */ + cb->verify_result = verify; + complete(&cb->complete); + } +} + +boolean_t +qat_crypt_use_accel(size_t s_len) +{ + return (!zfs_qat_encrypt_disable && + qat_cy_init_done && + s_len >= QAT_MIN_BUF_SIZE && + s_len <= QAT_MAX_BUF_SIZE); +} + +boolean_t +qat_checksum_use_accel(size_t s_len) +{ + return (!zfs_qat_checksum_disable && + qat_cy_init_done && + s_len >= QAT_MIN_BUF_SIZE && + s_len <= QAT_MAX_BUF_SIZE); +} + +void +qat_cy_clean(void) +{ + for (Cpa16U i = 0; i < num_inst; i++) + cpaCyStopInstance(cy_inst_handles[i]); + + num_inst = 0; + qat_cy_init_done = B_FALSE; +} + +int +qat_cy_init(void) +{ + CpaStatus status = CPA_STATUS_FAIL; + + if (qat_cy_init_done) + return (0); + + status = cpaCyGetNumInstances(&num_inst); + if (status != CPA_STATUS_SUCCESS) + return (-1); + + /* if the user has configured no QAT encryption units just return */ + if (num_inst == 0) + return (0); + + if (num_inst > QAT_CRYPT_MAX_INSTANCES) + num_inst = QAT_CRYPT_MAX_INSTANCES; + + status = cpaCyGetInstances(num_inst, &cy_inst_handles[0]); + if (status != CPA_STATUS_SUCCESS) + return (-1); + + for (Cpa16U i = 0; i < num_inst; i++) { + status = cpaCySetAddressTranslation(cy_inst_handles[i], + (void *)virt_to_phys); + if (status != CPA_STATUS_SUCCESS) + goto error; + + status = cpaCyStartInstance(cy_inst_handles[i]); + if (status != CPA_STATUS_SUCCESS) + goto error; + } + + qat_cy_init_done = B_TRUE; + return (0); + +error: + qat_cy_clean(); + return (-1); +} + +void +qat_cy_fini(void) +{ + if (!qat_cy_init_done) + return; + + qat_cy_clean(); +} + +static CpaStatus +qat_init_crypt_session_ctx(qat_encrypt_dir_t dir, CpaInstanceHandle inst_handle, + CpaCySymSessionCtx **cy_session_ctx, crypto_key_t *key, + Cpa64U crypt, Cpa32U aad_len) +{ + CpaStatus status = CPA_STATUS_SUCCESS; + Cpa32U ctx_size; + Cpa32U ciper_algorithm; + Cpa32U hash_algorithm; + CpaCySymSessionSetupData sd = { 0 }; + + if (zio_crypt_table[crypt].ci_crypt_type == ZC_TYPE_CCM) { + return (CPA_STATUS_FAIL); + } else { + ciper_algorithm = CPA_CY_SYM_CIPHER_AES_GCM; + hash_algorithm = CPA_CY_SYM_HASH_AES_GCM; + } + + sd.cipherSetupData.cipherAlgorithm = ciper_algorithm; + sd.cipherSetupData.pCipherKey = key->ck_data; + sd.cipherSetupData.cipherKeyLenInBytes = key->ck_length / 8; + sd.hashSetupData.hashAlgorithm = hash_algorithm; + sd.hashSetupData.hashMode = CPA_CY_SYM_HASH_MODE_AUTH; + sd.hashSetupData.digestResultLenInBytes = ZIO_DATA_MAC_LEN; + sd.hashSetupData.authModeSetupData.aadLenInBytes = aad_len; + sd.sessionPriority = CPA_CY_PRIORITY_NORMAL; + sd.symOperation = CPA_CY_SYM_OP_ALGORITHM_CHAINING; + sd.digestIsAppended = CPA_FALSE; + sd.verifyDigest = CPA_FALSE; + + if (dir == QAT_ENCRYPT) { + sd.cipherSetupData.cipherDirection = + CPA_CY_SYM_CIPHER_DIRECTION_ENCRYPT; + sd.algChainOrder = + CPA_CY_SYM_ALG_CHAIN_ORDER_HASH_THEN_CIPHER; + } else { + ASSERT3U(dir, 
==, QAT_DECRYPT); + sd.cipherSetupData.cipherDirection = + CPA_CY_SYM_CIPHER_DIRECTION_DECRYPT; + sd.algChainOrder = + CPA_CY_SYM_ALG_CHAIN_ORDER_CIPHER_THEN_HASH; + } + + status = cpaCySymSessionCtxGetSize(inst_handle, &sd, &ctx_size); + if (status != CPA_STATUS_SUCCESS) + return (status); + + status = QAT_PHYS_CONTIG_ALLOC(cy_session_ctx, ctx_size); + if (status != CPA_STATUS_SUCCESS) + return (status); + + status = cpaCySymInitSession(inst_handle, symcallback, &sd, + *cy_session_ctx); + if (status != CPA_STATUS_SUCCESS) { + QAT_PHYS_CONTIG_FREE(*cy_session_ctx); + return (status); + } + + return (CPA_STATUS_SUCCESS); +} + +static CpaStatus +qat_init_checksum_session_ctx(CpaInstanceHandle inst_handle, + CpaCySymSessionCtx **cy_session_ctx, Cpa64U cksum) +{ + CpaStatus status = CPA_STATUS_SUCCESS; + Cpa32U ctx_size; + Cpa32U hash_algorithm; + CpaCySymSessionSetupData sd = { 0 }; + + /* + * ZFS's SHA512 checksum is actually SHA512/256, which uses + * a different IV from standard SHA512. QAT does not support + * SHA512/256, so we can only support SHA256. + */ + if (cksum == ZIO_CHECKSUM_SHA256) + hash_algorithm = CPA_CY_SYM_HASH_SHA256; + else + return (CPA_STATUS_FAIL); + + sd.sessionPriority = CPA_CY_PRIORITY_NORMAL; + sd.symOperation = CPA_CY_SYM_OP_HASH; + sd.hashSetupData.hashAlgorithm = hash_algorithm; + sd.hashSetupData.hashMode = CPA_CY_SYM_HASH_MODE_PLAIN; + sd.hashSetupData.digestResultLenInBytes = sizeof (zio_cksum_t); + sd.digestIsAppended = CPA_FALSE; + sd.verifyDigest = CPA_FALSE; + + status = cpaCySymSessionCtxGetSize(inst_handle, &sd, &ctx_size); + if (status != CPA_STATUS_SUCCESS) + return (status); + + status = QAT_PHYS_CONTIG_ALLOC(cy_session_ctx, ctx_size); + if (status != CPA_STATUS_SUCCESS) + return (status); + + status = cpaCySymInitSession(inst_handle, symcallback, &sd, + *cy_session_ctx); + if (status != CPA_STATUS_SUCCESS) { + QAT_PHYS_CONTIG_FREE(*cy_session_ctx); + return (status); + } + + return (CPA_STATUS_SUCCESS); +} + +static CpaStatus +qat_init_cy_buffer_lists(CpaInstanceHandle inst_handle, uint32_t nr_bufs, + CpaBufferList *src, CpaBufferList *dst) +{ + CpaStatus status = CPA_STATUS_SUCCESS; + Cpa32U meta_size = 0; + + status = cpaCyBufferListGetMetaSize(inst_handle, nr_bufs, &meta_size); + if (status != CPA_STATUS_SUCCESS) + return (status); + + status = QAT_PHYS_CONTIG_ALLOC(&src->pPrivateMetaData, meta_size); + if (status != CPA_STATUS_SUCCESS) + goto error; + + if (src != dst) { + status = QAT_PHYS_CONTIG_ALLOC(&dst->pPrivateMetaData, + meta_size); + if (status != CPA_STATUS_SUCCESS) + goto error; + } + + return (CPA_STATUS_SUCCESS); + +error: + QAT_PHYS_CONTIG_FREE(src->pPrivateMetaData); + if (src != dst) + QAT_PHYS_CONTIG_FREE(dst->pPrivateMetaData); + + return (status); +} + +int +qat_crypt(qat_encrypt_dir_t dir, uint8_t *src_buf, uint8_t *dst_buf, + uint8_t *aad_buf, uint32_t aad_len, uint8_t *iv_buf, uint8_t *digest_buf, + crypto_key_t *key, uint64_t crypt, uint32_t enc_len) +{ + CpaStatus status = CPA_STATUS_SUCCESS; + Cpa16U i; + CpaInstanceHandle cy_inst_handle; + Cpa16U nr_bufs = (enc_len >> PAGE_SHIFT) + 2; + Cpa32U bytes_left = 0; + Cpa8S *data = NULL; + CpaCySymSessionCtx *cy_session_ctx = NULL; + cy_callback_t cb; + CpaCySymOpData op_data = { 0 }; + CpaBufferList src_buffer_list = { 0 }; + CpaBufferList dst_buffer_list = { 0 }; + CpaFlatBuffer *flat_src_buf_array = NULL; + CpaFlatBuffer *flat_src_buf = NULL; + CpaFlatBuffer *flat_dst_buf_array = NULL; + CpaFlatBuffer *flat_dst_buf = NULL; + struct page *in_pages[MAX_PAGE_NUM]; + struct 
page *out_pages[MAX_PAGE_NUM]; + Cpa32U in_page_num = 0; + Cpa32U out_page_num = 0; + Cpa32U in_page_off = 0; + Cpa32U out_page_off = 0; + + if (dir == QAT_ENCRYPT) { + QAT_STAT_BUMP(encrypt_requests); + QAT_STAT_INCR(encrypt_total_in_bytes, enc_len); + } else { + QAT_STAT_BUMP(decrypt_requests); + QAT_STAT_INCR(decrypt_total_in_bytes, enc_len); + } + + i = (Cpa32U)atomic_inc_32_nv(&inst_num) % num_inst; + cy_inst_handle = cy_inst_handles[i]; + + status = qat_init_crypt_session_ctx(dir, cy_inst_handle, + &cy_session_ctx, key, crypt, aad_len); + if (status != CPA_STATUS_SUCCESS) { + /* don't count CCM as a failure since it's not supported */ + if (zio_crypt_table[crypt].ci_crypt_type == ZC_TYPE_GCM) + QAT_STAT_BUMP(crypt_fails); + return (status); + } + + /* + * We increment nr_bufs by 2 to allow us to handle non + * page-aligned buffer addresses and buffers whose sizes + * are not divisible by PAGE_SIZE. + */ + status = qat_init_cy_buffer_lists(cy_inst_handle, nr_bufs, + &src_buffer_list, &dst_buffer_list); + if (status != CPA_STATUS_SUCCESS) + goto fail; + + status = QAT_PHYS_CONTIG_ALLOC(&flat_src_buf_array, + nr_bufs * sizeof (CpaFlatBuffer)); + if (status != CPA_STATUS_SUCCESS) + goto fail; + status = QAT_PHYS_CONTIG_ALLOC(&flat_dst_buf_array, + nr_bufs * sizeof (CpaFlatBuffer)); + if (status != CPA_STATUS_SUCCESS) + goto fail; + status = QAT_PHYS_CONTIG_ALLOC(&op_data.pDigestResult, + ZIO_DATA_MAC_LEN); + if (status != CPA_STATUS_SUCCESS) + goto fail; + status = QAT_PHYS_CONTIG_ALLOC(&op_data.pIv, + ZIO_DATA_IV_LEN); + if (status != CPA_STATUS_SUCCESS) + goto fail; + if (aad_len > 0) { + status = QAT_PHYS_CONTIG_ALLOC(&op_data.pAdditionalAuthData, + aad_len); + if (status != CPA_STATUS_SUCCESS) + goto fail; + bcopy(aad_buf, op_data.pAdditionalAuthData, aad_len); + } + + bytes_left = enc_len; + data = src_buf; + flat_src_buf = flat_src_buf_array; + while (bytes_left > 0) { + in_page_off = ((long)data & ~PAGE_MASK); + in_pages[in_page_num] = qat_mem_to_page(data); + flat_src_buf->pData = kmap(in_pages[in_page_num]) + in_page_off; + flat_src_buf->dataLenInBytes = + min((long)PAGE_SIZE - in_page_off, (long)bytes_left); + data += flat_src_buf->dataLenInBytes; + bytes_left -= flat_src_buf->dataLenInBytes; + flat_src_buf++; + in_page_num++; + } + src_buffer_list.pBuffers = flat_src_buf_array; + src_buffer_list.numBuffers = in_page_num; + + bytes_left = enc_len; + data = dst_buf; + flat_dst_buf = flat_dst_buf_array; + while (bytes_left > 0) { + out_page_off = ((long)data & ~PAGE_MASK); + out_pages[out_page_num] = qat_mem_to_page(data); + flat_dst_buf->pData = kmap(out_pages[out_page_num]) + + out_page_off; + flat_dst_buf->dataLenInBytes = + min((long)PAGE_SIZE - out_page_off, (long)bytes_left); + data += flat_dst_buf->dataLenInBytes; + bytes_left -= flat_dst_buf->dataLenInBytes; + flat_dst_buf++; + out_page_num++; + } + dst_buffer_list.pBuffers = flat_dst_buf_array; + dst_buffer_list.numBuffers = out_page_num; + + op_data.sessionCtx = cy_session_ctx; + op_data.packetType = CPA_CY_SYM_PACKET_TYPE_FULL; + op_data.cryptoStartSrcOffsetInBytes = 0; + op_data.messageLenToCipherInBytes = 0; + op_data.hashStartSrcOffsetInBytes = 0; + op_data.messageLenToHashInBytes = 0; + op_data.messageLenToCipherInBytes = enc_len; + op_data.ivLenInBytes = ZIO_DATA_IV_LEN; + bcopy(iv_buf, op_data.pIv, ZIO_DATA_IV_LEN); + + cb.verify_result = CPA_FALSE; + init_completion(&cb.complete); + status = cpaCySymPerformOp(cy_inst_handle, &cb, &op_data, + &src_buffer_list, &dst_buffer_list, NULL); + if (status != 
CPA_STATUS_SUCCESS) + goto fail; + + if (!wait_for_completion_interruptible_timeout(&cb.complete, + QAT_TIMEOUT_MS)) { + status = CPA_STATUS_FAIL; + goto fail; + } + + if (cb.verify_result == CPA_FALSE) { + status = CPA_STATUS_FAIL; + goto fail; + } + + /* save digest result to digest_buf */ + bcopy(op_data.pDigestResult, digest_buf, ZIO_DATA_MAC_LEN); + if (dir == QAT_ENCRYPT) + QAT_STAT_INCR(encrypt_total_out_bytes, enc_len); + else + QAT_STAT_INCR(decrypt_total_out_bytes, enc_len); + +fail: + if (status != CPA_STATUS_SUCCESS) + QAT_STAT_BUMP(crypt_fails); + + for (i = 0; i < in_page_num; i++) + kunmap(in_pages[i]); + for (i = 0; i < out_page_num; i++) + kunmap(out_pages[i]); + + cpaCySymRemoveSession(cy_inst_handle, cy_session_ctx); + if (aad_len > 0) + QAT_PHYS_CONTIG_FREE(op_data.pAdditionalAuthData); + QAT_PHYS_CONTIG_FREE(op_data.pIv); + QAT_PHYS_CONTIG_FREE(op_data.pDigestResult); + QAT_PHYS_CONTIG_FREE(src_buffer_list.pPrivateMetaData); + QAT_PHYS_CONTIG_FREE(dst_buffer_list.pPrivateMetaData); + QAT_PHYS_CONTIG_FREE(cy_session_ctx); + QAT_PHYS_CONTIG_FREE(flat_src_buf_array); + QAT_PHYS_CONTIG_FREE(flat_dst_buf_array); + + return (status); +} + +int +qat_checksum(uint64_t cksum, uint8_t *buf, uint64_t size, zio_cksum_t *zcp) +{ + CpaStatus status; + Cpa16U i; + CpaInstanceHandle cy_inst_handle; + Cpa16U nr_bufs = (size >> PAGE_SHIFT) + 2; + Cpa32U bytes_left = 0; + Cpa8S *data = NULL; + CpaCySymSessionCtx *cy_session_ctx = NULL; + cy_callback_t cb; + Cpa8U *digest_buffer = NULL; + CpaCySymOpData op_data = { 0 }; + CpaBufferList src_buffer_list = { 0 }; + CpaFlatBuffer *flat_src_buf_array = NULL; + CpaFlatBuffer *flat_src_buf = NULL; + struct page *in_pages[MAX_PAGE_NUM]; + Cpa32U page_num = 0; + Cpa32U page_off = 0; + + QAT_STAT_BUMP(cksum_requests); + QAT_STAT_INCR(cksum_total_in_bytes, size); + + i = (Cpa32U)atomic_inc_32_nv(&inst_num) % num_inst; + cy_inst_handle = cy_inst_handles[i]; + + status = qat_init_checksum_session_ctx(cy_inst_handle, + &cy_session_ctx, cksum); + if (status != CPA_STATUS_SUCCESS) { + /* don't count unsupported checksums as a failure */ + if (cksum == ZIO_CHECKSUM_SHA256 || + cksum == ZIO_CHECKSUM_SHA512) + QAT_STAT_BUMP(cksum_fails); + return (status); + } + + /* + * We increment nr_bufs by 2 to allow us to handle non + * page-aligned buffer addresses and buffers whose sizes + * are not divisible by PAGE_SIZE. 
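The extra two flat buffers cover a partial page at the head of the buffer and another at its tail. For example, an 8 KiB buffer that starts part-way into a page touches three 4 KiB pages, and (8192 >> PAGE_SHIFT) + 2 = 4 entries is therefore always enough. A tiny illustrative helper (hypothetical, assuming len > 0) that computes the exact page count for a given address:

	/*
	 * Number of pages spanned by [addr, addr + len); never more than
	 * (len >> PAGE_SHIFT) + 2.
	 */
	static unsigned int
	qat_pages_spanned(uintptr_t addr, size_t len)
	{
		uintptr_t first = addr >> PAGE_SHIFT;
		uintptr_t last = (addr + len - 1) >> PAGE_SHIFT;

		return (last - first + 1);
	}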
+ */ + status = qat_init_cy_buffer_lists(cy_inst_handle, nr_bufs, + &src_buffer_list, &src_buffer_list); + if (status != CPA_STATUS_SUCCESS) + goto fail; + + status = QAT_PHYS_CONTIG_ALLOC(&flat_src_buf_array, + nr_bufs * sizeof (CpaFlatBuffer)); + if (status != CPA_STATUS_SUCCESS) + goto fail; + status = QAT_PHYS_CONTIG_ALLOC(&digest_buffer, + sizeof (zio_cksum_t)); + if (status != CPA_STATUS_SUCCESS) + goto fail; + + bytes_left = size; + data = buf; + flat_src_buf = flat_src_buf_array; + while (bytes_left > 0) { + page_off = ((long)data & ~PAGE_MASK); + in_pages[page_num] = qat_mem_to_page(data); + flat_src_buf->pData = kmap(in_pages[page_num]) + page_off; + flat_src_buf->dataLenInBytes = + min((long)PAGE_SIZE - page_off, (long)bytes_left); + data += flat_src_buf->dataLenInBytes; + bytes_left -= flat_src_buf->dataLenInBytes; + flat_src_buf++; + page_num++; + } + src_buffer_list.pBuffers = flat_src_buf_array; + src_buffer_list.numBuffers = page_num; + + op_data.sessionCtx = cy_session_ctx; + op_data.packetType = CPA_CY_SYM_PACKET_TYPE_FULL; + op_data.hashStartSrcOffsetInBytes = 0; + op_data.messageLenToHashInBytes = size; + op_data.pDigestResult = digest_buffer; + + cb.verify_result = CPA_FALSE; + init_completion(&cb.complete); + status = cpaCySymPerformOp(cy_inst_handle, &cb, &op_data, + &src_buffer_list, &src_buffer_list, NULL); + if (status != CPA_STATUS_SUCCESS) + goto fail; + + if (!wait_for_completion_interruptible_timeout(&cb.complete, + QAT_TIMEOUT_MS)) { + status = CPA_STATUS_FAIL; + goto fail; + } + if (cb.verify_result == CPA_FALSE) { + status = CPA_STATUS_FAIL; + goto fail; + } + + bcopy(digest_buffer, zcp, sizeof (zio_cksum_t)); + +fail: + if (status != CPA_STATUS_SUCCESS) + QAT_STAT_BUMP(cksum_fails); + + for (i = 0; i < page_num; i++) + kunmap(in_pages[i]); + + cpaCySymRemoveSession(cy_inst_handle, cy_session_ctx); + QAT_PHYS_CONTIG_FREE(digest_buffer); + QAT_PHYS_CONTIG_FREE(src_buffer_list.pPrivateMetaData); + QAT_PHYS_CONTIG_FREE(cy_session_ctx); + QAT_PHYS_CONTIG_FREE(flat_src_buf_array); + + return (status); +} + +static int +param_set_qat_encrypt(const char *val, zfs_kernel_param_t *kp) +{ + int ret; + int *pvalue = kp->arg; + ret = param_set_int(val, kp); + if (ret) + return (ret); + /* + * zfs_qat_encrypt_disable = 0: enable qat encrypt + * try to initialize qat instance if it has not been done + */ + if (*pvalue == 0 && !qat_cy_init_done) { + ret = qat_cy_init(); + if (ret != 0) { + zfs_qat_encrypt_disable = 1; + return (ret); + } + } + return (ret); +} + +static int +param_set_qat_checksum(const char *val, zfs_kernel_param_t *kp) +{ + int ret; + int *pvalue = kp->arg; + ret = param_set_int(val, kp); + if (ret) + return (ret); + /* + * set_checksum_param_ops = 0: enable qat checksum + * try to initialize qat instance if it has not been done + */ + if (*pvalue == 0 && !qat_cy_init_done) { + ret = qat_cy_init(); + if (ret != 0) { + zfs_qat_checksum_disable = 1; + return (ret); + } + } + return (ret); +} + +module_param_call(zfs_qat_encrypt_disable, param_set_qat_encrypt, + param_get_int, &zfs_qat_encrypt_disable, 0644); +MODULE_PARM_DESC(zfs_qat_encrypt_disable, "Enable/Disable QAT encryption"); + +module_param_call(zfs_qat_checksum_disable, param_set_qat_checksum, + param_get_int, &zfs_qat_checksum_disable, 0644); +MODULE_PARM_DESC(zfs_qat_checksum_disable, "Enable/Disable QAT checksumming"); + +#endif diff --git a/module/os/linux/zfs/spa_stats.c b/module/os/linux/zfs/spa_stats.c new file mode 100644 index 000000000..6895428f4 --- /dev/null +++ 
b/module/os/linux/zfs/spa_stats.c @@ -0,0 +1,1034 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#include <sys/zfs_context.h> +#include <sys/spa_impl.h> +#include <sys/vdev_impl.h> +#include <sys/spa.h> +#include <zfs_comutil.h> + +/* + * Keeps stats on last N reads per spa_t, disabled by default. + */ +int zfs_read_history = 0; + +/* + * Include cache hits in history, disabled by default. + */ +int zfs_read_history_hits = 0; + +/* + * Keeps stats on the last 100 txgs by default. + */ +int zfs_txg_history = 100; + +/* + * Keeps stats on the last N MMP updates, disabled by default. + */ +int zfs_multihost_history = 0; + +/* + * ========================================================================== + * SPA Read History Routines + * ========================================================================== + */ + +/* + * Read statistics - Information exported regarding each arc_read call + */ +typedef struct spa_read_history { + hrtime_t start; /* time read completed */ + uint64_t objset; /* read from this objset */ + uint64_t object; /* read of this object number */ + uint64_t level; /* block's indirection level */ + uint64_t blkid; /* read of this block id */ + char origin[24]; /* read originated from here */ + uint32_t aflags; /* ARC flags (cached, prefetch, etc.) 
*/ + pid_t pid; /* PID of task doing read */ + char comm[16]; /* process name of task doing read */ + procfs_list_node_t srh_node; +} spa_read_history_t; + +static int +spa_read_history_show_header(struct seq_file *f) +{ + seq_printf(f, "%-8s %-16s %-8s %-8s %-8s %-8s %-8s " + "%-24s %-8s %-16s\n", "UID", "start", "objset", "object", + "level", "blkid", "aflags", "origin", "pid", "process"); + + return (0); +} + +static int +spa_read_history_show(struct seq_file *f, void *data) +{ + spa_read_history_t *srh = (spa_read_history_t *)data; + + seq_printf(f, "%-8llu %-16llu 0x%-6llx " + "%-8lli %-8lli %-8lli 0x%-6x %-24s %-8i %-16s\n", + (u_longlong_t)srh->srh_node.pln_id, srh->start, + (longlong_t)srh->objset, (longlong_t)srh->object, + (longlong_t)srh->level, (longlong_t)srh->blkid, + srh->aflags, srh->origin, srh->pid, srh->comm); + + return (0); +} + +/* Remove oldest elements from list until there are no more than 'size' left */ +static void +spa_read_history_truncate(spa_history_list_t *shl, unsigned int size) +{ + spa_read_history_t *srh; + while (shl->size > size) { + srh = list_remove_head(&shl->procfs_list.pl_list); + ASSERT3P(srh, !=, NULL); + kmem_free(srh, sizeof (spa_read_history_t)); + shl->size--; + } + + if (size == 0) + ASSERT(list_is_empty(&shl->procfs_list.pl_list)); +} + +static int +spa_read_history_clear(procfs_list_t *procfs_list) +{ + spa_history_list_t *shl = procfs_list->pl_private; + mutex_enter(&procfs_list->pl_lock); + spa_read_history_truncate(shl, 0); + mutex_exit(&procfs_list->pl_lock); + return (0); +} + +static void +spa_read_history_init(spa_t *spa) +{ + spa_history_list_t *shl = &spa->spa_stats.read_history; + char *module; + + shl->size = 0; + + module = kmem_asprintf("zfs/%s", spa_name(spa)); + + shl->procfs_list.pl_private = shl; + procfs_list_install(module, + "reads", + 0600, + &shl->procfs_list, + spa_read_history_show, + spa_read_history_show_header, + spa_read_history_clear, + offsetof(spa_read_history_t, srh_node)); + + strfree(module); +} + +static void +spa_read_history_destroy(spa_t *spa) +{ + spa_history_list_t *shl = &spa->spa_stats.read_history; + procfs_list_uninstall(&shl->procfs_list); + spa_read_history_truncate(shl, 0); + procfs_list_destroy(&shl->procfs_list); +} + +void +spa_read_history_add(spa_t *spa, const zbookmark_phys_t *zb, uint32_t aflags) +{ + spa_history_list_t *shl = &spa->spa_stats.read_history; + spa_read_history_t *srh; + + ASSERT3P(spa, !=, NULL); + ASSERT3P(zb, !=, NULL); + + if (zfs_read_history == 0 && shl->size == 0) + return; + + if (zfs_read_history_hits == 0 && (aflags & ARC_FLAG_CACHED)) + return; + + srh = kmem_zalloc(sizeof (spa_read_history_t), KM_SLEEP); + strlcpy(srh->comm, getcomm(), sizeof (srh->comm)); + srh->start = gethrtime(); + srh->objset = zb->zb_objset; + srh->object = zb->zb_object; + srh->level = zb->zb_level; + srh->blkid = zb->zb_blkid; + srh->aflags = aflags; + srh->pid = getpid(); + + mutex_enter(&shl->procfs_list.pl_lock); + + procfs_list_add(&shl->procfs_list, srh); + shl->size++; + + spa_read_history_truncate(shl, zfs_read_history); + + mutex_exit(&shl->procfs_list.pl_lock); +} + +/* + * ========================================================================== + * SPA TXG History Routines + * ========================================================================== + */ + +/* + * Txg statistics - Information exported regarding each txg sync + */ + +typedef struct spa_txg_history { + uint64_t txg; /* txg id */ + txg_state_t state; /* active txg state */ + uint64_t nread; /* number of bytes 
read */ + uint64_t nwritten; /* number of bytes written */ + uint64_t reads; /* number of read operations */ + uint64_t writes; /* number of write operations */ + uint64_t ndirty; /* number of dirty bytes */ + hrtime_t times[TXG_STATE_COMMITTED]; /* completion times */ + procfs_list_node_t sth_node; +} spa_txg_history_t; + +static int +spa_txg_history_show_header(struct seq_file *f) +{ + seq_printf(f, "%-8s %-16s %-5s %-12s %-12s %-12s " + "%-8s %-8s %-12s %-12s %-12s %-12s\n", "txg", "birth", "state", + "ndirty", "nread", "nwritten", "reads", "writes", + "otime", "qtime", "wtime", "stime"); + return (0); +} + +static int +spa_txg_history_show(struct seq_file *f, void *data) +{ + spa_txg_history_t *sth = (spa_txg_history_t *)data; + uint64_t open = 0, quiesce = 0, wait = 0, sync = 0; + char state; + + switch (sth->state) { + case TXG_STATE_BIRTH: state = 'B'; break; + case TXG_STATE_OPEN: state = 'O'; break; + case TXG_STATE_QUIESCED: state = 'Q'; break; + case TXG_STATE_WAIT_FOR_SYNC: state = 'W'; break; + case TXG_STATE_SYNCED: state = 'S'; break; + case TXG_STATE_COMMITTED: state = 'C'; break; + default: state = '?'; break; + } + + if (sth->times[TXG_STATE_OPEN]) + open = sth->times[TXG_STATE_OPEN] - + sth->times[TXG_STATE_BIRTH]; + + if (sth->times[TXG_STATE_QUIESCED]) + quiesce = sth->times[TXG_STATE_QUIESCED] - + sth->times[TXG_STATE_OPEN]; + + if (sth->times[TXG_STATE_WAIT_FOR_SYNC]) + wait = sth->times[TXG_STATE_WAIT_FOR_SYNC] - + sth->times[TXG_STATE_QUIESCED]; + + if (sth->times[TXG_STATE_SYNCED]) + sync = sth->times[TXG_STATE_SYNCED] - + sth->times[TXG_STATE_WAIT_FOR_SYNC]; + + seq_printf(f, "%-8llu %-16llu %-5c %-12llu " + "%-12llu %-12llu %-8llu %-8llu %-12llu %-12llu %-12llu %-12llu\n", + (longlong_t)sth->txg, sth->times[TXG_STATE_BIRTH], state, + (u_longlong_t)sth->ndirty, + (u_longlong_t)sth->nread, (u_longlong_t)sth->nwritten, + (u_longlong_t)sth->reads, (u_longlong_t)sth->writes, + (u_longlong_t)open, (u_longlong_t)quiesce, (u_longlong_t)wait, + (u_longlong_t)sync); + + return (0); +} + +/* Remove oldest elements from list until there are no more than 'size' left */ +static void +spa_txg_history_truncate(spa_history_list_t *shl, unsigned int size) +{ + spa_txg_history_t *sth; + while (shl->size > size) { + sth = list_remove_head(&shl->procfs_list.pl_list); + ASSERT3P(sth, !=, NULL); + kmem_free(sth, sizeof (spa_txg_history_t)); + shl->size--; + } + + if (size == 0) + ASSERT(list_is_empty(&shl->procfs_list.pl_list)); + +} + +static int +spa_txg_history_clear(procfs_list_t *procfs_list) +{ + spa_history_list_t *shl = procfs_list->pl_private; + mutex_enter(&procfs_list->pl_lock); + spa_txg_history_truncate(shl, 0); + mutex_exit(&procfs_list->pl_lock); + return (0); +} + +static void +spa_txg_history_init(spa_t *spa) +{ + spa_history_list_t *shl = &spa->spa_stats.txg_history; + char *module; + + shl->size = 0; + + module = kmem_asprintf("zfs/%s", spa_name(spa)); + + shl->procfs_list.pl_private = shl; + procfs_list_install(module, + "txgs", + 0644, + &shl->procfs_list, + spa_txg_history_show, + spa_txg_history_show_header, + spa_txg_history_clear, + offsetof(spa_txg_history_t, sth_node)); + + strfree(module); +} + +static void +spa_txg_history_destroy(spa_t *spa) +{ + spa_history_list_t *shl = &spa->spa_stats.txg_history; + procfs_list_uninstall(&shl->procfs_list); + spa_txg_history_truncate(shl, 0); + procfs_list_destroy(&shl->procfs_list); +} + +/* + * Add a new txg to historical record. 
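The function below records the birth time of a txg; its state is then advanced by spa_txg_history_set(), and the per-stage times shown above (otime, qtime, wtime, stime) are simply the differences between consecutive completion timestamps. A hypothetical caller would drive it roughly like this (the real call sites live in the txg sync code; WAIT_FOR_SYNC and SYNCED are recorded by the *_init_io()/*_fini_io() helpers further down):

	spa_txg_history_add(spa, txg, gethrtime());	/* records TXG_STATE_BIRTH */
	/* ... the txg is later opened and quiesced ... */
	spa_txg_history_set(spa, txg, TXG_STATE_OPEN, gethrtime());
	spa_txg_history_set(spa, txg, TXG_STATE_QUIESCED, gethrtime());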
+ */ +void +spa_txg_history_add(spa_t *spa, uint64_t txg, hrtime_t birth_time) +{ + spa_history_list_t *shl = &spa->spa_stats.txg_history; + spa_txg_history_t *sth; + + if (zfs_txg_history == 0 && shl->size == 0) + return; + + sth = kmem_zalloc(sizeof (spa_txg_history_t), KM_SLEEP); + sth->txg = txg; + sth->state = TXG_STATE_OPEN; + sth->times[TXG_STATE_BIRTH] = birth_time; + + mutex_enter(&shl->procfs_list.pl_lock); + procfs_list_add(&shl->procfs_list, sth); + shl->size++; + spa_txg_history_truncate(shl, zfs_txg_history); + mutex_exit(&shl->procfs_list.pl_lock); +} + +/* + * Set txg state completion time and increment current state. + */ +int +spa_txg_history_set(spa_t *spa, uint64_t txg, txg_state_t completed_state, + hrtime_t completed_time) +{ + spa_history_list_t *shl = &spa->spa_stats.txg_history; + spa_txg_history_t *sth; + int error = ENOENT; + + if (zfs_txg_history == 0) + return (0); + + mutex_enter(&shl->procfs_list.pl_lock); + for (sth = list_tail(&shl->procfs_list.pl_list); sth != NULL; + sth = list_prev(&shl->procfs_list.pl_list, sth)) { + if (sth->txg == txg) { + sth->times[completed_state] = completed_time; + sth->state++; + error = 0; + break; + } + } + mutex_exit(&shl->procfs_list.pl_lock); + + return (error); +} + +/* + * Set txg IO stats. + */ +static int +spa_txg_history_set_io(spa_t *spa, uint64_t txg, uint64_t nread, + uint64_t nwritten, uint64_t reads, uint64_t writes, uint64_t ndirty) +{ + spa_history_list_t *shl = &spa->spa_stats.txg_history; + spa_txg_history_t *sth; + int error = ENOENT; + + if (zfs_txg_history == 0) + return (0); + + mutex_enter(&shl->procfs_list.pl_lock); + for (sth = list_tail(&shl->procfs_list.pl_list); sth != NULL; + sth = list_prev(&shl->procfs_list.pl_list, sth)) { + if (sth->txg == txg) { + sth->nread = nread; + sth->nwritten = nwritten; + sth->reads = reads; + sth->writes = writes; + sth->ndirty = ndirty; + error = 0; + break; + } + } + mutex_exit(&shl->procfs_list.pl_lock); + + return (error); +} + +txg_stat_t * +spa_txg_history_init_io(spa_t *spa, uint64_t txg, dsl_pool_t *dp) +{ + txg_stat_t *ts; + + if (zfs_txg_history == 0) + return (NULL); + + ts = kmem_alloc(sizeof (txg_stat_t), KM_SLEEP); + + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + vdev_get_stats(spa->spa_root_vdev, &ts->vs1); + spa_config_exit(spa, SCL_CONFIG, FTAG); + + ts->txg = txg; + ts->ndirty = dp->dp_dirty_pertxg[txg & TXG_MASK]; + + spa_txg_history_set(spa, txg, TXG_STATE_WAIT_FOR_SYNC, gethrtime()); + + return (ts); +} + +void +spa_txg_history_fini_io(spa_t *spa, txg_stat_t *ts) +{ + if (ts == NULL) + return; + + if (zfs_txg_history == 0) { + kmem_free(ts, sizeof (txg_stat_t)); + return; + } + + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + vdev_get_stats(spa->spa_root_vdev, &ts->vs2); + spa_config_exit(spa, SCL_CONFIG, FTAG); + + spa_txg_history_set(spa, ts->txg, TXG_STATE_SYNCED, gethrtime()); + spa_txg_history_set_io(spa, ts->txg, + ts->vs2.vs_bytes[ZIO_TYPE_READ] - ts->vs1.vs_bytes[ZIO_TYPE_READ], + ts->vs2.vs_bytes[ZIO_TYPE_WRITE] - ts->vs1.vs_bytes[ZIO_TYPE_WRITE], + ts->vs2.vs_ops[ZIO_TYPE_READ] - ts->vs1.vs_ops[ZIO_TYPE_READ], + ts->vs2.vs_ops[ZIO_TYPE_WRITE] - ts->vs1.vs_ops[ZIO_TYPE_WRITE], + ts->ndirty); + + kmem_free(ts, sizeof (txg_stat_t)); +} + +/* + * ========================================================================== + * SPA TX Assign Histogram Routines + * ========================================================================== + */ + +/* + * Tx statistics - Information exported regarding dmu_tx_assign time. 
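Each histogram bucket counts assignments whose latency rounds up to the next power of two nanoseconds; that is what the 42 buckets ("1ns to 2,199s") and the index search in spa_tx_assign_add_nsecs() below implement. For example, a dmu_tx_assign() that took 1,500 ns is charged to the bucket labelled "2048 ns", because 2^11 is the first power of two not below 1,500:

	uint64_t nsecs = 1500, idx = 0;

	while ((1ULL << idx) < nsecs)	/* same search as spa_tx_assign_add_nsecs() */
		idx++;
	/* idx is now 11, i.e. the bucket named "2048 ns" */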
+ */ + +/* + * When the kstat is written zero all buckets. When the kstat is read + * count the number of trailing buckets set to zero and update ks_ndata + * such that they are not output. + */ +static int +spa_tx_assign_update(kstat_t *ksp, int rw) +{ + spa_t *spa = ksp->ks_private; + spa_history_kstat_t *shk = &spa->spa_stats.tx_assign_histogram; + int i; + + if (rw == KSTAT_WRITE) { + for (i = 0; i < shk->count; i++) + ((kstat_named_t *)shk->private)[i].value.ui64 = 0; + } + + for (i = shk->count; i > 0; i--) + if (((kstat_named_t *)shk->private)[i-1].value.ui64 != 0) + break; + + ksp->ks_ndata = i; + ksp->ks_data_size = i * sizeof (kstat_named_t); + + return (0); +} + +static void +spa_tx_assign_init(spa_t *spa) +{ + spa_history_kstat_t *shk = &spa->spa_stats.tx_assign_histogram; + char *name; + kstat_named_t *ks; + kstat_t *ksp; + int i; + + mutex_init(&shk->lock, NULL, MUTEX_DEFAULT, NULL); + + shk->count = 42; /* power of two buckets for 1ns to 2,199s */ + shk->size = shk->count * sizeof (kstat_named_t); + shk->private = kmem_alloc(shk->size, KM_SLEEP); + + name = kmem_asprintf("zfs/%s", spa_name(spa)); + + for (i = 0; i < shk->count; i++) { + ks = &((kstat_named_t *)shk->private)[i]; + ks->data_type = KSTAT_DATA_UINT64; + ks->value.ui64 = 0; + (void) snprintf(ks->name, KSTAT_STRLEN, "%llu ns", + (u_longlong_t)1 << i); + } + + ksp = kstat_create(name, 0, "dmu_tx_assign", "misc", + KSTAT_TYPE_NAMED, 0, KSTAT_FLAG_VIRTUAL); + shk->kstat = ksp; + + if (ksp) { + ksp->ks_lock = &shk->lock; + ksp->ks_data = shk->private; + ksp->ks_ndata = shk->count; + ksp->ks_data_size = shk->size; + ksp->ks_private = spa; + ksp->ks_update = spa_tx_assign_update; + kstat_install(ksp); + } + strfree(name); +} + +static void +spa_tx_assign_destroy(spa_t *spa) +{ + spa_history_kstat_t *shk = &spa->spa_stats.tx_assign_histogram; + kstat_t *ksp; + + ksp = shk->kstat; + if (ksp) + kstat_delete(ksp); + + kmem_free(shk->private, shk->size); + mutex_destroy(&shk->lock); +} + +void +spa_tx_assign_add_nsecs(spa_t *spa, uint64_t nsecs) +{ + spa_history_kstat_t *shk = &spa->spa_stats.tx_assign_histogram; + uint64_t idx = 0; + + while (((1ULL << idx) < nsecs) && (idx < shk->size - 1)) + idx++; + + atomic_inc_64(&((kstat_named_t *)shk->private)[idx].value.ui64); +} + +/* + * ========================================================================== + * SPA IO History Routines + * ========================================================================== + */ +static int +spa_io_history_update(kstat_t *ksp, int rw) +{ + if (rw == KSTAT_WRITE) + memset(ksp->ks_data, 0, ksp->ks_data_size); + + return (0); +} + +static void +spa_io_history_init(spa_t *spa) +{ + spa_history_kstat_t *shk = &spa->spa_stats.io_history; + char *name; + kstat_t *ksp; + + mutex_init(&shk->lock, NULL, MUTEX_DEFAULT, NULL); + + name = kmem_asprintf("zfs/%s", spa_name(spa)); + + ksp = kstat_create(name, 0, "io", "disk", KSTAT_TYPE_IO, 1, 0); + shk->kstat = ksp; + + if (ksp) { + ksp->ks_lock = &shk->lock; + ksp->ks_private = spa; + ksp->ks_update = spa_io_history_update; + kstat_install(ksp); + } + strfree(name); +} + +static void +spa_io_history_destroy(spa_t *spa) +{ + spa_history_kstat_t *shk = &spa->spa_stats.io_history; + + if (shk->kstat) + kstat_delete(shk->kstat); + + mutex_destroy(&shk->lock); +} + +/* + * ========================================================================== + * SPA MMP History Routines + * ========================================================================== + */ + +/* + * MMP statistics - Information exported 
regarding attempted MMP writes + * For MMP writes issued, fields used as per comments below. + * For MMP writes skipped, an entry represents a span of time when + * writes were skipped for same reason (error from mmp_random_leaf). + * Differences are: + * timestamp time first write skipped, if >1 skipped in a row + * mmp_delay delay value at timestamp + * vdev_guid number of writes skipped + * io_error one of enum mmp_error + * duration time span (ns) of skipped writes + */ + +typedef struct spa_mmp_history { + uint64_t mmp_node_id; /* unique # for updates */ + uint64_t txg; /* txg of last sync */ + uint64_t timestamp; /* UTC time MMP write issued */ + uint64_t mmp_delay; /* mmp_thread.mmp_delay at timestamp */ + uint64_t vdev_guid; /* unique ID of leaf vdev */ + char *vdev_path; + int vdev_label; /* vdev label */ + int io_error; /* error status of MMP write */ + hrtime_t error_start; /* hrtime of start of error period */ + hrtime_t duration; /* time from submission to completion */ + procfs_list_node_t smh_node; +} spa_mmp_history_t; + +static int +spa_mmp_history_show_header(struct seq_file *f) +{ + seq_printf(f, "%-10s %-10s %-10s %-6s %-10s %-12s %-24s " + "%-10s %s\n", "id", "txg", "timestamp", "error", "duration", + "mmp_delay", "vdev_guid", "vdev_label", "vdev_path"); + return (0); +} + +static int +spa_mmp_history_show(struct seq_file *f, void *data) +{ + spa_mmp_history_t *smh = (spa_mmp_history_t *)data; + char skip_fmt[] = "%-10llu %-10llu %10llu %#6llx %10lld %12llu %-24llu " + "%-10lld %s\n"; + char write_fmt[] = "%-10llu %-10llu %10llu %6lld %10lld %12llu %-24llu " + "%-10lld %s\n"; + + seq_printf(f, (smh->error_start ? skip_fmt : write_fmt), + (u_longlong_t)smh->mmp_node_id, (u_longlong_t)smh->txg, + (u_longlong_t)smh->timestamp, (longlong_t)smh->io_error, + (longlong_t)smh->duration, (u_longlong_t)smh->mmp_delay, + (u_longlong_t)smh->vdev_guid, (u_longlong_t)smh->vdev_label, + (smh->vdev_path ? smh->vdev_path : "-")); + + return (0); +} + +/* Remove oldest elements from list until there are no more than 'size' left */ +static void +spa_mmp_history_truncate(spa_history_list_t *shl, unsigned int size) +{ + spa_mmp_history_t *smh; + while (shl->size > size) { + smh = list_remove_head(&shl->procfs_list.pl_list); + if (smh->vdev_path) + strfree(smh->vdev_path); + kmem_free(smh, sizeof (spa_mmp_history_t)); + shl->size--; + } + + if (size == 0) + ASSERT(list_is_empty(&shl->procfs_list.pl_list)); + +} + +static int +spa_mmp_history_clear(procfs_list_t *procfs_list) +{ + spa_history_list_t *shl = procfs_list->pl_private; + mutex_enter(&procfs_list->pl_lock); + spa_mmp_history_truncate(shl, 0); + mutex_exit(&procfs_list->pl_lock); + return (0); +} + +static void +spa_mmp_history_init(spa_t *spa) +{ + spa_history_list_t *shl = &spa->spa_stats.mmp_history; + char *module; + + shl->size = 0; + + module = kmem_asprintf("zfs/%s", spa_name(spa)); + + shl->procfs_list.pl_private = shl; + procfs_list_install(module, + "multihost", + 0644, + &shl->procfs_list, + spa_mmp_history_show, + spa_mmp_history_show_header, + spa_mmp_history_clear, + offsetof(spa_mmp_history_t, smh_node)); + + strfree(module); +} + +static void +spa_mmp_history_destroy(spa_t *spa) +{ + spa_history_list_t *shl = &spa->spa_stats.mmp_history; + procfs_list_uninstall(&shl->procfs_list); + spa_mmp_history_truncate(shl, 0); + procfs_list_destroy(&shl->procfs_list); +} + +/* + * Set duration in existing "skip" record to how long we have waited for a leaf + * vdev to become available. 
+ * + * Important that we start search at the tail of the list where new + * records are inserted, so this is normally an O(1) operation. + */ +int +spa_mmp_history_set_skip(spa_t *spa, uint64_t mmp_node_id) +{ + spa_history_list_t *shl = &spa->spa_stats.mmp_history; + spa_mmp_history_t *smh; + int error = ENOENT; + + if (zfs_multihost_history == 0 && shl->size == 0) + return (0); + + mutex_enter(&shl->procfs_list.pl_lock); + for (smh = list_tail(&shl->procfs_list.pl_list); smh != NULL; + smh = list_prev(&shl->procfs_list.pl_list, smh)) { + if (smh->mmp_node_id == mmp_node_id) { + ASSERT3U(smh->io_error, !=, 0); + smh->duration = gethrtime() - smh->error_start; + smh->vdev_guid++; + error = 0; + break; + } + } + mutex_exit(&shl->procfs_list.pl_lock); + + return (error); +} + +/* + * Set MMP write duration and error status in existing record. + * See comment re: search order above spa_mmp_history_set_skip(). + */ +int +spa_mmp_history_set(spa_t *spa, uint64_t mmp_node_id, int io_error, + hrtime_t duration) +{ + spa_history_list_t *shl = &spa->spa_stats.mmp_history; + spa_mmp_history_t *smh; + int error = ENOENT; + + if (zfs_multihost_history == 0 && shl->size == 0) + return (0); + + mutex_enter(&shl->procfs_list.pl_lock); + for (smh = list_tail(&shl->procfs_list.pl_list); smh != NULL; + smh = list_prev(&shl->procfs_list.pl_list, smh)) { + if (smh->mmp_node_id == mmp_node_id) { + ASSERT(smh->io_error == 0); + smh->io_error = io_error; + smh->duration = duration; + error = 0; + break; + } + } + mutex_exit(&shl->procfs_list.pl_lock); + + return (error); +} + +/* + * Add a new MMP historical record. + * error == 0 : a write was issued. + * error != 0 : a write was not issued because no leaves were found. + */ +void +spa_mmp_history_add(spa_t *spa, uint64_t txg, uint64_t timestamp, + uint64_t mmp_delay, vdev_t *vd, int label, uint64_t mmp_node_id, + int error) +{ + spa_history_list_t *shl = &spa->spa_stats.mmp_history; + spa_mmp_history_t *smh; + + if (zfs_multihost_history == 0 && shl->size == 0) + return; + + smh = kmem_zalloc(sizeof (spa_mmp_history_t), KM_SLEEP); + smh->txg = txg; + smh->timestamp = timestamp; + smh->mmp_delay = mmp_delay; + if (vd) { + smh->vdev_guid = vd->vdev_guid; + if (vd->vdev_path) + smh->vdev_path = strdup(vd->vdev_path); + } + smh->vdev_label = label; + smh->mmp_node_id = mmp_node_id; + + if (error) { + smh->io_error = error; + smh->error_start = gethrtime(); + smh->vdev_guid = 1; + } + + mutex_enter(&shl->procfs_list.pl_lock); + procfs_list_add(&shl->procfs_list, smh); + shl->size++; + spa_mmp_history_truncate(shl, zfs_multihost_history); + mutex_exit(&shl->procfs_list.pl_lock); +} + +static void * +spa_state_addr(kstat_t *ksp, loff_t n) +{ + return (ksp->ks_private); /* return the spa_t */ +} + +static int +spa_state_data(char *buf, size_t size, void *data) +{ + spa_t *spa = (spa_t *)data; + (void) snprintf(buf, size, "%s\n", spa_state_to_name(spa)); + return (0); +} + +/* + * Return the state of the pool in /proc/spl/kstat/zfs/<pool>/state. + * + * This is a lock-less read of the pool's state (unlike using 'zpool', which + * can potentially block for seconds). Because it doesn't block, it can useful + * as a pool heartbeat value. 
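For example, a user-space watchdog could poll this file without ever risking a blocking pool operation; a minimal sketch, assuming a pool named "tank" (the pool name, and treating anything other than ONLINE as unhealthy, are illustrative choices):

	#include <stdio.h>
	#include <string.h>

	/* Returns 1 if the (hypothetical) pool "tank" reports ONLINE. */
	static int
	pool_is_online(void)
	{
		char state[32] = { 0 };
		FILE *f = fopen("/proc/spl/kstat/zfs/tank/state", "r");

		if (f == NULL)
			return (0);
		(void) fgets(state, sizeof (state), f);
		(void) fclose(f);

		return (strncmp(state, "ONLINE", 6) == 0);
	}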
+ */ +static void +spa_state_init(spa_t *spa) +{ + spa_history_kstat_t *shk = &spa->spa_stats.state; + char *name; + kstat_t *ksp; + + mutex_init(&shk->lock, NULL, MUTEX_DEFAULT, NULL); + + name = kmem_asprintf("zfs/%s", spa_name(spa)); + ksp = kstat_create(name, 0, "state", "misc", + KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL); + + shk->kstat = ksp; + if (ksp) { + ksp->ks_lock = &shk->lock; + ksp->ks_data = NULL; + ksp->ks_private = spa; + ksp->ks_flags |= KSTAT_FLAG_NO_HEADERS; + kstat_set_raw_ops(ksp, NULL, spa_state_data, spa_state_addr); + kstat_install(ksp); + } + + strfree(name); +} + +static void +spa_health_destroy(spa_t *spa) +{ + spa_history_kstat_t *shk = &spa->spa_stats.state; + kstat_t *ksp = shk->kstat; + if (ksp) + kstat_delete(ksp); + + mutex_destroy(&shk->lock); +} + +static spa_iostats_t spa_iostats_template = { + { "trim_extents_written", KSTAT_DATA_UINT64 }, + { "trim_bytes_written", KSTAT_DATA_UINT64 }, + { "trim_extents_skipped", KSTAT_DATA_UINT64 }, + { "trim_bytes_skipped", KSTAT_DATA_UINT64 }, + { "trim_extents_failed", KSTAT_DATA_UINT64 }, + { "trim_bytes_failed", KSTAT_DATA_UINT64 }, + { "autotrim_extents_written", KSTAT_DATA_UINT64 }, + { "autotrim_bytes_written", KSTAT_DATA_UINT64 }, + { "autotrim_extents_skipped", KSTAT_DATA_UINT64 }, + { "autotrim_bytes_skipped", KSTAT_DATA_UINT64 }, + { "autotrim_extents_failed", KSTAT_DATA_UINT64 }, + { "autotrim_bytes_failed", KSTAT_DATA_UINT64 }, +}; + +#define SPA_IOSTATS_ADD(stat, val) \ + atomic_add_64(&iostats->stat.value.ui64, (val)); + +void +spa_iostats_trim_add(spa_t *spa, trim_type_t type, + uint64_t extents_written, uint64_t bytes_written, + uint64_t extents_skipped, uint64_t bytes_skipped, + uint64_t extents_failed, uint64_t bytes_failed) +{ + spa_history_kstat_t *shk = &spa->spa_stats.iostats; + kstat_t *ksp = shk->kstat; + spa_iostats_t *iostats; + + if (ksp == NULL) + return; + + iostats = ksp->ks_data; + if (type == TRIM_TYPE_MANUAL) { + SPA_IOSTATS_ADD(trim_extents_written, extents_written); + SPA_IOSTATS_ADD(trim_bytes_written, bytes_written); + SPA_IOSTATS_ADD(trim_extents_skipped, extents_skipped); + SPA_IOSTATS_ADD(trim_bytes_skipped, bytes_skipped); + SPA_IOSTATS_ADD(trim_extents_failed, extents_failed); + SPA_IOSTATS_ADD(trim_bytes_failed, bytes_failed); + } else { + SPA_IOSTATS_ADD(autotrim_extents_written, extents_written); + SPA_IOSTATS_ADD(autotrim_bytes_written, bytes_written); + SPA_IOSTATS_ADD(autotrim_extents_skipped, extents_skipped); + SPA_IOSTATS_ADD(autotrim_bytes_skipped, bytes_skipped); + SPA_IOSTATS_ADD(autotrim_extents_failed, extents_failed); + SPA_IOSTATS_ADD(autotrim_bytes_failed, bytes_failed); + } +} + +int +spa_iostats_update(kstat_t *ksp, int rw) +{ + if (rw == KSTAT_WRITE) { + memcpy(ksp->ks_data, &spa_iostats_template, + sizeof (spa_iostats_t)); + } + + return (0); +} + +static void +spa_iostats_init(spa_t *spa) +{ + spa_history_kstat_t *shk = &spa->spa_stats.iostats; + + mutex_init(&shk->lock, NULL, MUTEX_DEFAULT, NULL); + + char *name = kmem_asprintf("zfs/%s", spa_name(spa)); + kstat_t *ksp = kstat_create(name, 0, "iostats", "misc", + KSTAT_TYPE_NAMED, sizeof (spa_iostats_t) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL); + + shk->kstat = ksp; + if (ksp) { + int size = sizeof (spa_iostats_t); + ksp->ks_lock = &shk->lock; + ksp->ks_private = spa; + ksp->ks_update = spa_iostats_update; + ksp->ks_data = kmem_alloc(size, KM_SLEEP); + memcpy(ksp->ks_data, &spa_iostats_template, size); + kstat_install(ksp); + } + + strfree(name); +} + +static void +spa_iostats_destroy(spa_t *spa) +{ 
+ spa_history_kstat_t *shk = &spa->spa_stats.iostats; + kstat_t *ksp = shk->kstat; + if (ksp) { + kmem_free(ksp->ks_data, sizeof (spa_iostats_t)); + kstat_delete(ksp); + } + + mutex_destroy(&shk->lock); +} + +void +spa_stats_init(spa_t *spa) +{ + spa_read_history_init(spa); + spa_txg_history_init(spa); + spa_tx_assign_init(spa); + spa_io_history_init(spa); + spa_mmp_history_init(spa); + spa_state_init(spa); + spa_iostats_init(spa); +} + +void +spa_stats_destroy(spa_t *spa) +{ + spa_iostats_destroy(spa); + spa_health_destroy(spa); + spa_tx_assign_destroy(spa); + spa_txg_history_destroy(spa); + spa_read_history_destroy(spa); + spa_io_history_destroy(spa); + spa_mmp_history_destroy(spa); +} + +#if defined(_KERNEL) +/* CSTYLED */ +module_param(zfs_read_history, int, 0644); +MODULE_PARM_DESC(zfs_read_history, + "Historical statistics for the last N reads"); + +module_param(zfs_read_history_hits, int, 0644); +MODULE_PARM_DESC(zfs_read_history_hits, + "Include cache hits in read history"); + +module_param(zfs_txg_history, int, 0644); +MODULE_PARM_DESC(zfs_txg_history, + "Historical statistics for the last N txgs"); + +module_param(zfs_multihost_history, int, 0644); +MODULE_PARM_DESC(zfs_multihost_history, + "Historical statistics for last N multihost writes"); +/* END CSTYLED */ +#endif diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c new file mode 100644 index 000000000..21f9ae454 --- /dev/null +++ b/module/os/linux/zfs/vdev_disk.c @@ -0,0 +1,954 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (C) 2008-2010 Lawrence Livermore National Security, LLC. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Rewritten for Linux by Brian Behlendorf <[email protected]>. + * LLNL-CODE-403049. + * Copyright (c) 2012, 2019 by Delphix. All rights reserved. + */ + +#include <sys/zfs_context.h> +#include <sys/spa_impl.h> +#include <sys/vdev_disk.h> +#include <sys/vdev_impl.h> +#include <sys/vdev_trim.h> +#include <sys/abd.h> +#include <sys/fs/zfs.h> +#include <sys/zio.h> +#include <linux/msdos_fs.h> +#include <linux/vfs_compat.h> + +char *zfs_vdev_scheduler = VDEV_SCHEDULER; +static void *zfs_vdev_holder = VDEV_HOLDER; + +/* size of the "reserved" partition, in blocks */ +#define EFI_MIN_RESV_SIZE (16 * 1024) + +/* + * Virtual device vector for disks. 
+ */ +typedef struct dio_request { + zio_t *dr_zio; /* Parent ZIO */ + atomic_t dr_ref; /* References */ + int dr_error; /* Bio error */ + int dr_bio_count; /* Count of bio's */ + struct bio *dr_bio[0]; /* Attached bio's */ +} dio_request_t; + + +#if defined(HAVE_OPEN_BDEV_EXCLUSIVE) || defined(HAVE_BLKDEV_GET_BY_PATH) +static fmode_t +vdev_bdev_mode(int smode) +{ + fmode_t mode = 0; + + ASSERT3S(smode & (FREAD | FWRITE), !=, 0); + + if (smode & FREAD) + mode |= FMODE_READ; + + if (smode & FWRITE) + mode |= FMODE_WRITE; + + return (mode); +} +#else +static int +vdev_bdev_mode(int smode) +{ + int mode = 0; + + ASSERT3S(smode & (FREAD | FWRITE), !=, 0); + + if ((smode & FREAD) && !(smode & FWRITE)) + mode = SB_RDONLY; + + return (mode); +} +#endif /* HAVE_OPEN_BDEV_EXCLUSIVE */ + +/* + * Returns the usable capacity (in bytes) for the partition or disk. + */ +static uint64_t +bdev_capacity(struct block_device *bdev) +{ + return (i_size_read(bdev->bd_inode)); +} + +/* + * Returns the maximum expansion capacity of the block device (in bytes). + * + * It is possible to expand a vdev when it has been created as a wholedisk + * and the containing block device has increased in capacity. Or when the + * partition containing the pool has been manually increased in size. + * + * This function is only responsible for calculating the potential expansion + * size so it can be reported by 'zpool list'. The efi_use_whole_disk() is + * responsible for verifying the expected partition layout in the wholedisk + * case, and updating the partition table if appropriate. Once the partition + * size has been increased the additional capacity will be visible using + * bdev_capacity(). + * + * The returned maximum expansion capacity is always expected to be larger, or + * at the very least equal, to its usable capacity to prevent overestimating + * the pool expandsize. + */ +static uint64_t +bdev_max_capacity(struct block_device *bdev, uint64_t wholedisk) +{ + uint64_t psize; + int64_t available; + + if (wholedisk && bdev->bd_part != NULL && bdev != bdev->bd_contains) { + /* + * When reporting maximum expansion capacity for a wholedisk + * deduct any capacity which is expected to be lost due to + * alignment restrictions. Over reporting this value isn't + * harmful and would only result in slightly less capacity + * than expected post expansion. + * The estimated available space may be slightly smaller than + * bdev_capacity() for devices where the number of sectors is + * not a multiple of the alignment size and the partition layout + * is keeping less than PARTITION_END_ALIGNMENT bytes after the + * "reserved" EFI partition: in such cases return the device + * usable capacity. + */ + available = i_size_read(bdev->bd_contains->bd_inode) - + ((EFI_MIN_RESV_SIZE + NEW_START_BLOCK + + PARTITION_END_ALIGNMENT) << SECTOR_BITS); + psize = MAX(available, bdev_capacity(bdev)); + } else { + psize = bdev_capacity(bdev); + } + + return (psize); +} + +static void +vdev_disk_error(zio_t *zio) +{ + /* + * This function can be called in interrupt context, for instance while + * handling IRQs coming from a misbehaving disk device; use printk() + * which is safe from any context. + */ + printk(KERN_WARNING "zio pool=%s vdev=%s error=%d type=%d " + "offset=%llu size=%llu flags=%x\n", spa_name(zio->io_spa), + zio->io_vd->vdev_path, zio->io_error, zio->io_type, + (u_longlong_t)zio->io_offset, (u_longlong_t)zio->io_size, + zio->io_flags); +} + +/* + * Use the Linux 'noop' elevator for zfs managed block devices. 
This + * strikes the ideal balance by allowing the zfs elevator to do all + * request ordering and prioritization. While allowing the Linux + * elevator to do the maximum front/back merging allowed by the + * physical device. This yields the largest possible requests for + * the device with the lowest total overhead. + */ +static void +vdev_elevator_switch(vdev_t *v, char *elevator) +{ + vdev_disk_t *vd = v->vdev_tsd; + struct request_queue *q; + char *device; + int error; + + for (int c = 0; c < v->vdev_children; c++) + vdev_elevator_switch(v->vdev_child[c], elevator); + + if (!v->vdev_ops->vdev_op_leaf || vd->vd_bdev == NULL) + return; + + q = bdev_get_queue(vd->vd_bdev); + device = vd->vd_bdev->bd_disk->disk_name; + + /* + * Skip devices which are not whole disks (partitions). + * Device-mapper devices are excepted since they may be whole + * disks despite the vdev_wholedisk flag, in which case we can + * and should switch the elevator. If the device-mapper device + * does not have an elevator (i.e. dm-raid, dm-crypt, etc.) the + * "Skip devices without schedulers" check below will fail. + */ + if (!v->vdev_wholedisk && strncmp(device, "dm-", 3) != 0) + return; + + /* Leave existing scheduler when set to "none" */ + if ((strncmp(elevator, "none", 4) == 0) && (strlen(elevator) == 4)) + return; + + /* + * The elevator_change() function was available in kernels from + * 2.6.36 to 4.11. When not available fall back to using the user + * mode helper functionality to set the elevator via sysfs. This + * requires /bin/echo and sysfs to be mounted which may not be true + * early in the boot process. + */ +#ifdef HAVE_ELEVATOR_CHANGE + error = elevator_change(q, elevator); +#else +#define SET_SCHEDULER_CMD \ + "exec 0</dev/null " \ + " 1>/sys/block/%s/queue/scheduler " \ + " 2>/dev/null; " \ + "echo %s" + + char *argv[] = { "/bin/sh", "-c", NULL, NULL }; + char *envp[] = { NULL }; + + argv[2] = kmem_asprintf(SET_SCHEDULER_CMD, device, elevator); + error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC); + strfree(argv[2]); +#endif /* HAVE_ELEVATOR_CHANGE */ + if (error) { + zfs_dbgmsg("Unable to set \"%s\" scheduler for %s (%s): %d", + elevator, v->vdev_path, device, error); + } +} + +static int +vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize, + uint64_t *ashift) +{ + struct block_device *bdev; + fmode_t mode = vdev_bdev_mode(spa_mode(v->vdev_spa)); + int count = 0, block_size; + int bdev_retry_count = 50; + vdev_disk_t *vd; + + /* Must have a pathname and it must be absolute. */ + if (v->vdev_path == NULL || v->vdev_path[0] != '/') { + v->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; + vdev_dbgmsg(v, "invalid vdev_path"); + return (SET_ERROR(EINVAL)); + } + + /* + * Reopen the device if it is currently open. When expanding a + * partition force re-scanning the partition table while closed + * in order to get an accurate updated block device size. Then + * since udev may need to recreate the device links increase the + * open retry count before reporting the device as unavailable. 
+ */ + vd = v->vdev_tsd; + if (vd) { + char disk_name[BDEVNAME_SIZE + 6] = "/dev/"; + boolean_t reread_part = B_FALSE; + + rw_enter(&vd->vd_lock, RW_WRITER); + bdev = vd->vd_bdev; + vd->vd_bdev = NULL; + + if (bdev) { + if (v->vdev_expanding && bdev != bdev->bd_contains) { + bdevname(bdev->bd_contains, disk_name + 5); + reread_part = B_TRUE; + } + + vdev_bdev_close(bdev, mode); + } + + if (reread_part) { + bdev = vdev_bdev_open(disk_name, mode, zfs_vdev_holder); + if (!IS_ERR(bdev)) { + int error = vdev_bdev_reread_part(bdev); + vdev_bdev_close(bdev, mode); + if (error == 0) + bdev_retry_count = 100; + } + } + } else { + vd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP); + + rw_init(&vd->vd_lock, NULL, RW_DEFAULT, NULL); + rw_enter(&vd->vd_lock, RW_WRITER); + } + + /* + * Devices are always opened by the path provided at configuration + * time. This means that if the provided path is a udev by-id path + * then drives may be re-cabled without an issue. If the provided + * path is a udev by-path path, then the physical location information + * will be preserved. This can be critical for more complicated + * configurations where drives are located in specific physical + * locations to maximize the systems tolerance to component failure. + * + * Alternatively, you can provide your own udev rule to flexibly map + * the drives as you see fit. It is not advised that you use the + * /dev/[hd]d devices which may be reordered due to probing order. + * Devices in the wrong locations will be detected by the higher + * level vdev validation. + * + * The specified paths may be briefly removed and recreated in + * response to udev events. This should be exceptionally unlikely + * because the zpool command makes every effort to verify these paths + * have already settled prior to reaching this point. Therefore, + * a ENOENT failure at this point is highly likely to be transient + * and it is reasonable to sleep and retry before giving up. In + * practice delays have been observed to be on the order of 100ms. + */ + bdev = ERR_PTR(-ENXIO); + while (IS_ERR(bdev) && count < bdev_retry_count) { + bdev = vdev_bdev_open(v->vdev_path, mode, zfs_vdev_holder); + if (unlikely(PTR_ERR(bdev) == -ENOENT)) { + schedule_timeout(MSEC_TO_TICK(10)); + count++; + } else if (IS_ERR(bdev)) { + break; + } + } + + if (IS_ERR(bdev)) { + int error = -PTR_ERR(bdev); + vdev_dbgmsg(v, "open error=%d count=%d", error, count); + vd->vd_bdev = NULL; + v->vdev_tsd = vd; + rw_exit(&vd->vd_lock); + return (SET_ERROR(error)); + } else { + vd->vd_bdev = bdev; + v->vdev_tsd = vd; + rw_exit(&vd->vd_lock); + } + + struct request_queue *q = bdev_get_queue(vd->vd_bdev); + + /* Determine the physical block size */ + block_size = vdev_bdev_block_size(vd->vd_bdev); + + /* Clear the nowritecache bit, causes vdev_reopen() to try again. */ + v->vdev_nowritecache = B_FALSE; + + /* Set when device reports it supports TRIM. */ + v->vdev_has_trim = !!blk_queue_discard(q); + + /* Set when device reports it supports secure TRIM. 
*/ + v->vdev_has_securetrim = !!blk_queue_discard_secure(q); + + /* Inform the ZIO pipeline that we are non-rotational */ + v->vdev_nonrot = blk_queue_nonrot(q); + + /* Physical volume size in bytes for the partition */ + *psize = bdev_capacity(vd->vd_bdev); + + /* Physical volume size in bytes including possible expansion space */ + *max_psize = bdev_max_capacity(vd->vd_bdev, v->vdev_wholedisk); + + /* Based on the minimum sector size set the block size */ + *ashift = highbit64(MAX(block_size, SPA_MINBLOCKSIZE)) - 1; + + /* Try to set the io scheduler elevator algorithm */ + (void) vdev_elevator_switch(v, zfs_vdev_scheduler); + + return (0); +} + +static void +vdev_disk_close(vdev_t *v) +{ + vdev_disk_t *vd = v->vdev_tsd; + + if (v->vdev_reopening || vd == NULL) + return; + + if (vd->vd_bdev != NULL) { + vdev_bdev_close(vd->vd_bdev, + vdev_bdev_mode(spa_mode(v->vdev_spa))); + } + + rw_destroy(&vd->vd_lock); + kmem_free(vd, sizeof (vdev_disk_t)); + v->vdev_tsd = NULL; +} + +static dio_request_t * +vdev_disk_dio_alloc(int bio_count) +{ + dio_request_t *dr; + int i; + + dr = kmem_zalloc(sizeof (dio_request_t) + + sizeof (struct bio *) * bio_count, KM_SLEEP); + if (dr) { + atomic_set(&dr->dr_ref, 0); + dr->dr_bio_count = bio_count; + dr->dr_error = 0; + + for (i = 0; i < dr->dr_bio_count; i++) + dr->dr_bio[i] = NULL; + } + + return (dr); +} + +static void +vdev_disk_dio_free(dio_request_t *dr) +{ + int i; + + for (i = 0; i < dr->dr_bio_count; i++) + if (dr->dr_bio[i]) + bio_put(dr->dr_bio[i]); + + kmem_free(dr, sizeof (dio_request_t) + + sizeof (struct bio *) * dr->dr_bio_count); +} + +static void +vdev_disk_dio_get(dio_request_t *dr) +{ + atomic_inc(&dr->dr_ref); +} + +static int +vdev_disk_dio_put(dio_request_t *dr) +{ + int rc = atomic_dec_return(&dr->dr_ref); + + /* + * Free the dio_request when the last reference is dropped and + * ensure zio_interpret is called only once with the correct zio + */ + if (rc == 0) { + zio_t *zio = dr->dr_zio; + int error = dr->dr_error; + + vdev_disk_dio_free(dr); + + if (zio) { + zio->io_error = error; + ASSERT3S(zio->io_error, >=, 0); + if (zio->io_error) + vdev_disk_error(zio); + + zio_delay_interrupt(zio); + } + } + + return (rc); +} + +BIO_END_IO_PROTO(vdev_disk_physio_completion, bio, error) +{ + dio_request_t *dr = bio->bi_private; + int rc; + + if (dr->dr_error == 0) { +#ifdef HAVE_1ARG_BIO_END_IO_T + dr->dr_error = BIO_END_IO_ERROR(bio); +#else + if (error) + dr->dr_error = -(error); + else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) + dr->dr_error = EIO; +#endif + } + + /* Drop reference acquired by __vdev_disk_physio */ + rc = vdev_disk_dio_put(dr); +} + +static unsigned int +bio_map(struct bio *bio, void *bio_ptr, unsigned int bio_size) +{ + unsigned int offset, size, i; + struct page *page; + + offset = offset_in_page(bio_ptr); + for (i = 0; i < bio->bi_max_vecs; i++) { + size = PAGE_SIZE - offset; + + if (bio_size <= 0) + break; + + if (size > bio_size) + size = bio_size; + + if (is_vmalloc_addr(bio_ptr)) + page = vmalloc_to_page(bio_ptr); + else + page = virt_to_page(bio_ptr); + + /* + * Some network related block device uses tcp_sendpage, which + * doesn't behave well when using 0-count page, this is a + * safety net to catch them. 
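+ *
+ * Note that if bio_add_page() accepts less than the requested size
+ * the loop terminates early and the unmapped remainder is returned,
+ * leaving the caller to cover it with an additional bio.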
+ */ + ASSERT3S(page_count(page), >, 0); + + if (bio_add_page(bio, page, size, offset) != size) + break; + + bio_ptr += size; + bio_size -= size; + offset = 0; + } + + return (bio_size); +} + +static unsigned int +bio_map_abd_off(struct bio *bio, abd_t *abd, unsigned int size, size_t off) +{ + if (abd_is_linear(abd)) + return (bio_map(bio, ((char *)abd_to_buf(abd)) + off, size)); + + return (abd_scatter_bio_map_off(bio, abd, size, off)); +} + +static inline void +vdev_submit_bio_impl(struct bio *bio) +{ +#ifdef HAVE_1ARG_SUBMIT_BIO + submit_bio(bio); +#else + submit_bio(0, bio); +#endif +} + +#ifdef HAVE_BIO_SET_DEV +#if defined(CONFIG_BLK_CGROUP) && defined(HAVE_BIO_SET_DEV_GPL_ONLY) +/* + * The Linux 5.0 kernel updated the bio_set_dev() macro so it calls the + * GPL-only bio_associate_blkg() symbol thus inadvertently converting + * the entire macro. Provide a minimal version which always assigns the + * request queue's root_blkg to the bio. + */ +static inline void +vdev_bio_associate_blkg(struct bio *bio) +{ + struct request_queue *q = bio->bi_disk->queue; + + ASSERT3P(q, !=, NULL); + ASSERT3P(bio->bi_blkg, ==, NULL); + + if (blkg_tryget(q->root_blkg)) + bio->bi_blkg = q->root_blkg; +} +#define bio_associate_blkg vdev_bio_associate_blkg +#endif +#else +/* + * Provide a bio_set_dev() helper macro for pre-Linux 4.14 kernels. + */ +static inline void +bio_set_dev(struct bio *bio, struct block_device *bdev) +{ + bio->bi_bdev = bdev; +} +#endif /* HAVE_BIO_SET_DEV */ + +static inline void +vdev_submit_bio(struct bio *bio) +{ +#ifdef HAVE_CURRENT_BIO_TAIL + struct bio **bio_tail = current->bio_tail; + current->bio_tail = NULL; + vdev_submit_bio_impl(bio); + current->bio_tail = bio_tail; +#else + struct bio_list *bio_list = current->bio_list; + current->bio_list = NULL; + vdev_submit_bio_impl(bio); + current->bio_list = bio_list; +#endif +} + +static int +__vdev_disk_physio(struct block_device *bdev, zio_t *zio, + size_t io_size, uint64_t io_offset, int rw, int flags) +{ + dio_request_t *dr; + uint64_t abd_offset; + uint64_t bio_offset; + int bio_size, bio_count = 16; + int i = 0, error = 0; +#if defined(HAVE_BLK_QUEUE_HAVE_BLK_PLUG) + struct blk_plug plug; +#endif + /* + * Accessing outside the block device is never allowed. + */ + if (io_offset + io_size > bdev->bd_inode->i_size) { + vdev_dbgmsg(zio->io_vd, + "Illegal access %llu size %llu, device size %llu", + io_offset, io_size, i_size_read(bdev->bd_inode)); + return (SET_ERROR(EIO)); + } + +retry: + dr = vdev_disk_dio_alloc(bio_count); + if (dr == NULL) + return (SET_ERROR(ENOMEM)); + + if (zio && !(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD))) + bio_set_flags_failfast(bdev, &flags); + + dr->dr_zio = zio; + + /* + * When the IO size exceeds the maximum bio size for the request + * queue we are forced to break the IO in multiple bio's and wait + * for them all to complete. Ideally, all pool users will set + * their volume block size to match the maximum request size and + * the common case will be one bio per vdev IO request. + */ + + abd_offset = 0; + bio_offset = io_offset; + bio_size = io_size; + for (i = 0; i <= dr->dr_bio_count; i++) { + + /* Finished constructing bio's for given buffer */ + if (bio_size <= 0) + break; + + /* + * By default only 'bio_count' bio's per dio are allowed. + * However, if we find ourselves in a situation where more + * are needed we allocate a larger dio and warn the user. 
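+ *
+ * The initial dio is sized for 16 bios; each time that proves
+ * insufficient the partially constructed request is freed and the
+ * allocation is retried with twice as many slots.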
+ */ + if (dr->dr_bio_count == i) { + vdev_disk_dio_free(dr); + bio_count *= 2; + goto retry; + } + + /* bio_alloc() with __GFP_WAIT never returns NULL */ + dr->dr_bio[i] = bio_alloc(GFP_NOIO, + MIN(abd_nr_pages_off(zio->io_abd, bio_size, abd_offset), + BIO_MAX_PAGES)); + if (unlikely(dr->dr_bio[i] == NULL)) { + vdev_disk_dio_free(dr); + return (SET_ERROR(ENOMEM)); + } + + /* Matching put called by vdev_disk_physio_completion */ + vdev_disk_dio_get(dr); + + bio_set_dev(dr->dr_bio[i], bdev); + BIO_BI_SECTOR(dr->dr_bio[i]) = bio_offset >> 9; + dr->dr_bio[i]->bi_end_io = vdev_disk_physio_completion; + dr->dr_bio[i]->bi_private = dr; + bio_set_op_attrs(dr->dr_bio[i], rw, flags); + + /* Remaining size is returned to become the new size */ + bio_size = bio_map_abd_off(dr->dr_bio[i], zio->io_abd, + bio_size, abd_offset); + + /* Advance in buffer and construct another bio if needed */ + abd_offset += BIO_BI_SIZE(dr->dr_bio[i]); + bio_offset += BIO_BI_SIZE(dr->dr_bio[i]); + } + + /* Extra reference to protect dio_request during vdev_submit_bio */ + vdev_disk_dio_get(dr); + +#if defined(HAVE_BLK_QUEUE_HAVE_BLK_PLUG) + if (dr->dr_bio_count > 1) + blk_start_plug(&plug); +#endif + + /* Submit all bio's associated with this dio */ + for (i = 0; i < dr->dr_bio_count; i++) + if (dr->dr_bio[i]) + vdev_submit_bio(dr->dr_bio[i]); + +#if defined(HAVE_BLK_QUEUE_HAVE_BLK_PLUG) + if (dr->dr_bio_count > 1) + blk_finish_plug(&plug); +#endif + + (void) vdev_disk_dio_put(dr); + + return (error); +} + +BIO_END_IO_PROTO(vdev_disk_io_flush_completion, bio, error) +{ + zio_t *zio = bio->bi_private; +#ifdef HAVE_1ARG_BIO_END_IO_T + zio->io_error = BIO_END_IO_ERROR(bio); +#else + zio->io_error = -error; +#endif + + if (zio->io_error && (zio->io_error == EOPNOTSUPP)) + zio->io_vd->vdev_nowritecache = B_TRUE; + + bio_put(bio); + ASSERT3S(zio->io_error, >=, 0); + if (zio->io_error) + vdev_disk_error(zio); + zio_interrupt(zio); +} + +static int +vdev_disk_io_flush(struct block_device *bdev, zio_t *zio) +{ + struct request_queue *q; + struct bio *bio; + + q = bdev_get_queue(bdev); + if (!q) + return (SET_ERROR(ENXIO)); + + bio = bio_alloc(GFP_NOIO, 0); + /* bio_alloc() with __GFP_WAIT never returns NULL */ + if (unlikely(bio == NULL)) + return (SET_ERROR(ENOMEM)); + + bio->bi_end_io = vdev_disk_io_flush_completion; + bio->bi_private = zio; + bio_set_dev(bio, bdev); + bio_set_flush(bio); + vdev_submit_bio(bio); + invalidate_bdev(bdev); + + return (0); +} + +static void +vdev_disk_io_start(zio_t *zio) +{ + vdev_t *v = zio->io_vd; + vdev_disk_t *vd = v->vdev_tsd; + unsigned long trim_flags = 0; + int rw, flags, error; + + /* + * If the vdev is closed, it's likely in the REMOVED or FAULTED state. + * Nothing to be done here but return failure. + */ + if (vd == NULL) { + zio->io_error = ENXIO; + zio_interrupt(zio); + return; + } + + rw_enter(&vd->vd_lock, RW_READER); + + /* + * If the vdev is closed, it's likely due to a failed reopen and is + * in the UNAVAIL state. Nothing to be done here but return failure. 
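+ *
+ * vd_lock is held as reader for the duration of the submission, so a
+ * concurrent vdev_disk_open() (which takes it as writer) cannot swap
+ * out vd_bdev underneath us.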
+ */ + if (vd->vd_bdev == NULL) { + rw_exit(&vd->vd_lock); + zio->io_error = ENXIO; + zio_interrupt(zio); + return; + } + + switch (zio->io_type) { + case ZIO_TYPE_IOCTL: + + if (!vdev_readable(v)) { + rw_exit(&vd->vd_lock); + zio->io_error = SET_ERROR(ENXIO); + zio_interrupt(zio); + return; + } + + switch (zio->io_cmd) { + case DKIOCFLUSHWRITECACHE: + + if (zfs_nocacheflush) + break; + + if (v->vdev_nowritecache) { + zio->io_error = SET_ERROR(ENOTSUP); + break; + } + + error = vdev_disk_io_flush(vd->vd_bdev, zio); + if (error == 0) { + rw_exit(&vd->vd_lock); + return; + } + + zio->io_error = error; + + break; + + default: + zio->io_error = SET_ERROR(ENOTSUP); + } + + rw_exit(&vd->vd_lock); + zio_execute(zio); + return; + case ZIO_TYPE_WRITE: + rw = WRITE; +#if defined(HAVE_BLK_QUEUE_HAVE_BIO_RW_UNPLUG) + flags = (1 << BIO_RW_UNPLUG); +#elif defined(REQ_UNPLUG) + flags = REQ_UNPLUG; +#else + flags = 0; +#endif + break; + + case ZIO_TYPE_READ: + rw = READ; +#if defined(HAVE_BLK_QUEUE_HAVE_BIO_RW_UNPLUG) + flags = (1 << BIO_RW_UNPLUG); +#elif defined(REQ_UNPLUG) + flags = REQ_UNPLUG; +#else + flags = 0; +#endif + break; + + case ZIO_TYPE_TRIM: +#if defined(BLKDEV_DISCARD_SECURE) + if (zio->io_trim_flags & ZIO_TRIM_SECURE) + trim_flags |= BLKDEV_DISCARD_SECURE; +#endif + zio->io_error = -blkdev_issue_discard(vd->vd_bdev, + zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS, + trim_flags); + + rw_exit(&vd->vd_lock); + zio_interrupt(zio); + return; + + default: + rw_exit(&vd->vd_lock); + zio->io_error = SET_ERROR(ENOTSUP); + zio_interrupt(zio); + return; + } + + zio->io_target_timestamp = zio_handle_io_delay(zio); + error = __vdev_disk_physio(vd->vd_bdev, zio, + zio->io_size, zio->io_offset, rw, flags); + rw_exit(&vd->vd_lock); + + if (error) { + zio->io_error = error; + zio_interrupt(zio); + return; + } +} + +static void +vdev_disk_io_done(zio_t *zio) +{ + /* + * If the device returned EIO, we revalidate the media. If it is + * determined the media has changed this triggers the asynchronous + * removal of the device from the configuration. + */ + if (zio->io_error == EIO) { + vdev_t *v = zio->io_vd; + vdev_disk_t *vd = v->vdev_tsd; + + if (check_disk_change(vd->vd_bdev)) { + vdev_bdev_invalidate(vd->vd_bdev); + v->vdev_remove_wanted = B_TRUE; + spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE); + } + } +} + +static void +vdev_disk_hold(vdev_t *vd) +{ + ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER)); + + /* We must have a pathname, and it must be absolute. */ + if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') + return; + + /* + * Only prefetch path and devid info if the device has + * never been opened. 
+ */ + if (vd->vdev_tsd != NULL) + return; + + /* XXX: Implement me as a vnode lookup for the device */ + vd->vdev_name_vp = NULL; + vd->vdev_devid_vp = NULL; +} + +static void +vdev_disk_rele(vdev_t *vd) +{ + ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER)); + + /* XXX: Implement me as a vnode rele for the device */ +} + +static int +param_set_vdev_scheduler(const char *val, zfs_kernel_param_t *kp) +{ + spa_t *spa = NULL; + char *p; + + if (val == NULL) + return (SET_ERROR(-EINVAL)); + + if ((p = strchr(val, '\n')) != NULL) + *p = '\0'; + + if (spa_mode_global != 0) { + mutex_enter(&spa_namespace_lock); + while ((spa = spa_next(spa)) != NULL) { + if (spa_state(spa) != POOL_STATE_ACTIVE || + !spa_writeable(spa) || spa_suspended(spa)) + continue; + + spa_open_ref(spa, FTAG); + mutex_exit(&spa_namespace_lock); + vdev_elevator_switch(spa->spa_root_vdev, (char *)val); + mutex_enter(&spa_namespace_lock); + spa_close(spa, FTAG); + } + mutex_exit(&spa_namespace_lock); + } + + return (param_set_charp(val, kp)); +} + +vdev_ops_t vdev_disk_ops = { + .vdev_op_open = vdev_disk_open, + .vdev_op_close = vdev_disk_close, + .vdev_op_asize = vdev_default_asize, + .vdev_op_io_start = vdev_disk_io_start, + .vdev_op_io_done = vdev_disk_io_done, + .vdev_op_state_change = NULL, + .vdev_op_need_resilver = NULL, + .vdev_op_hold = vdev_disk_hold, + .vdev_op_rele = vdev_disk_rele, + .vdev_op_remap = NULL, + .vdev_op_xlate = vdev_default_xlate, + .vdev_op_type = VDEV_TYPE_DISK, /* name of this vdev type */ + .vdev_op_leaf = B_TRUE /* leaf vdev */ +}; + +module_param_call(zfs_vdev_scheduler, param_set_vdev_scheduler, + param_get_charp, &zfs_vdev_scheduler, 0644); +MODULE_PARM_DESC(zfs_vdev_scheduler, "I/O scheduler"); diff --git a/module/os/linux/zfs/vdev_file.c b/module/os/linux/zfs/vdev_file.c new file mode 100644 index 000000000..b79017f3a --- /dev/null +++ b/module/os/linux/zfs/vdev_file.c @@ -0,0 +1,331 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, 2016 by Delphix. All rights reserved. + */ + +#include <sys/zfs_context.h> +#include <sys/spa.h> +#include <sys/spa_impl.h> +#include <sys/vdev_file.h> +#include <sys/vdev_impl.h> +#include <sys/vdev_trim.h> +#include <sys/zio.h> +#include <sys/fs/zfs.h> +#include <sys/fm/fs/zfs.h> +#include <sys/abd.h> +#include <sys/fcntl.h> +#include <sys/vnode.h> + +/* + * Virtual device vector for files. 
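+ *
+ * Reads and writes are serviced by vn_rdwr() calls dispatched to the
+ * z_vdev_file taskq rather than issued directly from the ZIO pipeline,
+ * and cache flush requests map to VOP_FSYNC().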
+ */ + +static taskq_t *vdev_file_taskq; + +static void +vdev_file_hold(vdev_t *vd) +{ + ASSERT(vd->vdev_path != NULL); +} + +static void +vdev_file_rele(vdev_t *vd) +{ + ASSERT(vd->vdev_path != NULL); +} + +static int +vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, + uint64_t *ashift) +{ + vdev_file_t *vf; + vnode_t *vp; + vattr_t vattr; + int error; + + /* + * Rotational optimizations only make sense on block devices. + */ + vd->vdev_nonrot = B_TRUE; + + /* + * Allow TRIM on file based vdevs. This may not always be supported, + * since it depends on your kernel version and underlying filesystem + * type but it is always safe to attempt. + */ + vd->vdev_has_trim = B_TRUE; + + /* + * Disable secure TRIM on file based vdevs. There is no way to + * request this behavior from the underlying filesystem. + */ + vd->vdev_has_securetrim = B_FALSE; + + /* + * We must have a pathname, and it must be absolute. + */ + if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') { + vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; + return (SET_ERROR(EINVAL)); + } + + /* + * Reopen the device if it's not currently open. Otherwise, + * just update the physical size of the device. + */ + if (vd->vdev_tsd != NULL) { + ASSERT(vd->vdev_reopening); + vf = vd->vdev_tsd; + goto skip_open; + } + + vf = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_file_t), KM_SLEEP); + + /* + * We always open the files from the root of the global zone, even if + * we're in a local zone. If the user has gotten to this point, the + * administrator has already decided that the pool should be available + * to local zone users, so the underlying devices should be as well. + */ + ASSERT(vd->vdev_path != NULL && vd->vdev_path[0] == '/'); + error = vn_openat(vd->vdev_path + 1, UIO_SYSSPACE, + spa_mode(vd->vdev_spa) | FOFFMAX, 0, &vp, 0, 0, rootdir, -1); + + if (error) { + vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; + return (error); + } + + vf->vf_vnode = vp; + +#ifdef _KERNEL + /* + * Make sure it's a regular file. + */ + if (vp->v_type != VREG) { + vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; + return (SET_ERROR(ENODEV)); + } +#endif + +skip_open: + /* + * Determine the physical size of the file. + */ + vattr.va_mask = AT_SIZE; + error = VOP_GETATTR(vf->vf_vnode, &vattr, 0, kcred, NULL); + if (error) { + vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; + return (error); + } + + *max_psize = *psize = vattr.va_size; + *ashift = SPA_MINBLOCKSHIFT; + + return (0); +} + +static void +vdev_file_close(vdev_t *vd) +{ + vdev_file_t *vf = vd->vdev_tsd; + + if (vd->vdev_reopening || vf == NULL) + return; + + if (vf->vf_vnode != NULL) { + (void) VOP_PUTPAGE(vf->vf_vnode, 0, 0, B_INVAL, kcred, NULL); + (void) VOP_CLOSE(vf->vf_vnode, spa_mode(vd->vdev_spa), 1, 0, + kcred, NULL); + } + + vd->vdev_delayed_close = B_FALSE; + kmem_free(vf, sizeof (vdev_file_t)); + vd->vdev_tsd = NULL; +} + +static void +vdev_file_io_strategy(void *arg) +{ + zio_t *zio = (zio_t *)arg; + vdev_t *vd = zio->io_vd; + vdev_file_t *vf = vd->vdev_tsd; + ssize_t resid; + void *buf; + + if (zio->io_type == ZIO_TYPE_READ) + buf = abd_borrow_buf(zio->io_abd, zio->io_size); + else + buf = abd_borrow_buf_copy(zio->io_abd, zio->io_size); + + zio->io_error = vn_rdwr(zio->io_type == ZIO_TYPE_READ ? 
+ UIO_READ : UIO_WRITE, vf->vf_vnode, buf, zio->io_size, + zio->io_offset, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid); + + if (zio->io_type == ZIO_TYPE_READ) + abd_return_buf_copy(zio->io_abd, buf, zio->io_size); + else + abd_return_buf(zio->io_abd, buf, zio->io_size); + + if (resid != 0 && zio->io_error == 0) + zio->io_error = SET_ERROR(ENOSPC); + + zio_delay_interrupt(zio); +} + +static void +vdev_file_io_fsync(void *arg) +{ + zio_t *zio = (zio_t *)arg; + vdev_file_t *vf = zio->io_vd->vdev_tsd; + + zio->io_error = VOP_FSYNC(vf->vf_vnode, FSYNC | FDSYNC, kcred, NULL); + + zio_interrupt(zio); +} + +static void +vdev_file_io_start(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + vdev_file_t *vf = vd->vdev_tsd; + + if (zio->io_type == ZIO_TYPE_IOCTL) { + /* XXPOLICY */ + if (!vdev_readable(vd)) { + zio->io_error = SET_ERROR(ENXIO); + zio_interrupt(zio); + return; + } + + switch (zio->io_cmd) { + case DKIOCFLUSHWRITECACHE: + + if (zfs_nocacheflush) + break; + + /* + * We cannot safely call vfs_fsync() when PF_FSTRANS + * is set in the current context. Filesystems like + * XFS include sanity checks to verify it is not + * already set, see xfs_vm_writepage(). Therefore + * the sync must be dispatched to a different context. + */ + if (__spl_pf_fstrans_check()) { + VERIFY3U(taskq_dispatch(vdev_file_taskq, + vdev_file_io_fsync, zio, TQ_SLEEP), !=, + TASKQID_INVALID); + return; + } + + zio->io_error = VOP_FSYNC(vf->vf_vnode, FSYNC | FDSYNC, + kcred, NULL); + break; + default: + zio->io_error = SET_ERROR(ENOTSUP); + } + + zio_execute(zio); + return; + } else if (zio->io_type == ZIO_TYPE_TRIM) { + struct flock flck; + + ASSERT3U(zio->io_size, !=, 0); + bzero(&flck, sizeof (flck)); + flck.l_type = F_FREESP; + flck.l_start = zio->io_offset; + flck.l_len = zio->io_size; + flck.l_whence = SEEK_SET; + + zio->io_error = VOP_SPACE(vf->vf_vnode, F_FREESP, &flck, + 0, 0, kcred, NULL); + + zio_execute(zio); + return; + } + + zio->io_target_timestamp = zio_handle_io_delay(zio); + + VERIFY3U(taskq_dispatch(vdev_file_taskq, vdev_file_io_strategy, zio, + TQ_SLEEP), !=, TASKQID_INVALID); +} + +/* ARGSUSED */ +static void +vdev_file_io_done(zio_t *zio) +{ +} + +vdev_ops_t vdev_file_ops = { + .vdev_op_open = vdev_file_open, + .vdev_op_close = vdev_file_close, + .vdev_op_asize = vdev_default_asize, + .vdev_op_io_start = vdev_file_io_start, + .vdev_op_io_done = vdev_file_io_done, + .vdev_op_state_change = NULL, + .vdev_op_need_resilver = NULL, + .vdev_op_hold = vdev_file_hold, + .vdev_op_rele = vdev_file_rele, + .vdev_op_remap = NULL, + .vdev_op_xlate = vdev_default_xlate, + .vdev_op_type = VDEV_TYPE_FILE, /* name of this vdev type */ + .vdev_op_leaf = B_TRUE /* leaf vdev */ +}; + +void +vdev_file_init(void) +{ + vdev_file_taskq = taskq_create("z_vdev_file", MAX(boot_ncpus, 16), + minclsyspri, boot_ncpus, INT_MAX, TASKQ_DYNAMIC); + + VERIFY(vdev_file_taskq); +} + +void +vdev_file_fini(void) +{ + taskq_destroy(vdev_file_taskq); +} + +/* + * From userland we access disks just like files. 
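+ *
+ * The userland build therefore provides a vdev_disk_ops table which
+ * simply reuses the vdev_file callbacks defined above.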
+ */ +#ifndef _KERNEL + +vdev_ops_t vdev_disk_ops = { + .vdev_op_open = vdev_file_open, + .vdev_op_close = vdev_file_close, + .vdev_op_asize = vdev_default_asize, + .vdev_op_io_start = vdev_file_io_start, + .vdev_op_io_done = vdev_file_io_done, + .vdev_op_state_change = NULL, + .vdev_op_need_resilver = NULL, + .vdev_op_hold = vdev_file_hold, + .vdev_op_rele = vdev_file_rele, + .vdev_op_remap = NULL, + .vdev_op_xlate = vdev_default_xlate, + .vdev_op_type = VDEV_TYPE_DISK, /* name of this vdev type */ + .vdev_op_leaf = B_TRUE /* leaf vdev */ +}; + +#endif diff --git a/module/os/linux/zfs/zfs_acl.c b/module/os/linux/zfs/zfs_acl.c new file mode 100644 index 000000000..26af91e27 --- /dev/null +++ b/module/os/linux/zfs/zfs_acl.c @@ -0,0 +1,2816 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2013 by Delphix. All rights reserved. + */ + + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/time.h> +#include <sys/sysmacros.h> +#include <sys/vfs.h> +#include <sys/vnode.h> +#include <sys/sid.h> +#include <sys/file.h> +#include <sys/stat.h> +#include <sys/kmem.h> +#include <sys/cmn_err.h> +#include <sys/errno.h> +#include <sys/sdt.h> +#include <sys/fs/zfs.h> +#include <sys/mode.h> +#include <sys/policy.h> +#include <sys/zfs_znode.h> +#include <sys/zfs_fuid.h> +#include <sys/zfs_acl.h> +#include <sys/zfs_dir.h> +#include <sys/zfs_vfsops.h> +#include <sys/dmu.h> +#include <sys/dnode.h> +#include <sys/zap.h> +#include <sys/sa.h> +#include <sys/trace_acl.h> +#include <sys/zpl.h> + +#define ALLOW ACE_ACCESS_ALLOWED_ACE_TYPE +#define DENY ACE_ACCESS_DENIED_ACE_TYPE +#define MAX_ACE_TYPE ACE_SYSTEM_ALARM_CALLBACK_OBJECT_ACE_TYPE +#define MIN_ACE_TYPE ALLOW + +#define OWNING_GROUP (ACE_GROUP|ACE_IDENTIFIER_GROUP) +#define EVERYONE_ALLOW_MASK (ACE_READ_ACL|ACE_READ_ATTRIBUTES | \ + ACE_READ_NAMED_ATTRS|ACE_SYNCHRONIZE) +#define EVERYONE_DENY_MASK (ACE_WRITE_ACL|ACE_WRITE_OWNER | \ + ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS) +#define OWNER_ALLOW_MASK (ACE_WRITE_ACL | ACE_WRITE_OWNER | \ + ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS) + +#define ZFS_CHECKED_MASKS (ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_READ_DATA| \ + ACE_READ_NAMED_ATTRS|ACE_WRITE_DATA|ACE_WRITE_ATTRIBUTES| \ + ACE_WRITE_NAMED_ATTRS|ACE_APPEND_DATA|ACE_EXECUTE|ACE_WRITE_OWNER| \ + ACE_WRITE_ACL|ACE_DELETE|ACE_DELETE_CHILD|ACE_SYNCHRONIZE) + +#define WRITE_MASK_DATA (ACE_WRITE_DATA|ACE_APPEND_DATA|ACE_WRITE_NAMED_ATTRS) +#define WRITE_MASK_ATTRS (ACE_WRITE_ACL|ACE_WRITE_OWNER|ACE_WRITE_ATTRIBUTES| \ + ACE_DELETE|ACE_DELETE_CHILD) +#define WRITE_MASK (WRITE_MASK_DATA|WRITE_MASK_ATTRS) + +#define OGE_CLEAR (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \ 
+ ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_EXECUTE) + +#define OKAY_MASK_BITS (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \ + ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_EXECUTE) + +#define ALL_INHERIT (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE | \ + ACE_NO_PROPAGATE_INHERIT_ACE|ACE_INHERIT_ONLY_ACE|ACE_INHERITED_ACE) + +#define RESTRICTED_CLEAR (ACE_WRITE_ACL|ACE_WRITE_OWNER) + +#define V4_ACL_WIDE_FLAGS (ZFS_ACL_AUTO_INHERIT|ZFS_ACL_DEFAULTED|\ + ZFS_ACL_PROTECTED) + +#define ZFS_ACL_WIDE_FLAGS (V4_ACL_WIDE_FLAGS|ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE|\ + ZFS_ACL_OBJ_ACE) + +#define ALL_MODE_EXECS (S_IXUSR | S_IXGRP | S_IXOTH) + +#define IDMAP_WK_CREATOR_OWNER_UID 2147483648U + +static uint16_t +zfs_ace_v0_get_type(void *acep) +{ + return (((zfs_oldace_t *)acep)->z_type); +} + +static uint16_t +zfs_ace_v0_get_flags(void *acep) +{ + return (((zfs_oldace_t *)acep)->z_flags); +} + +static uint32_t +zfs_ace_v0_get_mask(void *acep) +{ + return (((zfs_oldace_t *)acep)->z_access_mask); +} + +static uint64_t +zfs_ace_v0_get_who(void *acep) +{ + return (((zfs_oldace_t *)acep)->z_fuid); +} + +static void +zfs_ace_v0_set_type(void *acep, uint16_t type) +{ + ((zfs_oldace_t *)acep)->z_type = type; +} + +static void +zfs_ace_v0_set_flags(void *acep, uint16_t flags) +{ + ((zfs_oldace_t *)acep)->z_flags = flags; +} + +static void +zfs_ace_v0_set_mask(void *acep, uint32_t mask) +{ + ((zfs_oldace_t *)acep)->z_access_mask = mask; +} + +static void +zfs_ace_v0_set_who(void *acep, uint64_t who) +{ + ((zfs_oldace_t *)acep)->z_fuid = who; +} + +/*ARGSUSED*/ +static size_t +zfs_ace_v0_size(void *acep) +{ + return (sizeof (zfs_oldace_t)); +} + +static size_t +zfs_ace_v0_abstract_size(void) +{ + return (sizeof (zfs_oldace_t)); +} + +static int +zfs_ace_v0_mask_off(void) +{ + return (offsetof(zfs_oldace_t, z_access_mask)); +} + +/*ARGSUSED*/ +static int +zfs_ace_v0_data(void *acep, void **datap) +{ + *datap = NULL; + return (0); +} + +static acl_ops_t zfs_acl_v0_ops = { + .ace_mask_get = zfs_ace_v0_get_mask, + .ace_mask_set = zfs_ace_v0_set_mask, + .ace_flags_get = zfs_ace_v0_get_flags, + .ace_flags_set = zfs_ace_v0_set_flags, + .ace_type_get = zfs_ace_v0_get_type, + .ace_type_set = zfs_ace_v0_set_type, + .ace_who_get = zfs_ace_v0_get_who, + .ace_who_set = zfs_ace_v0_set_who, + .ace_size = zfs_ace_v0_size, + .ace_abstract_size = zfs_ace_v0_abstract_size, + .ace_mask_off = zfs_ace_v0_mask_off, + .ace_data = zfs_ace_v0_data +}; + +static uint16_t +zfs_ace_fuid_get_type(void *acep) +{ + return (((zfs_ace_hdr_t *)acep)->z_type); +} + +static uint16_t +zfs_ace_fuid_get_flags(void *acep) +{ + return (((zfs_ace_hdr_t *)acep)->z_flags); +} + +static uint32_t +zfs_ace_fuid_get_mask(void *acep) +{ + return (((zfs_ace_hdr_t *)acep)->z_access_mask); +} + +static uint64_t +zfs_ace_fuid_get_who(void *args) +{ + uint16_t entry_type; + zfs_ace_t *acep = args; + + entry_type = acep->z_hdr.z_flags & ACE_TYPE_FLAGS; + + if (entry_type == ACE_OWNER || entry_type == OWNING_GROUP || + entry_type == ACE_EVERYONE) + return (-1); + return (((zfs_ace_t *)acep)->z_fuid); +} + +static void +zfs_ace_fuid_set_type(void *acep, uint16_t type) +{ + ((zfs_ace_hdr_t *)acep)->z_type = type; +} + +static void +zfs_ace_fuid_set_flags(void *acep, uint16_t flags) +{ + ((zfs_ace_hdr_t *)acep)->z_flags = flags; +} + +static void +zfs_ace_fuid_set_mask(void *acep, uint32_t mask) +{ + ((zfs_ace_hdr_t *)acep)->z_access_mask = mask; +} + +static void +zfs_ace_fuid_set_who(void *arg, uint64_t who) +{ + zfs_ace_t *acep = arg; + + 
uint16_t entry_type = acep->z_hdr.z_flags & ACE_TYPE_FLAGS; + + if (entry_type == ACE_OWNER || entry_type == OWNING_GROUP || + entry_type == ACE_EVERYONE) + return; + acep->z_fuid = who; +} + +static size_t +zfs_ace_fuid_size(void *acep) +{ + zfs_ace_hdr_t *zacep = acep; + uint16_t entry_type; + + switch (zacep->z_type) { + case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: + case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: + case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: + case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: + return (sizeof (zfs_object_ace_t)); + case ALLOW: + case DENY: + entry_type = + (((zfs_ace_hdr_t *)acep)->z_flags & ACE_TYPE_FLAGS); + if (entry_type == ACE_OWNER || + entry_type == OWNING_GROUP || + entry_type == ACE_EVERYONE) + return (sizeof (zfs_ace_hdr_t)); + /*FALLTHROUGH*/ + default: + return (sizeof (zfs_ace_t)); + } +} + +static size_t +zfs_ace_fuid_abstract_size(void) +{ + return (sizeof (zfs_ace_hdr_t)); +} + +static int +zfs_ace_fuid_mask_off(void) +{ + return (offsetof(zfs_ace_hdr_t, z_access_mask)); +} + +static int +zfs_ace_fuid_data(void *acep, void **datap) +{ + zfs_ace_t *zacep = acep; + zfs_object_ace_t *zobjp; + + switch (zacep->z_hdr.z_type) { + case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: + case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: + case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: + case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: + zobjp = acep; + *datap = (caddr_t)zobjp + sizeof (zfs_ace_t); + return (sizeof (zfs_object_ace_t) - sizeof (zfs_ace_t)); + default: + *datap = NULL; + return (0); + } +} + +static acl_ops_t zfs_acl_fuid_ops = { + .ace_mask_get = zfs_ace_fuid_get_mask, + .ace_mask_set = zfs_ace_fuid_set_mask, + .ace_flags_get = zfs_ace_fuid_get_flags, + .ace_flags_set = zfs_ace_fuid_set_flags, + .ace_type_get = zfs_ace_fuid_get_type, + .ace_type_set = zfs_ace_fuid_set_type, + .ace_who_get = zfs_ace_fuid_get_who, + .ace_who_set = zfs_ace_fuid_set_who, + .ace_size = zfs_ace_fuid_size, + .ace_abstract_size = zfs_ace_fuid_abstract_size, + .ace_mask_off = zfs_ace_fuid_mask_off, + .ace_data = zfs_ace_fuid_data +}; + +/* + * The following three functions are provided for compatibility with + * older ZPL version in order to determine if the file use to have + * an external ACL and what version of ACL previously existed on the + * file. Would really be nice to not need this, sigh. + */ +uint64_t +zfs_external_acl(znode_t *zp) +{ + zfs_acl_phys_t acl_phys; + int error; + + if (zp->z_is_sa) + return (0); + + /* + * Need to deal with a potential + * race where zfs_sa_upgrade could cause + * z_isa_sa to change. + * + * If the lookup fails then the state of z_is_sa should have + * changed. + */ + + if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(ZTOZSB(zp)), + &acl_phys, sizeof (acl_phys))) == 0) + return (acl_phys.z_acl_extern_obj); + else { + /* + * after upgrade the SA_ZPL_ZNODE_ACL should have been + * removed + */ + VERIFY(zp->z_is_sa && error == ENOENT); + return (0); + } +} + +/* + * Determine size of ACL in bytes + * + * This is more complicated than it should be since we have to deal + * with old external ACLs. 
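+ *
+ * For SA based znodes the size and count come from the DACL_ACES and
+ * DACL_COUNT attributes.  Otherwise they are taken from the
+ * znode_acl_phys_t, whose fields are interpreted differently for
+ * ZFS_ACL_VERSION_INITIAL ACLs (see below).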
+ */ +static int +zfs_acl_znode_info(znode_t *zp, int *aclsize, int *aclcount, + zfs_acl_phys_t *aclphys) +{ + zfsvfs_t *zfsvfs = ZTOZSB(zp); + uint64_t acl_count; + int size; + int error; + + ASSERT(MUTEX_HELD(&zp->z_acl_lock)); + if (zp->z_is_sa) { + if ((error = sa_size(zp->z_sa_hdl, SA_ZPL_DACL_ACES(zfsvfs), + &size)) != 0) + return (error); + *aclsize = size; + if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_DACL_COUNT(zfsvfs), + &acl_count, sizeof (acl_count))) != 0) + return (error); + *aclcount = acl_count; + } else { + if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(zfsvfs), + aclphys, sizeof (*aclphys))) != 0) + return (error); + + if (aclphys->z_acl_version == ZFS_ACL_VERSION_INITIAL) { + *aclsize = ZFS_ACL_SIZE(aclphys->z_acl_size); + *aclcount = aclphys->z_acl_size; + } else { + *aclsize = aclphys->z_acl_size; + *aclcount = aclphys->z_acl_count; + } + } + return (0); +} + +int +zfs_znode_acl_version(znode_t *zp) +{ + zfs_acl_phys_t acl_phys; + + if (zp->z_is_sa) + return (ZFS_ACL_VERSION_FUID); + else { + int error; + + /* + * Need to deal with a potential + * race where zfs_sa_upgrade could cause + * z_isa_sa to change. + * + * If the lookup fails then the state of z_is_sa should have + * changed. + */ + if ((error = sa_lookup(zp->z_sa_hdl, + SA_ZPL_ZNODE_ACL(ZTOZSB(zp)), + &acl_phys, sizeof (acl_phys))) == 0) + return (acl_phys.z_acl_version); + else { + /* + * After upgrade SA_ZPL_ZNODE_ACL should have + * been removed. + */ + VERIFY(zp->z_is_sa && error == ENOENT); + return (ZFS_ACL_VERSION_FUID); + } + } +} + +static int +zfs_acl_version(int version) +{ + if (version < ZPL_VERSION_FUID) + return (ZFS_ACL_VERSION_INITIAL); + else + return (ZFS_ACL_VERSION_FUID); +} + +static int +zfs_acl_version_zp(znode_t *zp) +{ + return (zfs_acl_version(ZTOZSB(zp)->z_version)); +} + +zfs_acl_t * +zfs_acl_alloc(int vers) +{ + zfs_acl_t *aclp; + + aclp = kmem_zalloc(sizeof (zfs_acl_t), KM_SLEEP); + list_create(&aclp->z_acl, sizeof (zfs_acl_node_t), + offsetof(zfs_acl_node_t, z_next)); + aclp->z_version = vers; + if (vers == ZFS_ACL_VERSION_FUID) + aclp->z_ops = &zfs_acl_fuid_ops; + else + aclp->z_ops = &zfs_acl_v0_ops; + return (aclp); +} + +zfs_acl_node_t * +zfs_acl_node_alloc(size_t bytes) +{ + zfs_acl_node_t *aclnode; + + aclnode = kmem_zalloc(sizeof (zfs_acl_node_t), KM_SLEEP); + if (bytes) { + aclnode->z_acldata = kmem_alloc(bytes, KM_SLEEP); + aclnode->z_allocdata = aclnode->z_acldata; + aclnode->z_allocsize = bytes; + aclnode->z_size = bytes; + } + + return (aclnode); +} + +static void +zfs_acl_node_free(zfs_acl_node_t *aclnode) +{ + if (aclnode->z_allocsize) + kmem_free(aclnode->z_allocdata, aclnode->z_allocsize); + kmem_free(aclnode, sizeof (zfs_acl_node_t)); +} + +static void +zfs_acl_release_nodes(zfs_acl_t *aclp) +{ + zfs_acl_node_t *aclnode; + + while ((aclnode = list_head(&aclp->z_acl))) { + list_remove(&aclp->z_acl, aclnode); + zfs_acl_node_free(aclnode); + } + aclp->z_acl_count = 0; + aclp->z_acl_bytes = 0; +} + +void +zfs_acl_free(zfs_acl_t *aclp) +{ + zfs_acl_release_nodes(aclp); + list_destroy(&aclp->z_acl); + kmem_free(aclp, sizeof (zfs_acl_t)); +} + +static boolean_t +zfs_acl_valid_ace_type(uint_t type, uint_t flags) +{ + uint16_t entry_type; + + switch (type) { + case ALLOW: + case DENY: + case ACE_SYSTEM_AUDIT_ACE_TYPE: + case ACE_SYSTEM_ALARM_ACE_TYPE: + entry_type = flags & ACE_TYPE_FLAGS; + return (entry_type == ACE_OWNER || + entry_type == OWNING_GROUP || + entry_type == ACE_EVERYONE || entry_type == 0 || + entry_type == ACE_IDENTIFIER_GROUP); + default: + if 
(type >= MIN_ACE_TYPE && type <= MAX_ACE_TYPE) + return (B_TRUE); + } + return (B_FALSE); +} + +static boolean_t +zfs_ace_valid(umode_t obj_mode, zfs_acl_t *aclp, uint16_t type, uint16_t iflags) +{ + /* + * first check type of entry + */ + + if (!zfs_acl_valid_ace_type(type, iflags)) + return (B_FALSE); + + switch (type) { + case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: + case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: + case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: + case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: + if (aclp->z_version < ZFS_ACL_VERSION_FUID) + return (B_FALSE); + aclp->z_hints |= ZFS_ACL_OBJ_ACE; + } + + /* + * next check inheritance level flags + */ + + if (S_ISDIR(obj_mode) && + (iflags & (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE))) + aclp->z_hints |= ZFS_INHERIT_ACE; + + if (iflags & (ACE_INHERIT_ONLY_ACE|ACE_NO_PROPAGATE_INHERIT_ACE)) { + if ((iflags & (ACE_FILE_INHERIT_ACE| + ACE_DIRECTORY_INHERIT_ACE)) == 0) { + return (B_FALSE); + } + } + + return (B_TRUE); +} + +static void * +zfs_acl_next_ace(zfs_acl_t *aclp, void *start, uint64_t *who, + uint32_t *access_mask, uint16_t *iflags, uint16_t *type) +{ + zfs_acl_node_t *aclnode; + + ASSERT(aclp); + + if (start == NULL) { + aclnode = list_head(&aclp->z_acl); + if (aclnode == NULL) + return (NULL); + + aclp->z_next_ace = aclnode->z_acldata; + aclp->z_curr_node = aclnode; + aclnode->z_ace_idx = 0; + } + + aclnode = aclp->z_curr_node; + + if (aclnode == NULL) + return (NULL); + + if (aclnode->z_ace_idx >= aclnode->z_ace_count) { + aclnode = list_next(&aclp->z_acl, aclnode); + if (aclnode == NULL) + return (NULL); + else { + aclp->z_curr_node = aclnode; + aclnode->z_ace_idx = 0; + aclp->z_next_ace = aclnode->z_acldata; + } + } + + if (aclnode->z_ace_idx < aclnode->z_ace_count) { + void *acep = aclp->z_next_ace; + size_t ace_size; + + /* + * Make sure we don't overstep our bounds + */ + ace_size = aclp->z_ops->ace_size(acep); + + if (((caddr_t)acep + ace_size) > + ((caddr_t)aclnode->z_acldata + aclnode->z_size)) { + return (NULL); + } + + *iflags = aclp->z_ops->ace_flags_get(acep); + *type = aclp->z_ops->ace_type_get(acep); + *access_mask = aclp->z_ops->ace_mask_get(acep); + *who = aclp->z_ops->ace_who_get(acep); + aclp->z_next_ace = (caddr_t)aclp->z_next_ace + ace_size; + aclnode->z_ace_idx++; + + return ((void *)acep); + } + return (NULL); +} + +/*ARGSUSED*/ +static uint64_t +zfs_ace_walk(void *datap, uint64_t cookie, int aclcnt, + uint16_t *flags, uint16_t *type, uint32_t *mask) +{ + zfs_acl_t *aclp = datap; + zfs_ace_hdr_t *acep = (zfs_ace_hdr_t *)(uintptr_t)cookie; + uint64_t who; + + acep = zfs_acl_next_ace(aclp, acep, &who, mask, + flags, type); + return ((uint64_t)(uintptr_t)acep); +} + +/* + * Copy ACE to internal ZFS format. + * While processing the ACL each ACE will be validated for correctness. + * ACE FUIDs will be created later. 
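+ *
+ * Each translated entry is checked with zfs_ace_valid() and any user
+ * or group ACE has its who field passed through zfs_fuid_create().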
+ */ +int +zfs_copy_ace_2_fuid(zfsvfs_t *zfsvfs, umode_t obj_mode, zfs_acl_t *aclp, + void *datap, zfs_ace_t *z_acl, uint64_t aclcnt, size_t *size, + zfs_fuid_info_t **fuidp, cred_t *cr) +{ + int i; + uint16_t entry_type; + zfs_ace_t *aceptr = z_acl; + ace_t *acep = datap; + zfs_object_ace_t *zobjacep; + ace_object_t *aceobjp; + + for (i = 0; i != aclcnt; i++) { + aceptr->z_hdr.z_access_mask = acep->a_access_mask; + aceptr->z_hdr.z_flags = acep->a_flags; + aceptr->z_hdr.z_type = acep->a_type; + entry_type = aceptr->z_hdr.z_flags & ACE_TYPE_FLAGS; + if (entry_type != ACE_OWNER && entry_type != OWNING_GROUP && + entry_type != ACE_EVERYONE) { + aceptr->z_fuid = zfs_fuid_create(zfsvfs, acep->a_who, + cr, (entry_type == 0) ? + ZFS_ACE_USER : ZFS_ACE_GROUP, fuidp); + } + + /* + * Make sure ACE is valid + */ + if (zfs_ace_valid(obj_mode, aclp, aceptr->z_hdr.z_type, + aceptr->z_hdr.z_flags) != B_TRUE) + return (SET_ERROR(EINVAL)); + + switch (acep->a_type) { + case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: + case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: + case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: + case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: + zobjacep = (zfs_object_ace_t *)aceptr; + aceobjp = (ace_object_t *)acep; + + bcopy(aceobjp->a_obj_type, zobjacep->z_object_type, + sizeof (aceobjp->a_obj_type)); + bcopy(aceobjp->a_inherit_obj_type, + zobjacep->z_inherit_type, + sizeof (aceobjp->a_inherit_obj_type)); + acep = (ace_t *)((caddr_t)acep + sizeof (ace_object_t)); + break; + default: + acep = (ace_t *)((caddr_t)acep + sizeof (ace_t)); + } + + aceptr = (zfs_ace_t *)((caddr_t)aceptr + + aclp->z_ops->ace_size(aceptr)); + } + + *size = (caddr_t)aceptr - (caddr_t)z_acl; + + return (0); +} + +/* + * Copy ZFS ACEs to fixed size ace_t layout + */ +static void +zfs_copy_fuid_2_ace(zfsvfs_t *zfsvfs, zfs_acl_t *aclp, cred_t *cr, + void *datap, int filter) +{ + uint64_t who; + uint32_t access_mask; + uint16_t iflags, type; + zfs_ace_hdr_t *zacep = NULL; + ace_t *acep = datap; + ace_object_t *objacep; + zfs_object_ace_t *zobjacep; + size_t ace_size; + uint16_t entry_type; + + while ((zacep = zfs_acl_next_ace(aclp, zacep, + &who, &access_mask, &iflags, &type))) { + + switch (type) { + case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: + case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: + case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: + case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: + if (filter) { + continue; + } + zobjacep = (zfs_object_ace_t *)zacep; + objacep = (ace_object_t *)acep; + bcopy(zobjacep->z_object_type, + objacep->a_obj_type, + sizeof (zobjacep->z_object_type)); + bcopy(zobjacep->z_inherit_type, + objacep->a_inherit_obj_type, + sizeof (zobjacep->z_inherit_type)); + ace_size = sizeof (ace_object_t); + break; + default: + ace_size = sizeof (ace_t); + break; + } + + entry_type = (iflags & ACE_TYPE_FLAGS); + if ((entry_type != ACE_OWNER && + entry_type != OWNING_GROUP && + entry_type != ACE_EVERYONE)) { + acep->a_who = zfs_fuid_map_id(zfsvfs, who, + cr, (entry_type & ACE_IDENTIFIER_GROUP) ? 
+ ZFS_ACE_GROUP : ZFS_ACE_USER); + } else { + acep->a_who = (uid_t)(int64_t)who; + } + acep->a_access_mask = access_mask; + acep->a_flags = iflags; + acep->a_type = type; + acep = (ace_t *)((caddr_t)acep + ace_size); + } +} + +static int +zfs_copy_ace_2_oldace(umode_t obj_mode, zfs_acl_t *aclp, ace_t *acep, + zfs_oldace_t *z_acl, int aclcnt, size_t *size) +{ + int i; + zfs_oldace_t *aceptr = z_acl; + + for (i = 0; i != aclcnt; i++, aceptr++) { + aceptr->z_access_mask = acep[i].a_access_mask; + aceptr->z_type = acep[i].a_type; + aceptr->z_flags = acep[i].a_flags; + aceptr->z_fuid = acep[i].a_who; + /* + * Make sure ACE is valid + */ + if (zfs_ace_valid(obj_mode, aclp, aceptr->z_type, + aceptr->z_flags) != B_TRUE) + return (SET_ERROR(EINVAL)); + } + *size = (caddr_t)aceptr - (caddr_t)z_acl; + return (0); +} + +/* + * convert old ACL format to new + */ +void +zfs_acl_xform(znode_t *zp, zfs_acl_t *aclp, cred_t *cr) +{ + zfs_oldace_t *oldaclp; + int i; + uint16_t type, iflags; + uint32_t access_mask; + uint64_t who; + void *cookie = NULL; + zfs_acl_node_t *newaclnode; + + ASSERT(aclp->z_version == ZFS_ACL_VERSION_INITIAL); + /* + * First create the ACE in a contiguous piece of memory + * for zfs_copy_ace_2_fuid(). + * + * We only convert an ACL once, so this won't happen + * every time. + */ + oldaclp = kmem_alloc(sizeof (zfs_oldace_t) * aclp->z_acl_count, + KM_SLEEP); + i = 0; + while ((cookie = zfs_acl_next_ace(aclp, cookie, &who, + &access_mask, &iflags, &type))) { + oldaclp[i].z_flags = iflags; + oldaclp[i].z_type = type; + oldaclp[i].z_fuid = who; + oldaclp[i++].z_access_mask = access_mask; + } + + newaclnode = zfs_acl_node_alloc(aclp->z_acl_count * + sizeof (zfs_object_ace_t)); + aclp->z_ops = &zfs_acl_fuid_ops; + VERIFY(zfs_copy_ace_2_fuid(ZTOZSB(zp), ZTOI(zp)->i_mode, + aclp, oldaclp, newaclnode->z_acldata, aclp->z_acl_count, + &newaclnode->z_size, NULL, cr) == 0); + newaclnode->z_ace_count = aclp->z_acl_count; + aclp->z_version = ZFS_ACL_VERSION; + kmem_free(oldaclp, aclp->z_acl_count * sizeof (zfs_oldace_t)); + + /* + * Release all previous ACL nodes + */ + + zfs_acl_release_nodes(aclp); + + list_insert_head(&aclp->z_acl, newaclnode); + + aclp->z_acl_bytes = newaclnode->z_size; + aclp->z_acl_count = newaclnode->z_ace_count; + +} + +/* + * Convert unix access mask to v4 access mask + */ +static uint32_t +zfs_unix_to_v4(uint32_t access_mask) +{ + uint32_t new_mask = 0; + + if (access_mask & S_IXOTH) + new_mask |= ACE_EXECUTE; + if (access_mask & S_IWOTH) + new_mask |= ACE_WRITE_DATA; + if (access_mask & S_IROTH) + new_mask |= ACE_READ_DATA; + return (new_mask); +} + +static void +zfs_set_ace(zfs_acl_t *aclp, void *acep, uint32_t access_mask, + uint16_t access_type, uint64_t fuid, uint16_t entry_type) +{ + uint16_t type = entry_type & ACE_TYPE_FLAGS; + + aclp->z_ops->ace_mask_set(acep, access_mask); + aclp->z_ops->ace_type_set(acep, access_type); + aclp->z_ops->ace_flags_set(acep, entry_type); + if ((type != ACE_OWNER && type != OWNING_GROUP && + type != ACE_EVERYONE)) + aclp->z_ops->ace_who_set(acep, fuid); +} + +/* + * Determine mode of file based on ACL. 
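+ * The owner@, group@ and everyone@ allow/deny entries are folded into
+ * the corresponding rwx mode bits, and ZFS_NO_EXECS_DENIED is updated
+ * to record whether any ACE denies execute.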
+ * Also, create FUIDs for any User/Group ACEs + */ +uint64_t +zfs_mode_compute(uint64_t fmode, zfs_acl_t *aclp, + uint64_t *pflags, uint64_t fuid, uint64_t fgid) +{ + int entry_type; + mode_t mode; + mode_t seen = 0; + zfs_ace_hdr_t *acep = NULL; + uint64_t who; + uint16_t iflags, type; + uint32_t access_mask; + boolean_t an_exec_denied = B_FALSE; + + mode = (fmode & (S_IFMT | S_ISUID | S_ISGID | S_ISVTX)); + + while ((acep = zfs_acl_next_ace(aclp, acep, &who, + &access_mask, &iflags, &type))) { + + if (!zfs_acl_valid_ace_type(type, iflags)) + continue; + + entry_type = (iflags & ACE_TYPE_FLAGS); + + /* + * Skip over owner@, group@ or everyone@ inherit only ACEs + */ + if ((iflags & ACE_INHERIT_ONLY_ACE) && + (entry_type == ACE_OWNER || entry_type == ACE_EVERYONE || + entry_type == OWNING_GROUP)) + continue; + + if (entry_type == ACE_OWNER || (entry_type == 0 && + who == fuid)) { + if ((access_mask & ACE_READ_DATA) && + (!(seen & S_IRUSR))) { + seen |= S_IRUSR; + if (type == ALLOW) { + mode |= S_IRUSR; + } + } + if ((access_mask & ACE_WRITE_DATA) && + (!(seen & S_IWUSR))) { + seen |= S_IWUSR; + if (type == ALLOW) { + mode |= S_IWUSR; + } + } + if ((access_mask & ACE_EXECUTE) && + (!(seen & S_IXUSR))) { + seen |= S_IXUSR; + if (type == ALLOW) { + mode |= S_IXUSR; + } + } + } else if (entry_type == OWNING_GROUP || + (entry_type == ACE_IDENTIFIER_GROUP && who == fgid)) { + if ((access_mask & ACE_READ_DATA) && + (!(seen & S_IRGRP))) { + seen |= S_IRGRP; + if (type == ALLOW) { + mode |= S_IRGRP; + } + } + if ((access_mask & ACE_WRITE_DATA) && + (!(seen & S_IWGRP))) { + seen |= S_IWGRP; + if (type == ALLOW) { + mode |= S_IWGRP; + } + } + if ((access_mask & ACE_EXECUTE) && + (!(seen & S_IXGRP))) { + seen |= S_IXGRP; + if (type == ALLOW) { + mode |= S_IXGRP; + } + } + } else if (entry_type == ACE_EVERYONE) { + if ((access_mask & ACE_READ_DATA)) { + if (!(seen & S_IRUSR)) { + seen |= S_IRUSR; + if (type == ALLOW) { + mode |= S_IRUSR; + } + } + if (!(seen & S_IRGRP)) { + seen |= S_IRGRP; + if (type == ALLOW) { + mode |= S_IRGRP; + } + } + if (!(seen & S_IROTH)) { + seen |= S_IROTH; + if (type == ALLOW) { + mode |= S_IROTH; + } + } + } + if ((access_mask & ACE_WRITE_DATA)) { + if (!(seen & S_IWUSR)) { + seen |= S_IWUSR; + if (type == ALLOW) { + mode |= S_IWUSR; + } + } + if (!(seen & S_IWGRP)) { + seen |= S_IWGRP; + if (type == ALLOW) { + mode |= S_IWGRP; + } + } + if (!(seen & S_IWOTH)) { + seen |= S_IWOTH; + if (type == ALLOW) { + mode |= S_IWOTH; + } + } + } + if ((access_mask & ACE_EXECUTE)) { + if (!(seen & S_IXUSR)) { + seen |= S_IXUSR; + if (type == ALLOW) { + mode |= S_IXUSR; + } + } + if (!(seen & S_IXGRP)) { + seen |= S_IXGRP; + if (type == ALLOW) { + mode |= S_IXGRP; + } + } + if (!(seen & S_IXOTH)) { + seen |= S_IXOTH; + if (type == ALLOW) { + mode |= S_IXOTH; + } + } + } + } else { + /* + * Only care if this IDENTIFIER_GROUP or + * USER ACE denies execute access to someone, + * mode is not affected + */ + if ((access_mask & ACE_EXECUTE) && type == DENY) + an_exec_denied = B_TRUE; + } + } + + /* + * Failure to allow is effectively a deny, so execute permission + * is denied if it was never mentioned or if we explicitly + * weren't allowed it. + */ + if (!an_exec_denied && + ((seen & ALL_MODE_EXECS) != ALL_MODE_EXECS || + (mode & ALL_MODE_EXECS) != ALL_MODE_EXECS)) + an_exec_denied = B_TRUE; + + if (an_exec_denied) + *pflags &= ~ZFS_NO_EXECS_DENIED; + else + *pflags |= ZFS_NO_EXECS_DENIED; + + return (mode); +} + +/* + * Read an external acl object. 
If the intent is to modify, always + * create a new acl and leave any cached acl in place. + */ +int +zfs_acl_node_read(struct znode *zp, boolean_t have_lock, zfs_acl_t **aclpp, + boolean_t will_modify) +{ + zfs_acl_t *aclp; + int aclsize = 0; + int acl_count = 0; + zfs_acl_node_t *aclnode; + zfs_acl_phys_t znode_acl; + int version; + int error; + boolean_t drop_lock = B_FALSE; + + ASSERT(MUTEX_HELD(&zp->z_acl_lock)); + + if (zp->z_acl_cached && !will_modify) { + *aclpp = zp->z_acl_cached; + return (0); + } + + /* + * close race where znode could be upgrade while trying to + * read the znode attributes. + * + * But this could only happen if the file isn't already an SA + * znode + */ + if (!zp->z_is_sa && !have_lock) { + mutex_enter(&zp->z_lock); + drop_lock = B_TRUE; + } + version = zfs_znode_acl_version(zp); + + if ((error = zfs_acl_znode_info(zp, &aclsize, + &acl_count, &znode_acl)) != 0) { + goto done; + } + + aclp = zfs_acl_alloc(version); + + aclp->z_acl_count = acl_count; + aclp->z_acl_bytes = aclsize; + + aclnode = zfs_acl_node_alloc(aclsize); + aclnode->z_ace_count = aclp->z_acl_count; + aclnode->z_size = aclsize; + + if (!zp->z_is_sa) { + if (znode_acl.z_acl_extern_obj) { + error = dmu_read(ZTOZSB(zp)->z_os, + znode_acl.z_acl_extern_obj, 0, aclnode->z_size, + aclnode->z_acldata, DMU_READ_PREFETCH); + } else { + bcopy(znode_acl.z_ace_data, aclnode->z_acldata, + aclnode->z_size); + } + } else { + error = sa_lookup(zp->z_sa_hdl, SA_ZPL_DACL_ACES(ZTOZSB(zp)), + aclnode->z_acldata, aclnode->z_size); + } + + if (error != 0) { + zfs_acl_free(aclp); + zfs_acl_node_free(aclnode); + /* convert checksum errors into IO errors */ + if (error == ECKSUM) + error = SET_ERROR(EIO); + goto done; + } + + list_insert_head(&aclp->z_acl, aclnode); + + *aclpp = aclp; + if (!will_modify) + zp->z_acl_cached = aclp; +done: + if (drop_lock) + mutex_exit(&zp->z_lock); + return (error); +} + +/*ARGSUSED*/ +void +zfs_acl_data_locator(void **dataptr, uint32_t *length, uint32_t buflen, + boolean_t start, void *userdata) +{ + zfs_acl_locator_cb_t *cb = (zfs_acl_locator_cb_t *)userdata; + + if (start) { + cb->cb_acl_node = list_head(&cb->cb_aclp->z_acl); + } else { + cb->cb_acl_node = list_next(&cb->cb_aclp->z_acl, + cb->cb_acl_node); + } + *dataptr = cb->cb_acl_node->z_acldata; + *length = cb->cb_acl_node->z_size; +} + +int +zfs_acl_chown_setattr(znode_t *zp) +{ + int error; + zfs_acl_t *aclp; + + if (ZTOZSB(zp)->z_acl_type == ZFS_ACLTYPE_POSIXACL) + return (0); + + ASSERT(MUTEX_HELD(&zp->z_lock)); + ASSERT(MUTEX_HELD(&zp->z_acl_lock)); + + error = zfs_acl_node_read(zp, B_TRUE, &aclp, B_FALSE); + if (error == 0 && aclp->z_acl_count > 0) + zp->z_mode = ZTOI(zp)->i_mode = + zfs_mode_compute(zp->z_mode, aclp, + &zp->z_pflags, KUID_TO_SUID(ZTOI(zp)->i_uid), + KGID_TO_SGID(ZTOI(zp)->i_gid)); + + /* + * Some ZFS implementations (ZEVO) create neither a ZNODE_ACL + * nor a DACL_ACES SA in which case ENOENT is returned from + * zfs_acl_node_read() when the SA can't be located. + * Allow chown/chgrp to succeed in these cases rather than + * returning an error that makes no sense in the context of + * the caller. 
+ */ + if (error == ENOENT) + return (0); + + return (error); +} + +static void +acl_trivial_access_masks(mode_t mode, uint32_t *allow0, uint32_t *deny1, + uint32_t *deny2, uint32_t *owner, uint32_t *group, uint32_t *everyone) +{ + *deny1 = *deny2 = *allow0 = *group = 0; + + if (!(mode & S_IRUSR) && (mode & (S_IRGRP|S_IROTH))) + *deny1 |= ACE_READ_DATA; + if (!(mode & S_IWUSR) && (mode & (S_IWGRP|S_IWOTH))) + *deny1 |= ACE_WRITE_DATA; + if (!(mode & S_IXUSR) && (mode & (S_IXGRP|S_IXOTH))) + *deny1 |= ACE_EXECUTE; + + if (!(mode & S_IRGRP) && (mode & S_IROTH)) + *deny2 = ACE_READ_DATA; + if (!(mode & S_IWGRP) && (mode & S_IWOTH)) + *deny2 |= ACE_WRITE_DATA; + if (!(mode & S_IXGRP) && (mode & S_IXOTH)) + *deny2 |= ACE_EXECUTE; + + if ((mode & S_IRUSR) && (!(mode & S_IRGRP) && (mode & S_IROTH))) + *allow0 |= ACE_READ_DATA; + if ((mode & S_IWUSR) && (!(mode & S_IWGRP) && (mode & S_IWOTH))) + *allow0 |= ACE_WRITE_DATA; + if ((mode & S_IXUSR) && (!(mode & S_IXGRP) && (mode & S_IXOTH))) + *allow0 |= ACE_EXECUTE; + + *owner = ACE_WRITE_ATTRIBUTES|ACE_WRITE_OWNER|ACE_WRITE_ACL| + ACE_WRITE_NAMED_ATTRS|ACE_READ_ACL|ACE_READ_ATTRIBUTES| + ACE_READ_NAMED_ATTRS|ACE_SYNCHRONIZE; + if (mode & S_IRUSR) + *owner |= ACE_READ_DATA; + if (mode & S_IWUSR) + *owner |= ACE_WRITE_DATA|ACE_APPEND_DATA; + if (mode & S_IXUSR) + *owner |= ACE_EXECUTE; + + *group = ACE_READ_ACL|ACE_READ_ATTRIBUTES| ACE_READ_NAMED_ATTRS| + ACE_SYNCHRONIZE; + if (mode & S_IRGRP) + *group |= ACE_READ_DATA; + if (mode & S_IWGRP) + *group |= ACE_WRITE_DATA|ACE_APPEND_DATA; + if (mode & S_IXGRP) + *group |= ACE_EXECUTE; + + *everyone = ACE_READ_ACL|ACE_READ_ATTRIBUTES| ACE_READ_NAMED_ATTRS| + ACE_SYNCHRONIZE; + if (mode & S_IROTH) + *everyone |= ACE_READ_DATA; + if (mode & S_IWOTH) + *everyone |= ACE_WRITE_DATA|ACE_APPEND_DATA; + if (mode & S_IXOTH) + *everyone |= ACE_EXECUTE; +} + +/* + * ace_trivial: + * determine whether an ace_t acl is trivial + * + * Trivialness implies that the acl is composed of only + * owner, group, everyone entries. ACL can't + * have read_acl denied, and write_owner/write_acl/write_attributes + * can only be owner@ entry. + */ +static int +ace_trivial_common(void *acep, int aclcnt, + uint64_t (*walk)(void *, uint64_t, int aclcnt, + uint16_t *, uint16_t *, uint32_t *)) +{ + uint16_t flags; + uint32_t mask; + uint16_t type; + uint64_t cookie = 0; + + while ((cookie = walk(acep, cookie, aclcnt, &flags, &type, &mask))) { + switch (flags & ACE_TYPE_FLAGS) { + case ACE_OWNER: + case ACE_GROUP|ACE_IDENTIFIER_GROUP: + case ACE_EVERYONE: + break; + default: + return (1); + } + + if (flags & (ACE_FILE_INHERIT_ACE| + ACE_DIRECTORY_INHERIT_ACE|ACE_NO_PROPAGATE_INHERIT_ACE| + ACE_INHERIT_ONLY_ACE)) + return (1); + + /* + * Special check for some special bits + * + * Don't allow anybody to deny reading basic + * attributes or a files ACL. + */ + if ((mask & (ACE_READ_ACL|ACE_READ_ATTRIBUTES)) && + (type == ACE_ACCESS_DENIED_ACE_TYPE)) + return (1); + + /* + * Delete permissions are never set by default + */ + if (mask & (ACE_DELETE|ACE_DELETE_CHILD)) + return (1); + /* + * only allow owner@ to have + * write_acl/write_owner/write_attributes/write_xattr/ + */ + if (type == ACE_ACCESS_ALLOWED_ACE_TYPE && + (!(flags & ACE_OWNER) && (mask & + (ACE_WRITE_OWNER|ACE_WRITE_ACL| ACE_WRITE_ATTRIBUTES| + ACE_WRITE_NAMED_ATTRS)))) + return (1); + + } + + return (0); +} + +/* + * common code for setting ACLs. + * + * This function is called from zfs_mode_update, zfs_perm_init, and zfs_setacl. 
+ * zfs_setacl passes a non-NULL inherit pointer (ihp) to indicate that it's + * already checked the acl and knows whether to inherit. + */ +int +zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, cred_t *cr, dmu_tx_t *tx) +{ + int error; + zfsvfs_t *zfsvfs = ZTOZSB(zp); + dmu_object_type_t otype; + zfs_acl_locator_cb_t locate = { 0 }; + uint64_t mode; + sa_bulk_attr_t bulk[5]; + uint64_t ctime[2]; + int count = 0; + zfs_acl_phys_t acl_phys; + + mode = zp->z_mode; + + mode = zfs_mode_compute(mode, aclp, &zp->z_pflags, + KUID_TO_SUID(ZTOI(zp)->i_uid), KGID_TO_SGID(ZTOI(zp)->i_gid)); + + zp->z_mode = ZTOI(zp)->i_mode = mode; + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, + &mode, sizeof (mode)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, + &zp->z_pflags, sizeof (zp->z_pflags)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, + &ctime, sizeof (ctime)); + + if (zp->z_acl_cached) { + zfs_acl_free(zp->z_acl_cached); + zp->z_acl_cached = NULL; + } + + /* + * Upgrade needed? + */ + if (!zfsvfs->z_use_fuids) { + otype = DMU_OT_OLDACL; + } else { + if ((aclp->z_version == ZFS_ACL_VERSION_INITIAL) && + (zfsvfs->z_version >= ZPL_VERSION_FUID)) + zfs_acl_xform(zp, aclp, cr); + ASSERT(aclp->z_version >= ZFS_ACL_VERSION_FUID); + otype = DMU_OT_ACL; + } + + /* + * Arrgh, we have to handle old on disk format + * as well as newer (preferred) SA format. + */ + + if (zp->z_is_sa) { /* the easy case, just update the ACL attribute */ + locate.cb_aclp = aclp; + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_DACL_ACES(zfsvfs), + zfs_acl_data_locator, &locate, aclp->z_acl_bytes); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_DACL_COUNT(zfsvfs), + NULL, &aclp->z_acl_count, sizeof (uint64_t)); + } else { /* Painful legacy way */ + zfs_acl_node_t *aclnode; + uint64_t off = 0; + uint64_t aoid; + + if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(zfsvfs), + &acl_phys, sizeof (acl_phys))) != 0) + return (error); + + aoid = acl_phys.z_acl_extern_obj; + + if (aclp->z_acl_bytes > ZFS_ACE_SPACE) { + /* + * If ACL was previously external and we are now + * converting to new ACL format then release old + * ACL object and create a new one. + */ + if (aoid && + aclp->z_version != acl_phys.z_acl_version) { + error = dmu_object_free(zfsvfs->z_os, aoid, tx); + if (error) + return (error); + aoid = 0; + } + if (aoid == 0) { + aoid = dmu_object_alloc(zfsvfs->z_os, + otype, aclp->z_acl_bytes, + otype == DMU_OT_ACL ? + DMU_OT_SYSACL : DMU_OT_NONE, + otype == DMU_OT_ACL ? + DN_OLD_MAX_BONUSLEN : 0, tx); + } else { + (void) dmu_object_set_blocksize(zfsvfs->z_os, + aoid, aclp->z_acl_bytes, 0, tx); + } + acl_phys.z_acl_extern_obj = aoid; + for (aclnode = list_head(&aclp->z_acl); aclnode; + aclnode = list_next(&aclp->z_acl, aclnode)) { + if (aclnode->z_ace_count == 0) + continue; + dmu_write(zfsvfs->z_os, aoid, off, + aclnode->z_size, aclnode->z_acldata, tx); + off += aclnode->z_size; + } + } else { + void *start = acl_phys.z_ace_data; + /* + * Migrating back embedded? + */ + if (acl_phys.z_acl_extern_obj) { + error = dmu_object_free(zfsvfs->z_os, + acl_phys.z_acl_extern_obj, tx); + if (error) + return (error); + acl_phys.z_acl_extern_obj = 0; + } + + for (aclnode = list_head(&aclp->z_acl); aclnode; + aclnode = list_next(&aclp->z_acl, aclnode)) { + if (aclnode->z_ace_count == 0) + continue; + bcopy(aclnode->z_acldata, start, + aclnode->z_size); + start = (caddr_t)start + aclnode->z_size; + } + } + /* + * If Old version then swap count/bytes to match old + * layout of znode_acl_phys_t. 
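+ * (z_acl_size then holds the ACE count and z_acl_count the byte size).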
+ */ + if (aclp->z_version == ZFS_ACL_VERSION_INITIAL) { + acl_phys.z_acl_size = aclp->z_acl_count; + acl_phys.z_acl_count = aclp->z_acl_bytes; + } else { + acl_phys.z_acl_size = aclp->z_acl_bytes; + acl_phys.z_acl_count = aclp->z_acl_count; + } + acl_phys.z_acl_version = aclp->z_version; + + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ZNODE_ACL(zfsvfs), NULL, + &acl_phys, sizeof (acl_phys)); + } + + /* + * Replace ACL wide bits, but first clear them. + */ + zp->z_pflags &= ~ZFS_ACL_WIDE_FLAGS; + + zp->z_pflags |= aclp->z_hints; + + if (ace_trivial_common(aclp, 0, zfs_ace_walk) == 0) + zp->z_pflags |= ZFS_ACL_TRIVIAL; + + zfs_tstamp_update_setup(zp, STATE_CHANGED, NULL, ctime); + return (sa_bulk_update(zp->z_sa_hdl, bulk, count, tx)); +} + +static void +zfs_acl_chmod(zfsvfs_t *zfsvfs, uint64_t mode, zfs_acl_t *aclp) +{ + void *acep = NULL; + uint64_t who; + int new_count, new_bytes; + int ace_size; + int entry_type; + uint16_t iflags, type; + uint32_t access_mask; + zfs_acl_node_t *newnode; + size_t abstract_size = aclp->z_ops->ace_abstract_size(); + void *zacep; + uint32_t owner, group, everyone; + uint32_t deny1, deny2, allow0; + + new_count = new_bytes = 0; + + acl_trivial_access_masks((mode_t)mode, &allow0, &deny1, &deny2, + &owner, &group, &everyone); + + newnode = zfs_acl_node_alloc((abstract_size * 6) + aclp->z_acl_bytes); + + zacep = newnode->z_acldata; + if (allow0) { + zfs_set_ace(aclp, zacep, allow0, ALLOW, -1, ACE_OWNER); + zacep = (void *)((uintptr_t)zacep + abstract_size); + new_count++; + new_bytes += abstract_size; + } + if (deny1) { + zfs_set_ace(aclp, zacep, deny1, DENY, -1, ACE_OWNER); + zacep = (void *)((uintptr_t)zacep + abstract_size); + new_count++; + new_bytes += abstract_size; + } + if (deny2) { + zfs_set_ace(aclp, zacep, deny2, DENY, -1, OWNING_GROUP); + zacep = (void *)((uintptr_t)zacep + abstract_size); + new_count++; + new_bytes += abstract_size; + } + + while ((acep = zfs_acl_next_ace(aclp, acep, &who, &access_mask, + &iflags, &type))) { + uint16_t inherit_flags; + + entry_type = (iflags & ACE_TYPE_FLAGS); + inherit_flags = (iflags & ALL_INHERIT); + + if ((entry_type == ACE_OWNER || entry_type == ACE_EVERYONE || + (entry_type == OWNING_GROUP)) && + ((inherit_flags & ACE_INHERIT_ONLY_ACE) == 0)) { + continue; + } + + if ((type != ALLOW && type != DENY) || + (inherit_flags & ACE_INHERIT_ONLY_ACE)) { + if (inherit_flags) + aclp->z_hints |= ZFS_INHERIT_ACE; + switch (type) { + case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: + case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: + case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: + case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: + aclp->z_hints |= ZFS_ACL_OBJ_ACE; + break; + } + } else { + + /* + * Limit permissions to be no greater than + * group permissions + */ + if (zfsvfs->z_acl_inherit == ZFS_ACL_RESTRICTED) { + if (!(mode & S_IRGRP)) + access_mask &= ~ACE_READ_DATA; + if (!(mode & S_IWGRP)) + access_mask &= + ~(ACE_WRITE_DATA|ACE_APPEND_DATA); + if (!(mode & S_IXGRP)) + access_mask &= ~ACE_EXECUTE; + access_mask &= + ~(ACE_WRITE_OWNER|ACE_WRITE_ACL| + ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS); + } + } + zfs_set_ace(aclp, zacep, access_mask, type, who, iflags); + ace_size = aclp->z_ops->ace_size(acep); + zacep = (void *)((uintptr_t)zacep + ace_size); + new_count++; + new_bytes += ace_size; + } + zfs_set_ace(aclp, zacep, owner, 0, -1, ACE_OWNER); + zacep = (void *)((uintptr_t)zacep + abstract_size); + zfs_set_ace(aclp, zacep, group, 0, -1, OWNING_GROUP); + zacep = (void *)((uintptr_t)zacep + abstract_size); + zfs_set_ace(aclp, zacep, everyone, 0, -1, 
ACE_EVERYONE); + + new_count += 3; + new_bytes += abstract_size * 3; + zfs_acl_release_nodes(aclp); + aclp->z_acl_count = new_count; + aclp->z_acl_bytes = new_bytes; + newnode->z_ace_count = new_count; + newnode->z_size = new_bytes; + list_insert_tail(&aclp->z_acl, newnode); +} + +void +zfs_acl_chmod_setattr(znode_t *zp, zfs_acl_t **aclp, uint64_t mode) +{ + mutex_enter(&zp->z_acl_lock); + mutex_enter(&zp->z_lock); + *aclp = zfs_acl_alloc(zfs_acl_version_zp(zp)); + (*aclp)->z_hints = zp->z_pflags & V4_ACL_WIDE_FLAGS; + zfs_acl_chmod(ZTOZSB(zp), mode, *aclp); + mutex_exit(&zp->z_lock); + mutex_exit(&zp->z_acl_lock); + ASSERT(*aclp); +} + +/* + * strip off write_owner and write_acl + */ +static void +zfs_restricted_update(zfsvfs_t *zfsvfs, zfs_acl_t *aclp, void *acep) +{ + uint32_t mask = aclp->z_ops->ace_mask_get(acep); + + if ((zfsvfs->z_acl_inherit == ZFS_ACL_RESTRICTED) && + (aclp->z_ops->ace_type_get(acep) == ALLOW)) { + mask &= ~RESTRICTED_CLEAR; + aclp->z_ops->ace_mask_set(acep, mask); + } +} + +/* + * Should ACE be inherited? + */ +static int +zfs_ace_can_use(umode_t obj_mode, uint16_t acep_flags) +{ + int iflags = (acep_flags & 0xf); + + if (S_ISDIR(obj_mode) && (iflags & ACE_DIRECTORY_INHERIT_ACE)) + return (1); + else if (iflags & ACE_FILE_INHERIT_ACE) + return (!(S_ISDIR(obj_mode) && + (iflags & ACE_NO_PROPAGATE_INHERIT_ACE))); + return (0); +} + +/* + * inherit inheritable ACEs from parent + */ +static zfs_acl_t * +zfs_acl_inherit(zfsvfs_t *zfsvfs, umode_t obj_mode, zfs_acl_t *paclp, + uint64_t mode, boolean_t *need_chmod) +{ + void *pacep; + void *acep; + zfs_acl_node_t *aclnode; + zfs_acl_t *aclp = NULL; + uint64_t who; + uint32_t access_mask; + uint16_t iflags, newflags, type; + size_t ace_size; + void *data1, *data2; + size_t data1sz, data2sz; + boolean_t vdir = S_ISDIR(obj_mode); + boolean_t vreg = S_ISREG(obj_mode); + boolean_t passthrough, passthrough_x, noallow; + + passthrough_x = + zfsvfs->z_acl_inherit == ZFS_ACL_PASSTHROUGH_X; + passthrough = passthrough_x || + zfsvfs->z_acl_inherit == ZFS_ACL_PASSTHROUGH; + noallow = + zfsvfs->z_acl_inherit == ZFS_ACL_NOALLOW; + + *need_chmod = B_TRUE; + pacep = NULL; + aclp = zfs_acl_alloc(paclp->z_version); + if (zfsvfs->z_acl_inherit == ZFS_ACL_DISCARD || S_ISLNK(obj_mode)) + return (aclp); + while ((pacep = zfs_acl_next_ace(paclp, pacep, &who, + &access_mask, &iflags, &type))) { + + /* + * don't inherit bogus ACEs + */ + if (!zfs_acl_valid_ace_type(type, iflags)) + continue; + + if (noallow && type == ALLOW) + continue; + + ace_size = aclp->z_ops->ace_size(pacep); + + if (!zfs_ace_can_use(obj_mode, iflags)) + continue; + + /* + * If owner@, group@, or everyone@ inheritable + * then zfs_acl_chmod() isn't needed. 
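+ * For example, with aclinherit=passthrough an inheritable owner@ ACE
+ * landing on a regular file clears need_chmod, so the caller skips
+ * appending the mode-derived entries in zfs_acl_chmod().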
+ */ + if (passthrough && + ((iflags & (ACE_OWNER|ACE_EVERYONE)) || + ((iflags & OWNING_GROUP) == + OWNING_GROUP)) && (vreg || (vdir && (iflags & + ACE_DIRECTORY_INHERIT_ACE)))) { + *need_chmod = B_FALSE; + } + + if (!vdir && passthrough_x && + ((mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0)) { + access_mask &= ~ACE_EXECUTE; + } + + aclnode = zfs_acl_node_alloc(ace_size); + list_insert_tail(&aclp->z_acl, aclnode); + acep = aclnode->z_acldata; + + zfs_set_ace(aclp, acep, access_mask, type, + who, iflags|ACE_INHERITED_ACE); + + /* + * Copy special opaque data if any + */ + if ((data1sz = paclp->z_ops->ace_data(pacep, &data1)) != 0) { + VERIFY((data2sz = aclp->z_ops->ace_data(acep, + &data2)) == data1sz); + bcopy(data1, data2, data2sz); + } + + aclp->z_acl_count++; + aclnode->z_ace_count++; + aclp->z_acl_bytes += aclnode->z_size; + newflags = aclp->z_ops->ace_flags_get(acep); + + if (vdir) + aclp->z_hints |= ZFS_INHERIT_ACE; + + if ((iflags & ACE_NO_PROPAGATE_INHERIT_ACE) || !vdir) { + newflags &= ~ALL_INHERIT; + aclp->z_ops->ace_flags_set(acep, + newflags|ACE_INHERITED_ACE); + zfs_restricted_update(zfsvfs, aclp, acep); + continue; + } + + ASSERT(vdir); + + /* + * If only FILE_INHERIT is set then turn on + * inherit_only + */ + if ((iflags & (ACE_FILE_INHERIT_ACE | + ACE_DIRECTORY_INHERIT_ACE)) == ACE_FILE_INHERIT_ACE) { + newflags |= ACE_INHERIT_ONLY_ACE; + aclp->z_ops->ace_flags_set(acep, + newflags|ACE_INHERITED_ACE); + } else { + newflags &= ~ACE_INHERIT_ONLY_ACE; + aclp->z_ops->ace_flags_set(acep, + newflags|ACE_INHERITED_ACE); + } + } + return (aclp); +} + +/* + * Create file system object initial permissions + * including inheritable ACEs. + */ +int +zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr, + vsecattr_t *vsecp, zfs_acl_ids_t *acl_ids) +{ + int error; + zfsvfs_t *zfsvfs = ZTOZSB(dzp); + zfs_acl_t *paclp; + gid_t gid = vap->va_gid; + boolean_t need_chmod = B_TRUE; + boolean_t inherited = B_FALSE; + + bzero(acl_ids, sizeof (zfs_acl_ids_t)); + acl_ids->z_mode = vap->va_mode; + + if (vsecp) + if ((error = zfs_vsec_2_aclp(zfsvfs, vap->va_mode, vsecp, + cr, &acl_ids->z_fuidp, &acl_ids->z_aclp)) != 0) + return (error); + + acl_ids->z_fuid = vap->va_uid; + acl_ids->z_fgid = vap->va_gid; +#ifdef HAVE_KSID + /* + * Determine uid and gid. 
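+ * For the root node, during replay, or for an extended attribute
+ * directory the uid and gid come straight from the supplied vattr.
+ * Otherwise the uid is derived from the credential, and the gid from
+ * the vattr, a set-gid parent directory, or the credential, in that
+ * order of preference.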
+ */ + if ((flag & IS_ROOT_NODE) || zfsvfs->z_replay || + ((flag & IS_XATTR) && (S_ISDIR(vap->va_mode)))) { + acl_ids->z_fuid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_uid, + cr, ZFS_OWNER, &acl_ids->z_fuidp); + acl_ids->z_fgid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid, + cr, ZFS_GROUP, &acl_ids->z_fuidp); + gid = vap->va_gid; + } else { + acl_ids->z_fuid = zfs_fuid_create_cred(zfsvfs, ZFS_OWNER, + cr, &acl_ids->z_fuidp); + acl_ids->z_fgid = 0; + if (vap->va_mask & AT_GID) { + acl_ids->z_fgid = zfs_fuid_create(zfsvfs, + (uint64_t)vap->va_gid, + cr, ZFS_GROUP, &acl_ids->z_fuidp); + gid = vap->va_gid; + if (acl_ids->z_fgid != KGID_TO_SGID(ZTOI(dzp)->i_gid) && + !groupmember(vap->va_gid, cr) && + secpolicy_vnode_create_gid(cr) != 0) + acl_ids->z_fgid = 0; + } + if (acl_ids->z_fgid == 0) { + if (dzp->z_mode & S_ISGID) { + char *domain; + uint32_t rid; + + acl_ids->z_fgid = KGID_TO_SGID( + ZTOI(dzp)->i_gid); + gid = zfs_fuid_map_id(zfsvfs, acl_ids->z_fgid, + cr, ZFS_GROUP); + + if (zfsvfs->z_use_fuids && + IS_EPHEMERAL(acl_ids->z_fgid)) { + domain = zfs_fuid_idx_domain( + &zfsvfs->z_fuid_idx, + FUID_INDEX(acl_ids->z_fgid)); + rid = FUID_RID(acl_ids->z_fgid); + zfs_fuid_node_add(&acl_ids->z_fuidp, + domain, rid, + FUID_INDEX(acl_ids->z_fgid), + acl_ids->z_fgid, ZFS_GROUP); + } + } else { + acl_ids->z_fgid = zfs_fuid_create_cred(zfsvfs, + ZFS_GROUP, cr, &acl_ids->z_fuidp); + gid = crgetgid(cr); + } + } + } +#endif /* HAVE_KSID */ + + /* + * If we're creating a directory, and the parent directory has the + * set-GID bit set, set in on the new directory. + * Otherwise, if the user is neither privileged nor a member of the + * file's new group, clear the file's set-GID bit. + */ + + if (!(flag & IS_ROOT_NODE) && (dzp->z_mode & S_ISGID) && + (S_ISDIR(vap->va_mode))) { + acl_ids->z_mode |= S_ISGID; + } else { + if ((acl_ids->z_mode & S_ISGID) && + secpolicy_vnode_setids_setgids(cr, gid) != 0) + acl_ids->z_mode &= ~S_ISGID; + } + + if (acl_ids->z_aclp == NULL) { + mutex_enter(&dzp->z_acl_lock); + mutex_enter(&dzp->z_lock); + if (!(flag & IS_ROOT_NODE) && (S_ISDIR(ZTOI(dzp)->i_mode) && + (dzp->z_pflags & ZFS_INHERIT_ACE)) && + !(dzp->z_pflags & ZFS_XATTR)) { + VERIFY(0 == zfs_acl_node_read(dzp, B_TRUE, + &paclp, B_FALSE)); + acl_ids->z_aclp = zfs_acl_inherit(zfsvfs, + vap->va_mode, paclp, acl_ids->z_mode, &need_chmod); + inherited = B_TRUE; + } else { + acl_ids->z_aclp = + zfs_acl_alloc(zfs_acl_version_zp(dzp)); + acl_ids->z_aclp->z_hints |= ZFS_ACL_TRIVIAL; + } + mutex_exit(&dzp->z_lock); + mutex_exit(&dzp->z_acl_lock); + if (need_chmod) { + acl_ids->z_aclp->z_hints |= S_ISDIR(vap->va_mode) ? 
+ ZFS_ACL_AUTO_INHERIT : 0; + zfs_acl_chmod(zfsvfs, acl_ids->z_mode, acl_ids->z_aclp); + } + } + + if (inherited || vsecp) { + acl_ids->z_mode = zfs_mode_compute(acl_ids->z_mode, + acl_ids->z_aclp, &acl_ids->z_aclp->z_hints, + acl_ids->z_fuid, acl_ids->z_fgid); + if (ace_trivial_common(acl_ids->z_aclp, 0, zfs_ace_walk) == 0) + acl_ids->z_aclp->z_hints |= ZFS_ACL_TRIVIAL; + } + + return (0); +} + +/* + * Free ACL and fuid_infop, but not the acl_ids structure + */ +void +zfs_acl_ids_free(zfs_acl_ids_t *acl_ids) +{ + if (acl_ids->z_aclp) + zfs_acl_free(acl_ids->z_aclp); + if (acl_ids->z_fuidp) + zfs_fuid_info_free(acl_ids->z_fuidp); + acl_ids->z_aclp = NULL; + acl_ids->z_fuidp = NULL; +} + +boolean_t +zfs_acl_ids_overquota(zfsvfs_t *zv, zfs_acl_ids_t *acl_ids, uint64_t projid) +{ + return (zfs_id_overquota(zv, DMU_USERUSED_OBJECT, acl_ids->z_fuid) || + zfs_id_overquota(zv, DMU_GROUPUSED_OBJECT, acl_ids->z_fgid) || + (projid != ZFS_DEFAULT_PROJID && projid != ZFS_INVALID_PROJID && + zfs_id_overquota(zv, DMU_PROJECTUSED_OBJECT, projid))); +} + +/* + * Retrieve a file's ACL + */ +int +zfs_getacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr) +{ + zfs_acl_t *aclp; + ulong_t mask; + int error; + int count = 0; + int largeace = 0; + + mask = vsecp->vsa_mask & (VSA_ACE | VSA_ACECNT | + VSA_ACE_ACLFLAGS | VSA_ACE_ALLTYPES); + + if (mask == 0) + return (SET_ERROR(ENOSYS)); + + if ((error = zfs_zaccess(zp, ACE_READ_ACL, 0, skipaclchk, cr))) + return (error); + + mutex_enter(&zp->z_acl_lock); + + error = zfs_acl_node_read(zp, B_FALSE, &aclp, B_FALSE); + if (error != 0) { + mutex_exit(&zp->z_acl_lock); + return (error); + } + + /* + * Scan ACL to determine number of ACEs + */ + if ((zp->z_pflags & ZFS_ACL_OBJ_ACE) && !(mask & VSA_ACE_ALLTYPES)) { + void *zacep = NULL; + uint64_t who; + uint32_t access_mask; + uint16_t type, iflags; + + while ((zacep = zfs_acl_next_ace(aclp, zacep, + &who, &access_mask, &iflags, &type))) { + switch (type) { + case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: + case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: + case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: + case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: + largeace++; + continue; + default: + count++; + } + } + vsecp->vsa_aclcnt = count; + } else + count = (int)aclp->z_acl_count; + + if (mask & VSA_ACECNT) { + vsecp->vsa_aclcnt = count; + } + + if (mask & VSA_ACE) { + size_t aclsz; + + aclsz = count * sizeof (ace_t) + + sizeof (ace_object_t) * largeace; + + vsecp->vsa_aclentp = kmem_alloc(aclsz, KM_SLEEP); + vsecp->vsa_aclentsz = aclsz; + + if (aclp->z_version == ZFS_ACL_VERSION_FUID) + zfs_copy_fuid_2_ace(ZTOZSB(zp), aclp, cr, + vsecp->vsa_aclentp, !(mask & VSA_ACE_ALLTYPES)); + else { + zfs_acl_node_t *aclnode; + void *start = vsecp->vsa_aclentp; + + for (aclnode = list_head(&aclp->z_acl); aclnode; + aclnode = list_next(&aclp->z_acl, aclnode)) { + bcopy(aclnode->z_acldata, start, + aclnode->z_size); + start = (caddr_t)start + aclnode->z_size; + } + ASSERT((caddr_t)start - (caddr_t)vsecp->vsa_aclentp == + aclp->z_acl_bytes); + } + } + if (mask & VSA_ACE_ACLFLAGS) { + vsecp->vsa_aclflags = 0; + if (zp->z_pflags & ZFS_ACL_DEFAULTED) + vsecp->vsa_aclflags |= ACL_DEFAULTED; + if (zp->z_pflags & ZFS_ACL_PROTECTED) + vsecp->vsa_aclflags |= ACL_PROTECTED; + if (zp->z_pflags & ZFS_ACL_AUTO_INHERIT) + vsecp->vsa_aclflags |= ACL_AUTO_INHERIT; + } + + mutex_exit(&zp->z_acl_lock); + + return (0); +} + +int +zfs_vsec_2_aclp(zfsvfs_t *zfsvfs, umode_t obj_mode, + vsecattr_t *vsecp, cred_t *cr, zfs_fuid_info_t **fuidp, zfs_acl_t **zaclp) +{ + 
zfs_acl_t *aclp; + zfs_acl_node_t *aclnode; + int aclcnt = vsecp->vsa_aclcnt; + int error; + + if (vsecp->vsa_aclcnt > MAX_ACL_ENTRIES || vsecp->vsa_aclcnt <= 0) + return (SET_ERROR(EINVAL)); + + aclp = zfs_acl_alloc(zfs_acl_version(zfsvfs->z_version)); + + aclp->z_hints = 0; + aclnode = zfs_acl_node_alloc(aclcnt * sizeof (zfs_object_ace_t)); + if (aclp->z_version == ZFS_ACL_VERSION_INITIAL) { + if ((error = zfs_copy_ace_2_oldace(obj_mode, aclp, + (ace_t *)vsecp->vsa_aclentp, aclnode->z_acldata, + aclcnt, &aclnode->z_size)) != 0) { + zfs_acl_free(aclp); + zfs_acl_node_free(aclnode); + return (error); + } + } else { + if ((error = zfs_copy_ace_2_fuid(zfsvfs, obj_mode, aclp, + vsecp->vsa_aclentp, aclnode->z_acldata, aclcnt, + &aclnode->z_size, fuidp, cr)) != 0) { + zfs_acl_free(aclp); + zfs_acl_node_free(aclnode); + return (error); + } + } + aclp->z_acl_bytes = aclnode->z_size; + aclnode->z_ace_count = aclcnt; + aclp->z_acl_count = aclcnt; + list_insert_head(&aclp->z_acl, aclnode); + + /* + * If flags are being set then add them to z_hints + */ + if (vsecp->vsa_mask & VSA_ACE_ACLFLAGS) { + if (vsecp->vsa_aclflags & ACL_PROTECTED) + aclp->z_hints |= ZFS_ACL_PROTECTED; + if (vsecp->vsa_aclflags & ACL_DEFAULTED) + aclp->z_hints |= ZFS_ACL_DEFAULTED; + if (vsecp->vsa_aclflags & ACL_AUTO_INHERIT) + aclp->z_hints |= ZFS_ACL_AUTO_INHERIT; + } + + *zaclp = aclp; + + return (0); +} + +/* + * Set a file's ACL + */ +int +zfs_setacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr) +{ + zfsvfs_t *zfsvfs = ZTOZSB(zp); + zilog_t *zilog = zfsvfs->z_log; + ulong_t mask = vsecp->vsa_mask & (VSA_ACE | VSA_ACECNT); + dmu_tx_t *tx; + int error; + zfs_acl_t *aclp; + zfs_fuid_info_t *fuidp = NULL; + boolean_t fuid_dirtied; + uint64_t acl_obj; + + if (mask == 0) + return (SET_ERROR(ENOSYS)); + + if (zp->z_pflags & ZFS_IMMUTABLE) + return (SET_ERROR(EPERM)); + + if ((error = zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr))) + return (error); + + error = zfs_vsec_2_aclp(zfsvfs, ZTOI(zp)->i_mode, vsecp, cr, &fuidp, + &aclp); + if (error) + return (error); + + /* + * If ACL wide flags aren't being set then preserve any + * existing flags. 
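+ * For example, an existing ZFS_ACL_AUTO_INHERIT hint on the znode is
+ * carried over into the new ACL when the caller did not supply
+ * VSA_ACE_ACLFLAGS.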
+ */ + if (!(vsecp->vsa_mask & VSA_ACE_ACLFLAGS)) { + aclp->z_hints |= + (zp->z_pflags & V4_ACL_WIDE_FLAGS); + } +top: + mutex_enter(&zp->z_acl_lock); + mutex_enter(&zp->z_lock); + + tx = dmu_tx_create(zfsvfs->z_os); + + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); + + fuid_dirtied = zfsvfs->z_fuid_dirty; + if (fuid_dirtied) + zfs_fuid_txhold(zfsvfs, tx); + + /* + * If old version and ACL won't fit in bonus and we aren't + * upgrading then take out necessary DMU holds + */ + + if ((acl_obj = zfs_external_acl(zp)) != 0) { + if (zfsvfs->z_version >= ZPL_VERSION_FUID && + zfs_znode_acl_version(zp) <= ZFS_ACL_VERSION_INITIAL) { + dmu_tx_hold_free(tx, acl_obj, 0, + DMU_OBJECT_END); + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, + aclp->z_acl_bytes); + } else { + dmu_tx_hold_write(tx, acl_obj, 0, aclp->z_acl_bytes); + } + } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) { + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, aclp->z_acl_bytes); + } + + zfs_sa_upgrade_txholds(tx, zp); + error = dmu_tx_assign(tx, TXG_NOWAIT); + if (error) { + mutex_exit(&zp->z_acl_lock); + mutex_exit(&zp->z_lock); + + if (error == ERESTART) { + dmu_tx_wait(tx); + dmu_tx_abort(tx); + goto top; + } + dmu_tx_abort(tx); + zfs_acl_free(aclp); + return (error); + } + + error = zfs_aclset_common(zp, aclp, cr, tx); + ASSERT(error == 0); + ASSERT(zp->z_acl_cached == NULL); + zp->z_acl_cached = aclp; + + if (fuid_dirtied) + zfs_fuid_sync(zfsvfs, tx); + + zfs_log_acl(zilog, tx, zp, vsecp, fuidp); + + if (fuidp) + zfs_fuid_info_free(fuidp); + dmu_tx_commit(tx); + + mutex_exit(&zp->z_lock); + mutex_exit(&zp->z_acl_lock); + + return (error); +} + +/* + * Check accesses of interest (AoI) against attributes of the dataset + * such as read-only. Returns zero if no AoI conflict with dataset + * attributes, otherwise an appropriate errno is returned. + */ +static int +zfs_zaccess_dataset_check(znode_t *zp, uint32_t v4_mode) +{ + if ((v4_mode & WRITE_MASK) && (zfs_is_readonly(ZTOZSB(zp))) && + (!S_ISDEV(ZTOI(zp)->i_mode) || + (S_ISDEV(ZTOI(zp)->i_mode) && (v4_mode & WRITE_MASK_ATTRS)))) { + return (SET_ERROR(EROFS)); + } + + /* + * Only check for READONLY on non-directories. + */ + if ((v4_mode & WRITE_MASK_DATA) && + ((!S_ISDIR(ZTOI(zp)->i_mode) && + (zp->z_pflags & (ZFS_READONLY | ZFS_IMMUTABLE))) || + (S_ISDIR(ZTOI(zp)->i_mode) && + (zp->z_pflags & ZFS_IMMUTABLE)))) { + return (SET_ERROR(EPERM)); + } + + if ((v4_mode & (ACE_DELETE | ACE_DELETE_CHILD)) && + (zp->z_pflags & ZFS_NOUNLINK)) { + return (SET_ERROR(EPERM)); + } + + if (((v4_mode & (ACE_READ_DATA|ACE_EXECUTE)) && + (zp->z_pflags & ZFS_AV_QUARANTINED))) { + return (SET_ERROR(EACCES)); + } + + return (0); +} + +/* + * The primary usage of this function is to loop through all of the + * ACEs in the znode, determining what accesses of interest (AoI) to + * the caller are allowed or denied. The AoI are expressed as bits in + * the working_mode parameter. As each ACE is processed, bits covered + * by that ACE are removed from the working_mode. This removal + * facilitates two things. The first is that when the working mode is + * empty (= 0), we know we've looked at all the AoI. The second is + * that the ACE interpretation rules don't allow a later ACE to undo + * something granted or denied by an earlier ACE. Removing the + * discovered access or denial enforces this rule. At the end of + * processing the ACEs, all AoI that were found to be denied are + * placed into the working_mode, giving the caller a mask of denied + * accesses. 
Returns: + * 0 if all AoI granted + * EACCES if the denied mask is non-zero + * other error if abnormal failure (e.g., IO error) + * + * A secondary usage of the function is to determine if any of the + * AoI are granted. If an ACE grants any access in + * the working_mode, we immediately short circuit out of the function. + * This mode is chosen by setting anyaccess to B_TRUE. The + * working_mode is not a denied access mask upon exit if the function + * is used in this manner. + */ +static int +zfs_zaccess_aces_check(znode_t *zp, uint32_t *working_mode, + boolean_t anyaccess, cred_t *cr) +{ + zfsvfs_t *zfsvfs = ZTOZSB(zp); + zfs_acl_t *aclp; + int error; + uid_t uid = crgetuid(cr); + uint64_t who; + uint16_t type, iflags; + uint16_t entry_type; + uint32_t access_mask; + uint32_t deny_mask = 0; + zfs_ace_hdr_t *acep = NULL; + boolean_t checkit; + uid_t gowner; + uid_t fowner; + + zfs_fuid_map_ids(zp, cr, &fowner, &gowner); + + mutex_enter(&zp->z_acl_lock); + + error = zfs_acl_node_read(zp, B_FALSE, &aclp, B_FALSE); + if (error != 0) { + mutex_exit(&zp->z_acl_lock); + return (error); + } + + ASSERT(zp->z_acl_cached); + + while ((acep = zfs_acl_next_ace(aclp, acep, &who, &access_mask, + &iflags, &type))) { + uint32_t mask_matched; + + if (!zfs_acl_valid_ace_type(type, iflags)) + continue; + + if (S_ISDIR(ZTOI(zp)->i_mode) && + (iflags & ACE_INHERIT_ONLY_ACE)) + continue; + + /* Skip ACE if it does not affect any AoI */ + mask_matched = (access_mask & *working_mode); + if (!mask_matched) + continue; + + entry_type = (iflags & ACE_TYPE_FLAGS); + + checkit = B_FALSE; + + switch (entry_type) { + case ACE_OWNER: + if (uid == fowner) + checkit = B_TRUE; + break; + case OWNING_GROUP: + who = gowner; + /*FALLTHROUGH*/ + case ACE_IDENTIFIER_GROUP: + checkit = zfs_groupmember(zfsvfs, who, cr); + break; + case ACE_EVERYONE: + checkit = B_TRUE; + break; + + /* USER Entry */ + default: + if (entry_type == 0) { + uid_t newid; + + newid = zfs_fuid_map_id(zfsvfs, who, cr, + ZFS_ACE_USER); + if (newid != IDMAP_WK_CREATOR_OWNER_UID && + uid == newid) + checkit = B_TRUE; + break; + } else { + mutex_exit(&zp->z_acl_lock); + return (SET_ERROR(EIO)); + } + } + + if (checkit) { + if (type == DENY) { + DTRACE_PROBE3(zfs__ace__denies, + znode_t *, zp, + zfs_ace_hdr_t *, acep, + uint32_t, mask_matched); + deny_mask |= mask_matched; + } else { + DTRACE_PROBE3(zfs__ace__allows, + znode_t *, zp, + zfs_ace_hdr_t *, acep, + uint32_t, mask_matched); + if (anyaccess) { + mutex_exit(&zp->z_acl_lock); + return (0); + } + } + *working_mode &= ~mask_matched; + } + + /* Are we done? */ + if (*working_mode == 0) + break; + } + + mutex_exit(&zp->z_acl_lock); + + /* Put the found 'denies' back on the working mode */ + if (deny_mask) { + *working_mode |= deny_mask; + return (SET_ERROR(EACCES)); + } else if (*working_mode) { + return (-1); + } + + return (0); +} + +/* + * Return true if any access whatsoever granted, we don't actually + * care what access is granted. 
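+ * zfs_zaccess_aces_check() is therefore called with anyaccess set to
+ * B_TRUE, so the first matching ALLOW ace short-circuits with success;
+ * only if no access is granted that way do we fall back to
+ * secpolicy_vnode_any_access() against the owner.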
+ */ +boolean_t +zfs_has_access(znode_t *zp, cred_t *cr) +{ + uint32_t have = ACE_ALL_PERMS; + + if (zfs_zaccess_aces_check(zp, &have, B_TRUE, cr) != 0) { + uid_t owner; + + owner = zfs_fuid_map_id(ZTOZSB(zp), + KUID_TO_SUID(ZTOI(zp)->i_uid), cr, ZFS_OWNER); + return (secpolicy_vnode_any_access(cr, ZTOI(zp), owner) == 0); + } + return (B_TRUE); +} + +static int +zfs_zaccess_common(znode_t *zp, uint32_t v4_mode, uint32_t *working_mode, + boolean_t *check_privs, boolean_t skipaclchk, cred_t *cr) +{ + zfsvfs_t *zfsvfs = ZTOZSB(zp); + int err; + + *working_mode = v4_mode; + *check_privs = B_TRUE; + + /* + * Short circuit empty requests + */ + if (v4_mode == 0 || zfsvfs->z_replay) { + *working_mode = 0; + return (0); + } + + if ((err = zfs_zaccess_dataset_check(zp, v4_mode)) != 0) { + *check_privs = B_FALSE; + return (err); + } + + /* + * The caller requested that the ACL check be skipped. This + * would only happen if the caller checked VOP_ACCESS() with a + * 32 bit ACE mask and already had the appropriate permissions. + */ + if (skipaclchk) { + *working_mode = 0; + return (0); + } + + return (zfs_zaccess_aces_check(zp, working_mode, B_FALSE, cr)); +} + +static int +zfs_zaccess_append(znode_t *zp, uint32_t *working_mode, boolean_t *check_privs, + cred_t *cr) +{ + if (*working_mode != ACE_WRITE_DATA) + return (SET_ERROR(EACCES)); + + return (zfs_zaccess_common(zp, ACE_APPEND_DATA, working_mode, + check_privs, B_FALSE, cr)); +} + +int +zfs_fastaccesschk_execute(znode_t *zdp, cred_t *cr) +{ + boolean_t owner = B_FALSE; + boolean_t groupmbr = B_FALSE; + boolean_t is_attr; + uid_t uid = crgetuid(cr); + int error; + + if (zdp->z_pflags & ZFS_AV_QUARANTINED) + return (SET_ERROR(EACCES)); + + is_attr = ((zdp->z_pflags & ZFS_XATTR) && + (S_ISDIR(ZTOI(zdp)->i_mode))); + if (is_attr) + goto slow; + + + mutex_enter(&zdp->z_acl_lock); + + if (zdp->z_pflags & ZFS_NO_EXECS_DENIED) { + mutex_exit(&zdp->z_acl_lock); + return (0); + } + + if (KUID_TO_SUID(ZTOI(zdp)->i_uid) != 0 || + KGID_TO_SGID(ZTOI(zdp)->i_gid) != 0) { + mutex_exit(&zdp->z_acl_lock); + goto slow; + } + + if (uid == KUID_TO_SUID(ZTOI(zdp)->i_uid)) { + owner = B_TRUE; + if (zdp->z_mode & S_IXUSR) { + mutex_exit(&zdp->z_acl_lock); + return (0); + } else { + mutex_exit(&zdp->z_acl_lock); + goto slow; + } + } + if (groupmember(KGID_TO_SGID(ZTOI(zdp)->i_gid), cr)) { + groupmbr = B_TRUE; + if (zdp->z_mode & S_IXGRP) { + mutex_exit(&zdp->z_acl_lock); + return (0); + } else { + mutex_exit(&zdp->z_acl_lock); + goto slow; + } + } + if (!owner && !groupmbr) { + if (zdp->z_mode & S_IXOTH) { + mutex_exit(&zdp->z_acl_lock); + return (0); + } + } + + mutex_exit(&zdp->z_acl_lock); + +slow: + DTRACE_PROBE(zfs__fastpath__execute__access__miss); + ZFS_ENTER(ZTOZSB(zdp)); + error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr); + ZFS_EXIT(ZTOZSB(zdp)); + return (error); +} + +/* + * Determine whether Access should be granted/denied. + * + * The least priv subsystem is always consulted as a basic privilege + * can define any form of access. 
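+ * For example, an ACE_WRITE_DATA request that the ACL denies may still
+ * be granted if secpolicy_vnode_access2() finds a privilege that
+ * satisfies the missing S_IWUSR bit.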
+ */ +int +zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr) +{ + uint32_t working_mode; + int error; + int is_attr; + boolean_t check_privs; + znode_t *xzp; + znode_t *check_zp = zp; + mode_t needed_bits; + uid_t owner; + + is_attr = ((zp->z_pflags & ZFS_XATTR) && S_ISDIR(ZTOI(zp)->i_mode)); + + /* + * If attribute then validate against base file + */ + if (is_attr) { + if ((error = zfs_zget(ZTOZSB(zp), + zp->z_xattr_parent, &xzp)) != 0) { + return (error); + } + + check_zp = xzp; + + /* + * fixup mode to map to xattr perms + */ + + if (mode & (ACE_WRITE_DATA|ACE_APPEND_DATA)) { + mode &= ~(ACE_WRITE_DATA|ACE_APPEND_DATA); + mode |= ACE_WRITE_NAMED_ATTRS; + } + + if (mode & (ACE_READ_DATA|ACE_EXECUTE)) { + mode &= ~(ACE_READ_DATA|ACE_EXECUTE); + mode |= ACE_READ_NAMED_ATTRS; + } + } + + owner = zfs_fuid_map_id(ZTOZSB(zp), KUID_TO_SUID(ZTOI(zp)->i_uid), + cr, ZFS_OWNER); + /* + * Map the bits required to the standard inode flags + * S_IRUSR|S_IWUSR|S_IXUSR in the needed_bits. Map the bits + * mapped by working_mode (currently missing) in missing_bits. + * Call secpolicy_vnode_access2() with (needed_bits & ~checkmode), + * needed_bits. + */ + needed_bits = 0; + + working_mode = mode; + if ((working_mode & (ACE_READ_ACL|ACE_READ_ATTRIBUTES)) && + owner == crgetuid(cr)) + working_mode &= ~(ACE_READ_ACL|ACE_READ_ATTRIBUTES); + + if (working_mode & (ACE_READ_DATA|ACE_READ_NAMED_ATTRS| + ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_SYNCHRONIZE)) + needed_bits |= S_IRUSR; + if (working_mode & (ACE_WRITE_DATA|ACE_WRITE_NAMED_ATTRS| + ACE_APPEND_DATA|ACE_WRITE_ATTRIBUTES|ACE_SYNCHRONIZE)) + needed_bits |= S_IWUSR; + if (working_mode & ACE_EXECUTE) + needed_bits |= S_IXUSR; + + if ((error = zfs_zaccess_common(check_zp, mode, &working_mode, + &check_privs, skipaclchk, cr)) == 0) { + if (is_attr) + iput(ZTOI(xzp)); + return (secpolicy_vnode_access2(cr, ZTOI(zp), owner, + needed_bits, needed_bits)); + } + + if (error && !check_privs) { + if (is_attr) + iput(ZTOI(xzp)); + return (error); + } + + if (error && (flags & V_APPEND)) { + error = zfs_zaccess_append(zp, &working_mode, &check_privs, cr); + } + + if (error && check_privs) { + mode_t checkmode = 0; + + /* + * First check for implicit owner permission on + * read_acl/read_attributes + */ + + error = 0; + ASSERT(working_mode != 0); + + if ((working_mode & (ACE_READ_ACL|ACE_READ_ATTRIBUTES) && + owner == crgetuid(cr))) + working_mode &= ~(ACE_READ_ACL|ACE_READ_ATTRIBUTES); + + if (working_mode & (ACE_READ_DATA|ACE_READ_NAMED_ATTRS| + ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_SYNCHRONIZE)) + checkmode |= S_IRUSR; + if (working_mode & (ACE_WRITE_DATA|ACE_WRITE_NAMED_ATTRS| + ACE_APPEND_DATA|ACE_WRITE_ATTRIBUTES|ACE_SYNCHRONIZE)) + checkmode |= S_IWUSR; + if (working_mode & ACE_EXECUTE) + checkmode |= S_IXUSR; + + error = secpolicy_vnode_access2(cr, ZTOI(check_zp), owner, + needed_bits & ~checkmode, needed_bits); + + if (error == 0 && (working_mode & ACE_WRITE_OWNER)) + error = secpolicy_vnode_chown(cr, owner); + if (error == 0 && (working_mode & ACE_WRITE_ACL)) + error = secpolicy_vnode_setdac(cr, owner); + + if (error == 0 && (working_mode & + (ACE_DELETE|ACE_DELETE_CHILD))) + error = secpolicy_vnode_remove(cr); + + if (error == 0 && (working_mode & ACE_SYNCHRONIZE)) { + error = secpolicy_vnode_chown(cr, owner); + } + if (error == 0) { + /* + * See if any bits other than those already checked + * for are still present. 
If so then return EACCES + */ + if (working_mode & ~(ZFS_CHECKED_MASKS)) { + error = SET_ERROR(EACCES); + } + } + } else if (error == 0) { + error = secpolicy_vnode_access2(cr, ZTOI(zp), owner, + needed_bits, needed_bits); + } + + if (is_attr) + iput(ZTOI(xzp)); + + return (error); +} + +/* + * Translate traditional unix S_IRUSR/S_IWUSR/S_IXUSR mode into + * native ACL format and call zfs_zaccess() + */ +int +zfs_zaccess_rwx(znode_t *zp, mode_t mode, int flags, cred_t *cr) +{ + return (zfs_zaccess(zp, zfs_unix_to_v4(mode >> 6), flags, B_FALSE, cr)); +} + +/* + * Access function for secpolicy_vnode_setattr + */ +int +zfs_zaccess_unix(znode_t *zp, mode_t mode, cred_t *cr) +{ + int v4_mode = zfs_unix_to_v4(mode >> 6); + + return (zfs_zaccess(zp, v4_mode, 0, B_FALSE, cr)); +} + +static int +zfs_delete_final_check(znode_t *zp, znode_t *dzp, + mode_t available_perms, cred_t *cr) +{ + int error; + uid_t downer; + + downer = zfs_fuid_map_id(ZTOZSB(dzp), KUID_TO_SUID(ZTOI(dzp)->i_uid), + cr, ZFS_OWNER); + + error = secpolicy_vnode_access2(cr, ZTOI(dzp), + downer, available_perms, S_IWUSR|S_IXUSR); + + if (error == 0) + error = zfs_sticky_remove_access(dzp, zp, cr); + + return (error); +} + +/* + * Determine whether Access should be granted/deny, without + * consulting least priv subsystem. + * + * The following chart is the recommended NFSv4 enforcement for + * ability to delete an object. + * + * ------------------------------------------------------- + * | Parent Dir | Target Object Permissions | + * | permissions | | + * ------------------------------------------------------- + * | | ACL Allows | ACL Denies| Delete | + * | | Delete | Delete | unspecified| + * ------------------------------------------------------- + * | ACL Allows | Permit | Permit | Permit | + * | DELETE_CHILD | | + * ------------------------------------------------------- + * | ACL Denies | Permit | Deny | Deny | + * | DELETE_CHILD | | | | + * ------------------------------------------------------- + * | ACL specifies | | | | + * | only allow | Permit | Permit | Permit | + * | write and | | | | + * | execute | | | | + * ------------------------------------------------------- + * | ACL denies | | | | + * | write and | Permit | Deny | Deny | + * | execute | | | | + * ------------------------------------------------------- + * ^ + * | + * No search privilege, can't even look up file? + * + */ +int +zfs_zaccess_delete(znode_t *dzp, znode_t *zp, cred_t *cr) +{ + uint32_t dzp_working_mode = 0; + uint32_t zp_working_mode = 0; + int dzp_error, zp_error; + mode_t available_perms; + boolean_t dzpcheck_privs = B_TRUE; + boolean_t zpcheck_privs = B_TRUE; + + /* + * We want specific DELETE permissions to + * take precedence over WRITE/EXECUTE. We don't + * want an ACL such as this to mess us up. + * user:joe:write_data:deny,user:joe:delete:allow + * + * However, deny permissions may ultimately be overridden + * by secpolicy_vnode_access(). + * + * We will ask for all of the necessary permissions and then + * look at the working modes from the directory and target object + * to determine what was found. + */ + + if (zp->z_pflags & (ZFS_IMMUTABLE | ZFS_NOUNLINK)) + return (SET_ERROR(EPERM)); + + /* + * First row + * If the directory permissions allow the delete, we are done. 
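+ * In terms of the table above: ACE_DELETE_CHILD on the directory is
+ * checked first, then ACE_DELETE on the target; either one granting
+ * access permits the delete outright.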
+ */ + if ((dzp_error = zfs_zaccess_common(dzp, ACE_DELETE_CHILD, + &dzp_working_mode, &dzpcheck_privs, B_FALSE, cr)) == 0) + return (0); + + /* + * If target object has delete permission then we are done + */ + if ((zp_error = zfs_zaccess_common(zp, ACE_DELETE, &zp_working_mode, + &zpcheck_privs, B_FALSE, cr)) == 0) + return (0); + + ASSERT(dzp_error && zp_error); + + if (!dzpcheck_privs) + return (dzp_error); + if (!zpcheck_privs) + return (zp_error); + + /* + * Second row + * + * If directory returns EACCES then delete_child was denied + * due to deny delete_child. In this case send the request through + * secpolicy_vnode_remove(). We don't use zfs_delete_final_check() + * since that *could* allow the delete based on write/execute permission + * and we want delete permissions to override write/execute. + */ + + if (dzp_error == EACCES) + return (secpolicy_vnode_remove(cr)); + + /* + * Third Row + * only need to see if we have write/execute on directory. + */ + + dzp_error = zfs_zaccess_common(dzp, ACE_EXECUTE|ACE_WRITE_DATA, + &dzp_working_mode, &dzpcheck_privs, B_FALSE, cr); + + if (dzp_error != 0 && !dzpcheck_privs) + return (dzp_error); + + /* + * Fourth row + */ + + available_perms = (dzp_working_mode & ACE_WRITE_DATA) ? 0 : S_IWUSR; + available_perms |= (dzp_working_mode & ACE_EXECUTE) ? 0 : S_IXUSR; + + return (zfs_delete_final_check(zp, dzp, available_perms, cr)); + +} + +int +zfs_zaccess_rename(znode_t *sdzp, znode_t *szp, znode_t *tdzp, + znode_t *tzp, cred_t *cr) +{ + int add_perm; + int error; + + if (szp->z_pflags & ZFS_AV_QUARANTINED) + return (SET_ERROR(EACCES)); + + add_perm = S_ISDIR(ZTOI(szp)->i_mode) ? + ACE_ADD_SUBDIRECTORY : ACE_ADD_FILE; + + /* + * Rename permissions are combination of delete permission + + * add file/subdir permission. + */ + + /* + * first make sure we do the delete portion. + * + * If that succeeds then check for add_file/add_subdir permissions + */ + + if ((error = zfs_zaccess_delete(sdzp, szp, cr))) + return (error); + + /* + * If we have a tzp, see if we can delete it? + */ + if (tzp) { + if ((error = zfs_zaccess_delete(tdzp, tzp, cr))) + return (error); + } + + /* + * Now check for add permissions + */ + error = zfs_zaccess(tdzp, add_perm, 0, B_FALSE, cr); + + return (error); +} diff --git a/module/os/linux/zfs/zfs_ctldir.c b/module/os/linux/zfs/zfs_ctldir.c new file mode 100644 index 000000000..1e61ef06d --- /dev/null +++ b/module/os/linux/zfs/zfs_ctldir.c @@ -0,0 +1,1240 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (C) 2011 Lawrence Livermore National Security, LLC. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). 
+ * LLNL-CODE-403049. + * Rewritten for Linux by: + * Rohan Puri <[email protected]> + * Brian Behlendorf <[email protected]> + * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright 2015, OmniTI Computer Consulting, Inc. All rights reserved. + * Copyright (c) 2018 George Melikov. All Rights Reserved. + * Copyright (c) 2019 Datto, Inc. All rights reserved. + */ + +/* + * ZFS control directory (a.k.a. ".zfs") + * + * This directory provides a common location for all ZFS meta-objects. + * Currently, this is only the 'snapshot' and 'shares' directory, but this may + * expand in the future. The elements are built dynamically, as the hierarchy + * does not actually exist on disk. + * + * For 'snapshot', we don't want to have all snapshots always mounted, because + * this would take up a huge amount of space in /etc/mnttab. We have three + * types of objects: + * + * ctldir ------> snapshotdir -------> snapshot + * | + * | + * V + * mounted fs + * + * The 'snapshot' node contains just enough information to lookup '..' and act + * as a mountpoint for the snapshot. Whenever we lookup a specific snapshot, we + * perform an automount of the underlying filesystem and return the + * corresponding inode. + * + * All mounts are handled automatically by an user mode helper which invokes + * the mount procedure. Unmounts are handled by allowing the mount + * point to expire so the kernel may automatically unmount it. + * + * The '.zfs', '.zfs/snapshot', and all directories created under + * '.zfs/snapshot' (ie: '.zfs/snapshot/<snapname>') all share the same + * zfsvfs_t as the head filesystem (what '.zfs' lives under). + * + * File systems mounted on top of the '.zfs/snapshot/<snapname>' paths + * (ie: snapshots) are complete ZFS filesystems and have their own unique + * zfsvfs_t. However, the fsid reported by these mounts will be the same + * as that used by the parent zfsvfs_t to make NFS happy. + */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/time.h> +#include <sys/sysmacros.h> +#include <sys/pathname.h> +#include <sys/vfs.h> +#include <sys/zfs_ctldir.h> +#include <sys/zfs_ioctl.h> +#include <sys/zfs_vfsops.h> +#include <sys/zfs_vnops.h> +#include <sys/stat.h> +#include <sys/dmu.h> +#include <sys/dmu_objset.h> +#include <sys/dsl_destroy.h> +#include <sys/dsl_deleg.h> +#include <sys/zpl.h> +#include <sys/mntent.h> +#include "zfs_namecheck.h" + +/* + * Two AVL trees are maintained which contain all currently automounted + * snapshots. Every automounted snapshots maps to a single zfs_snapentry_t + * entry which MUST: + * + * - be attached to both trees, and + * - be unique, no duplicate entries are allowed. + * + * The zfs_snapshots_by_name tree is indexed by the full dataset name + * while the zfs_snapshots_by_objsetid tree is indexed by the unique + * objsetid. This allows for fast lookups either by name or objsetid. 
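+ *
+ * A typical consumer (illustrative only, using a placeholder snapshot
+ * name) takes the lock, performs the lookup, and drops the hold that
+ * the lookup returns:
+ *
+ *	zfs_snapentry_t *se;
+ *
+ *	rw_enter(&zfs_snapshot_lock, RW_READER);
+ *	if ((se = zfsctl_snapshot_find_by_name("pool/fs@snap")) != NULL) {
+ *		dprintf("mounted at %s\n", se->se_path);
+ *		zfsctl_snapshot_rele(se);
+ *	}
+ *	rw_exit(&zfs_snapshot_lock);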
+ */ +static avl_tree_t zfs_snapshots_by_name; +static avl_tree_t zfs_snapshots_by_objsetid; +static krwlock_t zfs_snapshot_lock; + +/* + * Control Directory Tunables (.zfs) + */ +int zfs_expire_snapshot = ZFSCTL_EXPIRE_SNAPSHOT; +int zfs_admin_snapshot = 0; + +typedef struct { + char *se_name; /* full snapshot name */ + char *se_path; /* full mount path */ + spa_t *se_spa; /* pool spa */ + uint64_t se_objsetid; /* snapshot objset id */ + struct dentry *se_root_dentry; /* snapshot root dentry */ + taskqid_t se_taskqid; /* scheduled unmount taskqid */ + avl_node_t se_node_name; /* zfs_snapshots_by_name link */ + avl_node_t se_node_objsetid; /* zfs_snapshots_by_objsetid link */ + zfs_refcount_t se_refcount; /* reference count */ +} zfs_snapentry_t; + +static void zfsctl_snapshot_unmount_delay_impl(zfs_snapentry_t *se, int delay); + +/* + * Allocate a new zfs_snapentry_t being careful to make a copy of the + * the snapshot name and provided mount point. No reference is taken. + */ +static zfs_snapentry_t * +zfsctl_snapshot_alloc(char *full_name, char *full_path, spa_t *spa, + uint64_t objsetid, struct dentry *root_dentry) +{ + zfs_snapentry_t *se; + + se = kmem_zalloc(sizeof (zfs_snapentry_t), KM_SLEEP); + + se->se_name = strdup(full_name); + se->se_path = strdup(full_path); + se->se_spa = spa; + se->se_objsetid = objsetid; + se->se_root_dentry = root_dentry; + se->se_taskqid = TASKQID_INVALID; + + zfs_refcount_create(&se->se_refcount); + + return (se); +} + +/* + * Free a zfs_snapentry_t the caller must ensure there are no active + * references. + */ +static void +zfsctl_snapshot_free(zfs_snapentry_t *se) +{ + zfs_refcount_destroy(&se->se_refcount); + strfree(se->se_name); + strfree(se->se_path); + + kmem_free(se, sizeof (zfs_snapentry_t)); +} + +/* + * Hold a reference on the zfs_snapentry_t. + */ +static void +zfsctl_snapshot_hold(zfs_snapentry_t *se) +{ + zfs_refcount_add(&se->se_refcount, NULL); +} + +/* + * Release a reference on the zfs_snapentry_t. When the number of + * references drops to zero the structure will be freed. + */ +static void +zfsctl_snapshot_rele(zfs_snapentry_t *se) +{ + if (zfs_refcount_remove(&se->se_refcount, NULL) == 0) + zfsctl_snapshot_free(se); +} + +/* + * Add a zfs_snapentry_t to both the zfs_snapshots_by_name and + * zfs_snapshots_by_objsetid trees. While the zfs_snapentry_t is part + * of the trees a reference is held. + */ +static void +zfsctl_snapshot_add(zfs_snapentry_t *se) +{ + ASSERT(RW_WRITE_HELD(&zfs_snapshot_lock)); + zfsctl_snapshot_hold(se); + avl_add(&zfs_snapshots_by_name, se); + avl_add(&zfs_snapshots_by_objsetid, se); +} + +/* + * Remove a zfs_snapentry_t from both the zfs_snapshots_by_name and + * zfs_snapshots_by_objsetid trees. Upon removal a reference is dropped, + * this can result in the structure being freed if that was the last + * remaining reference. + */ +static void +zfsctl_snapshot_remove(zfs_snapentry_t *se) +{ + ASSERT(RW_WRITE_HELD(&zfs_snapshot_lock)); + avl_remove(&zfs_snapshots_by_name, se); + avl_remove(&zfs_snapshots_by_objsetid, se); + zfsctl_snapshot_rele(se); +} + +/* + * Snapshot name comparison function for the zfs_snapshots_by_name. + */ +static int +snapentry_compare_by_name(const void *a, const void *b) +{ + const zfs_snapentry_t *se_a = a; + const zfs_snapentry_t *se_b = b; + int ret; + + ret = strcmp(se_a->se_name, se_b->se_name); + + if (ret < 0) + return (-1); + else if (ret > 0) + return (1); + else + return (0); +} + +/* + * Snapshot name comparison function for the zfs_snapshots_by_objsetid. 
+ */ +static int +snapentry_compare_by_objsetid(const void *a, const void *b) +{ + const zfs_snapentry_t *se_a = a; + const zfs_snapentry_t *se_b = b; + + if (se_a->se_spa != se_b->se_spa) + return ((ulong_t)se_a->se_spa < (ulong_t)se_b->se_spa ? -1 : 1); + + if (se_a->se_objsetid < se_b->se_objsetid) + return (-1); + else if (se_a->se_objsetid > se_b->se_objsetid) + return (1); + else + return (0); +} + +/* + * Find a zfs_snapentry_t in zfs_snapshots_by_name. If the snapname + * is found a pointer to the zfs_snapentry_t is returned and a reference + * taken on the structure. The caller is responsible for dropping the + * reference with zfsctl_snapshot_rele(). If the snapname is not found + * NULL will be returned. + */ +static zfs_snapentry_t * +zfsctl_snapshot_find_by_name(char *snapname) +{ + zfs_snapentry_t *se, search; + + ASSERT(RW_LOCK_HELD(&zfs_snapshot_lock)); + + search.se_name = snapname; + se = avl_find(&zfs_snapshots_by_name, &search, NULL); + if (se) + zfsctl_snapshot_hold(se); + + return (se); +} + +/* + * Find a zfs_snapentry_t in zfs_snapshots_by_objsetid given the objset id + * rather than the snapname. In all other respects it behaves the same + * as zfsctl_snapshot_find_by_name(). + */ +static zfs_snapentry_t * +zfsctl_snapshot_find_by_objsetid(spa_t *spa, uint64_t objsetid) +{ + zfs_snapentry_t *se, search; + + ASSERT(RW_LOCK_HELD(&zfs_snapshot_lock)); + + search.se_spa = spa; + search.se_objsetid = objsetid; + se = avl_find(&zfs_snapshots_by_objsetid, &search, NULL); + if (se) + zfsctl_snapshot_hold(se); + + return (se); +} + +/* + * Rename a zfs_snapentry_t in the zfs_snapshots_by_name. The structure is + * removed, renamed, and added back to the new correct location in the tree. + */ +static int +zfsctl_snapshot_rename(char *old_snapname, char *new_snapname) +{ + zfs_snapentry_t *se; + + ASSERT(RW_WRITE_HELD(&zfs_snapshot_lock)); + + se = zfsctl_snapshot_find_by_name(old_snapname); + if (se == NULL) + return (SET_ERROR(ENOENT)); + + zfsctl_snapshot_remove(se); + strfree(se->se_name); + se->se_name = strdup(new_snapname); + zfsctl_snapshot_add(se); + zfsctl_snapshot_rele(se); + + return (0); +} + +/* + * Delayed task responsible for unmounting an expired automounted snapshot. + */ +static void +snapentry_expire(void *data) +{ + zfs_snapentry_t *se = (zfs_snapentry_t *)data; + spa_t *spa = se->se_spa; + uint64_t objsetid = se->se_objsetid; + + if (zfs_expire_snapshot <= 0) { + zfsctl_snapshot_rele(se); + return; + } + + se->se_taskqid = TASKQID_INVALID; + (void) zfsctl_snapshot_unmount(se->se_name, MNT_EXPIRE); + zfsctl_snapshot_rele(se); + + /* + * Reschedule the unmount if the zfs_snapentry_t wasn't removed. + * This can occur when the snapshot is busy. + */ + rw_enter(&zfs_snapshot_lock, RW_READER); + if ((se = zfsctl_snapshot_find_by_objsetid(spa, objsetid)) != NULL) { + zfsctl_snapshot_unmount_delay_impl(se, zfs_expire_snapshot); + zfsctl_snapshot_rele(se); + } + rw_exit(&zfs_snapshot_lock); +} + +/* + * Cancel an automatic unmount of a snapname. This callback is responsible + * for dropping the reference on the zfs_snapentry_t which was taken when + * during dispatch. + */ +static void +zfsctl_snapshot_unmount_cancel(zfs_snapentry_t *se) +{ + if (taskq_cancel_id(system_delay_taskq, se->se_taskqid) == 0) { + se->se_taskqid = TASKQID_INVALID; + zfsctl_snapshot_rele(se); + } +} + +/* + * Dispatch the unmount task for delayed handling with a hold protecting it. 
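+ * The delay is expressed in seconds and converted to clock ticks when
+ * the task is queued on system_delay_taskq; a delay of zero or less
+ * skips scheduling the unmount entirely.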
+ */ +static void +zfsctl_snapshot_unmount_delay_impl(zfs_snapentry_t *se, int delay) +{ + ASSERT3S(se->se_taskqid, ==, TASKQID_INVALID); + + if (delay <= 0) + return; + + zfsctl_snapshot_hold(se); + se->se_taskqid = taskq_dispatch_delay(system_delay_taskq, + snapentry_expire, se, TQ_SLEEP, ddi_get_lbolt() + delay * HZ); +} + +/* + * Schedule an automatic unmount of objset id to occur in delay seconds from + * now. Any previous delayed unmount will be cancelled in favor of the + * updated deadline. A reference is taken by zfsctl_snapshot_find_by_name() + * and held until the outstanding task is handled or cancelled. + */ +int +zfsctl_snapshot_unmount_delay(spa_t *spa, uint64_t objsetid, int delay) +{ + zfs_snapentry_t *se; + int error = ENOENT; + + rw_enter(&zfs_snapshot_lock, RW_READER); + if ((se = zfsctl_snapshot_find_by_objsetid(spa, objsetid)) != NULL) { + zfsctl_snapshot_unmount_cancel(se); + zfsctl_snapshot_unmount_delay_impl(se, delay); + zfsctl_snapshot_rele(se); + error = 0; + } + rw_exit(&zfs_snapshot_lock); + + return (error); +} + +/* + * Check if snapname is currently mounted. Returned non-zero when mounted + * and zero when unmounted. + */ +static boolean_t +zfsctl_snapshot_ismounted(char *snapname) +{ + zfs_snapentry_t *se; + boolean_t ismounted = B_FALSE; + + rw_enter(&zfs_snapshot_lock, RW_READER); + if ((se = zfsctl_snapshot_find_by_name(snapname)) != NULL) { + zfsctl_snapshot_rele(se); + ismounted = B_TRUE; + } + rw_exit(&zfs_snapshot_lock); + + return (ismounted); +} + +/* + * Check if the given inode is a part of the virtual .zfs directory. + */ +boolean_t +zfsctl_is_node(struct inode *ip) +{ + return (ITOZ(ip)->z_is_ctldir); +} + +/* + * Check if the given inode is a .zfs/snapshots/snapname directory. + */ +boolean_t +zfsctl_is_snapdir(struct inode *ip) +{ + return (zfsctl_is_node(ip) && (ip->i_ino <= ZFSCTL_INO_SNAPDIRS)); +} + +/* + * Allocate a new inode with the passed id and ops. + */ +static struct inode * +zfsctl_inode_alloc(zfsvfs_t *zfsvfs, uint64_t id, + const struct file_operations *fops, const struct inode_operations *ops) +{ + inode_timespec_t now; + struct inode *ip; + znode_t *zp; + + ip = new_inode(zfsvfs->z_sb); + if (ip == NULL) + return (NULL); + + now = current_time(ip); + zp = ITOZ(ip); + ASSERT3P(zp->z_dirlocks, ==, NULL); + ASSERT3P(zp->z_acl_cached, ==, NULL); + ASSERT3P(zp->z_xattr_cached, ==, NULL); + zp->z_id = id; + zp->z_unlinked = B_FALSE; + zp->z_atime_dirty = B_FALSE; + zp->z_zn_prefetch = B_FALSE; + zp->z_moved = B_FALSE; + zp->z_is_sa = B_FALSE; + zp->z_is_mapped = B_FALSE; + zp->z_is_ctldir = B_TRUE; + zp->z_is_stale = B_FALSE; + zp->z_sa_hdl = NULL; + zp->z_blksz = 0; + zp->z_seq = 0; + zp->z_mapcnt = 0; + zp->z_size = 0; + zp->z_pflags = 0; + zp->z_mode = 0; + zp->z_sync_cnt = 0; + ip->i_generation = 0; + ip->i_ino = id; + ip->i_mode = (S_IFDIR | S_IRWXUGO); + ip->i_uid = SUID_TO_KUID(0); + ip->i_gid = SGID_TO_KGID(0); + ip->i_blkbits = SPA_MINBLOCKSHIFT; + ip->i_atime = now; + ip->i_mtime = now; + ip->i_ctime = now; + ip->i_fop = fops; + ip->i_op = ops; +#if defined(IOP_XATTR) + ip->i_opflags &= ~IOP_XATTR; +#endif + + if (insert_inode_locked(ip)) { + unlock_new_inode(ip); + iput(ip); + return (NULL); + } + + mutex_enter(&zfsvfs->z_znodes_lock); + list_insert_tail(&zfsvfs->z_all_znodes, zp); + zfsvfs->z_nr_znodes++; + membar_producer(); + mutex_exit(&zfsvfs->z_znodes_lock); + + unlock_new_inode(ip); + + return (ip); +} + +/* + * Lookup the inode with given id, it will be allocated if needed. 
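+ * The lookup and allocation are retried in a loop because a concurrent
+ * zfsctl_inode_alloc() for the same id can cause either step to fail
+ * transiently.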
+ */ +static struct inode * +zfsctl_inode_lookup(zfsvfs_t *zfsvfs, uint64_t id, + const struct file_operations *fops, const struct inode_operations *ops) +{ + struct inode *ip = NULL; + + while (ip == NULL) { + ip = ilookup(zfsvfs->z_sb, (unsigned long)id); + if (ip) + break; + + /* May fail due to concurrent zfsctl_inode_alloc() */ + ip = zfsctl_inode_alloc(zfsvfs, id, fops, ops); + } + + return (ip); +} + +/* + * Create the '.zfs' directory. This directory is cached as part of the VFS + * structure. This results in a hold on the zfsvfs_t. The code in zfs_umount() + * therefore checks against a vfs_count of 2 instead of 1. This reference + * is removed when the ctldir is destroyed in the unmount. All other entities + * under the '.zfs' directory are created dynamically as needed. + * + * Because the dynamically created '.zfs' directory entries assume the use + * of 64-bit inode numbers this support must be disabled on 32-bit systems. + */ +int +zfsctl_create(zfsvfs_t *zfsvfs) +{ + ASSERT(zfsvfs->z_ctldir == NULL); + + zfsvfs->z_ctldir = zfsctl_inode_alloc(zfsvfs, ZFSCTL_INO_ROOT, + &zpl_fops_root, &zpl_ops_root); + if (zfsvfs->z_ctldir == NULL) + return (SET_ERROR(ENOENT)); + + return (0); +} + +/* + * Destroy the '.zfs' directory or remove a snapshot from zfs_snapshots_by_name. + * Only called when the filesystem is unmounted. + */ +void +zfsctl_destroy(zfsvfs_t *zfsvfs) +{ + if (zfsvfs->z_issnap) { + zfs_snapentry_t *se; + spa_t *spa = zfsvfs->z_os->os_spa; + uint64_t objsetid = dmu_objset_id(zfsvfs->z_os); + + rw_enter(&zfs_snapshot_lock, RW_WRITER); + se = zfsctl_snapshot_find_by_objsetid(spa, objsetid); + if (se != NULL) + zfsctl_snapshot_remove(se); + rw_exit(&zfs_snapshot_lock); + if (se != NULL) { + zfsctl_snapshot_unmount_cancel(se); + zfsctl_snapshot_rele(se); + } + } else if (zfsvfs->z_ctldir) { + iput(zfsvfs->z_ctldir); + zfsvfs->z_ctldir = NULL; + } +} + +/* + * Given a root znode, retrieve the associated .zfs directory. + * Add a hold to the vnode and return it. + */ +struct inode * +zfsctl_root(znode_t *zp) +{ + ASSERT(zfs_has_ctldir(zp)); + igrab(ZTOZSB(zp)->z_ctldir); + return (ZTOZSB(zp)->z_ctldir); +} + +/* + * Generate a long fid to indicate a snapdir. We encode whether snapdir is + * already mounted in gen field. We do this because nfsd lookup will not + * trigger automount. Next time the nfsd does fh_to_dentry, we will notice + * this and do automount and return ESTALE to force nfsd revalidate and follow + * mount. + */ +static int +zfsctl_snapdir_fid(struct inode *ip, fid_t *fidp) +{ + zfid_short_t *zfid = (zfid_short_t *)fidp; + zfid_long_t *zlfid = (zfid_long_t *)fidp; + uint32_t gen = 0; + uint64_t object; + uint64_t objsetid; + int i; + struct dentry *dentry; + + if (fidp->fid_len < LONG_FID_LEN) { + fidp->fid_len = LONG_FID_LEN; + return (SET_ERROR(ENOSPC)); + } + + object = ip->i_ino; + objsetid = ZFSCTL_INO_SNAPDIRS - ip->i_ino; + zfid->zf_len = LONG_FID_LEN; + + dentry = d_obtain_alias(igrab(ip)); + if (!IS_ERR(dentry)) { + gen = !!d_mountpoint(dentry); + dput(dentry); + } + + for (i = 0; i < sizeof (zfid->zf_object); i++) + zfid->zf_object[i] = (uint8_t)(object >> (8 * i)); + + for (i = 0; i < sizeof (zfid->zf_gen); i++) + zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i)); + + for (i = 0; i < sizeof (zlfid->zf_setid); i++) + zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i)); + + for (i = 0; i < sizeof (zlfid->zf_setgen); i++) + zlfid->zf_setgen[i] = 0; + + return (0); +} + +/* + * Generate an appropriate fid for an entry in the .zfs directory. 
+ */ +int +zfsctl_fid(struct inode *ip, fid_t *fidp) +{ + znode_t *zp = ITOZ(ip); + zfsvfs_t *zfsvfs = ITOZSB(ip); + uint64_t object = zp->z_id; + zfid_short_t *zfid; + int i; + + ZFS_ENTER(zfsvfs); + + if (zfsctl_is_snapdir(ip)) { + ZFS_EXIT(zfsvfs); + return (zfsctl_snapdir_fid(ip, fidp)); + } + + if (fidp->fid_len < SHORT_FID_LEN) { + fidp->fid_len = SHORT_FID_LEN; + ZFS_EXIT(zfsvfs); + return (SET_ERROR(ENOSPC)); + } + + zfid = (zfid_short_t *)fidp; + + zfid->zf_len = SHORT_FID_LEN; + + for (i = 0; i < sizeof (zfid->zf_object); i++) + zfid->zf_object[i] = (uint8_t)(object >> (8 * i)); + + /* .zfs znodes always have a generation number of 0 */ + for (i = 0; i < sizeof (zfid->zf_gen); i++) + zfid->zf_gen[i] = 0; + + ZFS_EXIT(zfsvfs); + return (0); +} + +/* + * Construct a full dataset name in full_name: "pool/dataset@snap_name" + */ +static int +zfsctl_snapshot_name(zfsvfs_t *zfsvfs, const char *snap_name, int len, + char *full_name) +{ + objset_t *os = zfsvfs->z_os; + + if (zfs_component_namecheck(snap_name, NULL, NULL) != 0) + return (SET_ERROR(EILSEQ)); + + dmu_objset_name(os, full_name); + if ((strlen(full_name) + 1 + strlen(snap_name)) >= len) + return (SET_ERROR(ENAMETOOLONG)); + + (void) strcat(full_name, "@"); + (void) strcat(full_name, snap_name); + + return (0); +} + +/* + * Returns full path in full_path: "/pool/dataset/.zfs/snapshot/snap_name/" + */ +static int +zfsctl_snapshot_path_objset(zfsvfs_t *zfsvfs, uint64_t objsetid, + int path_len, char *full_path) +{ + objset_t *os = zfsvfs->z_os; + fstrans_cookie_t cookie; + char *snapname; + boolean_t case_conflict; + uint64_t id, pos = 0; + int error = 0; + + if (zfsvfs->z_vfs->vfs_mntpoint == NULL) + return (SET_ERROR(ENOENT)); + + cookie = spl_fstrans_mark(); + snapname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); + + while (error == 0) { + dsl_pool_config_enter(dmu_objset_pool(os), FTAG); + error = dmu_snapshot_list_next(zfsvfs->z_os, + ZFS_MAX_DATASET_NAME_LEN, snapname, &id, &pos, + &case_conflict); + dsl_pool_config_exit(dmu_objset_pool(os), FTAG); + if (error) + goto out; + + if (id == objsetid) + break; + } + + snprintf(full_path, path_len, "%s/.zfs/snapshot/%s", + zfsvfs->z_vfs->vfs_mntpoint, snapname); +out: + kmem_free(snapname, ZFS_MAX_DATASET_NAME_LEN); + spl_fstrans_unmark(cookie); + + return (error); +} + +/* + * Special case the handling of "..". + */ +int +zfsctl_root_lookup(struct inode *dip, char *name, struct inode **ipp, + int flags, cred_t *cr, int *direntflags, pathname_t *realpnp) +{ + zfsvfs_t *zfsvfs = ITOZSB(dip); + int error = 0; + + ZFS_ENTER(zfsvfs); + + if (strcmp(name, "..") == 0) { + *ipp = dip->i_sb->s_root->d_inode; + } else if (strcmp(name, ZFS_SNAPDIR_NAME) == 0) { + *ipp = zfsctl_inode_lookup(zfsvfs, ZFSCTL_INO_SNAPDIR, + &zpl_fops_snapdir, &zpl_ops_snapdir); + } else if (strcmp(name, ZFS_SHAREDIR_NAME) == 0) { + *ipp = zfsctl_inode_lookup(zfsvfs, ZFSCTL_INO_SHARES, + &zpl_fops_shares, &zpl_ops_shares); + } else { + *ipp = NULL; + } + + if (*ipp == NULL) + error = SET_ERROR(ENOENT); + + ZFS_EXIT(zfsvfs); + + return (error); +} + +/* + * Lookup entry point for the 'snapshot' directory. Try to open the + * snapshot if it exist, creating the pseudo filesystem inode as necessary. 
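+ * The pseudo inode number is derived from the snapshot's objset id as
+ * ZFSCTL_INO_SNAPDIRS - id, which lets zfsctl_snapdir_fid() recover the
+ * objset id later by subtracting the inode number again.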
+ */ +int +zfsctl_snapdir_lookup(struct inode *dip, char *name, struct inode **ipp, + int flags, cred_t *cr, int *direntflags, pathname_t *realpnp) +{ + zfsvfs_t *zfsvfs = ITOZSB(dip); + uint64_t id; + int error; + + ZFS_ENTER(zfsvfs); + + error = dmu_snapshot_lookup(zfsvfs->z_os, name, &id); + if (error) { + ZFS_EXIT(zfsvfs); + return (error); + } + + *ipp = zfsctl_inode_lookup(zfsvfs, ZFSCTL_INO_SNAPDIRS - id, + &simple_dir_operations, &simple_dir_inode_operations); + if (*ipp == NULL) + error = SET_ERROR(ENOENT); + + ZFS_EXIT(zfsvfs); + + return (error); +} + +/* + * Renaming a directory under '.zfs/snapshot' will automatically trigger + * a rename of the snapshot to the new given name. The rename is confined + * to the '.zfs/snapshot' directory snapshots cannot be moved elsewhere. + */ +int +zfsctl_snapdir_rename(struct inode *sdip, char *snm, + struct inode *tdip, char *tnm, cred_t *cr, int flags) +{ + zfsvfs_t *zfsvfs = ITOZSB(sdip); + char *to, *from, *real, *fsname; + int error; + + if (!zfs_admin_snapshot) + return (SET_ERROR(EACCES)); + + ZFS_ENTER(zfsvfs); + + to = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); + from = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); + real = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); + fsname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); + + if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) { + error = dmu_snapshot_realname(zfsvfs->z_os, snm, real, + ZFS_MAX_DATASET_NAME_LEN, NULL); + if (error == 0) { + snm = real; + } else if (error != ENOTSUP) { + goto out; + } + } + + dmu_objset_name(zfsvfs->z_os, fsname); + + error = zfsctl_snapshot_name(ITOZSB(sdip), snm, + ZFS_MAX_DATASET_NAME_LEN, from); + if (error == 0) + error = zfsctl_snapshot_name(ITOZSB(tdip), tnm, + ZFS_MAX_DATASET_NAME_LEN, to); + if (error == 0) + error = zfs_secpolicy_rename_perms(from, to, cr); + if (error != 0) + goto out; + + /* + * Cannot move snapshots out of the snapdir. + */ + if (sdip != tdip) { + error = SET_ERROR(EINVAL); + goto out; + } + + /* + * No-op when names are identical. + */ + if (strcmp(snm, tnm) == 0) { + error = 0; + goto out; + } + + rw_enter(&zfs_snapshot_lock, RW_WRITER); + + error = dsl_dataset_rename_snapshot(fsname, snm, tnm, B_FALSE); + if (error == 0) + (void) zfsctl_snapshot_rename(snm, tnm); + + rw_exit(&zfs_snapshot_lock); +out: + kmem_free(from, ZFS_MAX_DATASET_NAME_LEN); + kmem_free(to, ZFS_MAX_DATASET_NAME_LEN); + kmem_free(real, ZFS_MAX_DATASET_NAME_LEN); + kmem_free(fsname, ZFS_MAX_DATASET_NAME_LEN); + + ZFS_EXIT(zfsvfs); + + return (error); +} + +/* + * Removing a directory under '.zfs/snapshot' will automatically trigger + * the removal of the snapshot with the given name. 
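+ * For example, with the zfs_admin_snapshot tunable enabled, an
+ * "rmdir .zfs/snapshot/<snapname>" issued by a user with destroy
+ * permission unmounts and then destroys that snapshot.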
+ */ +int +zfsctl_snapdir_remove(struct inode *dip, char *name, cred_t *cr, int flags) +{ + zfsvfs_t *zfsvfs = ITOZSB(dip); + char *snapname, *real; + int error; + + if (!zfs_admin_snapshot) + return (SET_ERROR(EACCES)); + + ZFS_ENTER(zfsvfs); + + snapname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); + real = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); + + if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) { + error = dmu_snapshot_realname(zfsvfs->z_os, name, real, + ZFS_MAX_DATASET_NAME_LEN, NULL); + if (error == 0) { + name = real; + } else if (error != ENOTSUP) { + goto out; + } + } + + error = zfsctl_snapshot_name(ITOZSB(dip), name, + ZFS_MAX_DATASET_NAME_LEN, snapname); + if (error == 0) + error = zfs_secpolicy_destroy_perms(snapname, cr); + if (error != 0) + goto out; + + error = zfsctl_snapshot_unmount(snapname, MNT_FORCE); + if ((error == 0) || (error == ENOENT)) + error = dsl_destroy_snapshot(snapname, B_FALSE); +out: + kmem_free(snapname, ZFS_MAX_DATASET_NAME_LEN); + kmem_free(real, ZFS_MAX_DATASET_NAME_LEN); + + ZFS_EXIT(zfsvfs); + + return (error); +} + +/* + * Creating a directory under '.zfs/snapshot' will automatically trigger + * the creation of a new snapshot with the given name. + */ +int +zfsctl_snapdir_mkdir(struct inode *dip, char *dirname, vattr_t *vap, + struct inode **ipp, cred_t *cr, int flags) +{ + zfsvfs_t *zfsvfs = ITOZSB(dip); + char *dsname; + int error; + + if (!zfs_admin_snapshot) + return (SET_ERROR(EACCES)); + + dsname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); + + if (zfs_component_namecheck(dirname, NULL, NULL) != 0) { + error = SET_ERROR(EILSEQ); + goto out; + } + + dmu_objset_name(zfsvfs->z_os, dsname); + + error = zfs_secpolicy_snapshot_perms(dsname, cr); + if (error != 0) + goto out; + + if (error == 0) { + error = dmu_objset_snapshot_one(dsname, dirname); + if (error != 0) + goto out; + + error = zfsctl_snapdir_lookup(dip, dirname, ipp, + 0, cr, NULL, NULL); + } +out: + kmem_free(dsname, ZFS_MAX_DATASET_NAME_LEN); + + return (error); +} + +/* + * Attempt to unmount a snapshot by making a call to user space. + * There is no assurance that this can or will succeed, is just a + * best effort. In the case where it does fail, perhaps because + * it's in use, the unmount will fail harmlessly. + */ +int +zfsctl_snapshot_unmount(char *snapname, int flags) +{ + char *argv[] = { "/usr/bin/env", "umount", "-t", "zfs", "-n", NULL, + NULL }; + char *envp[] = { NULL }; + zfs_snapentry_t *se; + int error; + + rw_enter(&zfs_snapshot_lock, RW_READER); + if ((se = zfsctl_snapshot_find_by_name(snapname)) == NULL) { + rw_exit(&zfs_snapshot_lock); + return (SET_ERROR(ENOENT)); + } + rw_exit(&zfs_snapshot_lock); + + if (flags & MNT_FORCE) + argv[4] = "-fn"; + argv[5] = se->se_path; + dprintf("unmount; path=%s\n", se->se_path); + error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC); + zfsctl_snapshot_rele(se); + + + /* + * The umount system utility will return 256 on error. We must + * assume this error is because the file system is busy so it is + * converted to the more sensible EBUSY. 
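+ *
+ * In other words, call_usermodehelper() packs the helper's status as
+ * (exitcode << 8) + signal, so umount(8) exiting with status 1 is seen
+ * here as 256 and mapped to EBUSY (explanatory note only, no behavior
+ * change).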
+ */ + if (error) + error = SET_ERROR(EBUSY); + + return (error); +} + +int +zfsctl_snapshot_mount(struct path *path, int flags) +{ + struct dentry *dentry = path->dentry; + struct inode *ip = dentry->d_inode; + zfsvfs_t *zfsvfs; + zfsvfs_t *snap_zfsvfs; + zfs_snapentry_t *se; + char *full_name, *full_path; + char *argv[] = { "/usr/bin/env", "mount", "-t", "zfs", "-n", NULL, NULL, + NULL }; + char *envp[] = { NULL }; + int error; + struct path spath; + + if (ip == NULL) + return (SET_ERROR(EISDIR)); + + zfsvfs = ITOZSB(ip); + ZFS_ENTER(zfsvfs); + + full_name = kmem_zalloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); + full_path = kmem_zalloc(MAXPATHLEN, KM_SLEEP); + + error = zfsctl_snapshot_name(zfsvfs, dname(dentry), + ZFS_MAX_DATASET_NAME_LEN, full_name); + if (error) + goto error; + + /* + * Construct a mount point path from sb of the ctldir inode and dirent + * name, instead of from d_path(), so that chroot'd process doesn't fail + * on mount.zfs(8). + */ + snprintf(full_path, MAXPATHLEN, "%s/.zfs/snapshot/%s", + zfsvfs->z_vfs->vfs_mntpoint, dname(dentry)); + + /* + * Multiple concurrent automounts of a snapshot are never allowed. + * The snapshot may be manually mounted as many times as desired. + */ + if (zfsctl_snapshot_ismounted(full_name)) { + error = 0; + goto error; + } + + /* + * Attempt to mount the snapshot from user space. Normally this + * would be done using the vfs_kern_mount() function, however that + * function is marked GPL-only and cannot be used. On error we + * careful to log the real error to the console and return EISDIR + * to safely abort the automount. This should be very rare. + * + * If the user mode helper happens to return EBUSY, a concurrent + * mount is already in progress in which case the error is ignored. + * Take note that if the program was executed successfully the return + * value from call_usermodehelper() will be (exitcode << 8 + signal). + */ + dprintf("mount; name=%s path=%s\n", full_name, full_path); + argv[5] = full_name; + argv[6] = full_path; + error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC); + if (error) { + if (!(error & MOUNT_BUSY << 8)) { + zfs_dbgmsg("Unable to automount %s error=%d", + full_path, error); + error = SET_ERROR(EISDIR); + } else { + /* + * EBUSY, this could mean a concurrent mount, or the + * snapshot has already been mounted at completely + * different place. We return 0 so VFS will retry. For + * the latter case the VFS will retry several times + * and return ELOOP, which is probably not a very good + * behavior. + */ + error = 0; + } + goto error; + } + + /* + * Follow down in to the mounted snapshot and set MNT_SHRINKABLE + * to identify this as an automounted filesystem. 
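+ *
+ * Marking the vfsmount MNT_SHRINKABLE lets it be treated as an expirable
+ * automount; the delayed unmount armed below uses the zfs_expire_snapshot
+ * tunable (in seconds), so idle snapshots are eventually unmounted again
+ * (explanatory note only).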
+ */ + spath = *path; + path_get(&spath); + if (zpl_follow_down_one(&spath)) { + snap_zfsvfs = ITOZSB(spath.dentry->d_inode); + snap_zfsvfs->z_parent = zfsvfs; + dentry = spath.dentry; + spath.mnt->mnt_flags |= MNT_SHRINKABLE; + + rw_enter(&zfs_snapshot_lock, RW_WRITER); + se = zfsctl_snapshot_alloc(full_name, full_path, + snap_zfsvfs->z_os->os_spa, dmu_objset_id(snap_zfsvfs->z_os), + dentry); + zfsctl_snapshot_add(se); + zfsctl_snapshot_unmount_delay_impl(se, zfs_expire_snapshot); + rw_exit(&zfs_snapshot_lock); + } + path_put(&spath); +error: + kmem_free(full_name, ZFS_MAX_DATASET_NAME_LEN); + kmem_free(full_path, MAXPATHLEN); + + ZFS_EXIT(zfsvfs); + + return (error); +} + +/* + * Get the snapdir inode from fid + */ +int +zfsctl_snapdir_vget(struct super_block *sb, uint64_t objsetid, int gen, + struct inode **ipp) +{ + int error; + struct path path; + char *mnt; + struct dentry *dentry; + + mnt = kmem_alloc(MAXPATHLEN, KM_SLEEP); + + error = zfsctl_snapshot_path_objset(sb->s_fs_info, objsetid, + MAXPATHLEN, mnt); + if (error) + goto out; + + /* Trigger automount */ + error = -kern_path(mnt, LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &path); + if (error) + goto out; + + path_put(&path); + /* + * Get the snapdir inode. Note, we don't want to use the above + * path because it contains the root of the snapshot rather + * than the snapdir. + */ + *ipp = ilookup(sb, ZFSCTL_INO_SNAPDIRS - objsetid); + if (*ipp == NULL) { + error = SET_ERROR(ENOENT); + goto out; + } + + /* check gen, see zfsctl_snapdir_fid */ + dentry = d_obtain_alias(igrab(*ipp)); + if (gen != (!IS_ERR(dentry) && d_mountpoint(dentry))) { + iput(*ipp); + *ipp = NULL; + error = SET_ERROR(ENOENT); + } + if (!IS_ERR(dentry)) + dput(dentry); +out: + kmem_free(mnt, MAXPATHLEN); + return (error); +} + +int +zfsctl_shares_lookup(struct inode *dip, char *name, struct inode **ipp, + int flags, cred_t *cr, int *direntflags, pathname_t *realpnp) +{ + zfsvfs_t *zfsvfs = ITOZSB(dip); + struct inode *ip; + znode_t *dzp; + int error; + + ZFS_ENTER(zfsvfs); + + if (zfsvfs->z_shares_dir == 0) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(ENOTSUP)); + } + + if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp)) == 0) { + error = zfs_lookup(ZTOI(dzp), name, &ip, 0, cr, NULL, NULL); + iput(ZTOI(dzp)); + } + + ZFS_EXIT(zfsvfs); + + return (error); +} + +/* + * Initialize the various pieces we'll need to create and manipulate .zfs + * directories. Currently this is unused but available. + */ +void +zfsctl_init(void) +{ + avl_create(&zfs_snapshots_by_name, snapentry_compare_by_name, + sizeof (zfs_snapentry_t), offsetof(zfs_snapentry_t, + se_node_name)); + avl_create(&zfs_snapshots_by_objsetid, snapentry_compare_by_objsetid, + sizeof (zfs_snapentry_t), offsetof(zfs_snapentry_t, + se_node_objsetid)); + rw_init(&zfs_snapshot_lock, NULL, RW_DEFAULT, NULL); +} + +/* + * Cleanup the various pieces we needed for .zfs directories. In particular + * ensure the expiry timer is canceled safely. 
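+ *
+ * A minimal sketch of the expected pairing, for illustration only:
+ *
+ *   zfsctl_init();   set up the snapshot AVL trees and zfs_snapshot_lock
+ *   ... filesystems mount, snapshots automount and expire ...
+ *   zfsctl_fini();   tear them down again at module unload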
+ */ +void +zfsctl_fini(void) +{ + avl_destroy(&zfs_snapshots_by_name); + avl_destroy(&zfs_snapshots_by_objsetid); + rw_destroy(&zfs_snapshot_lock); +} + +module_param(zfs_admin_snapshot, int, 0644); +MODULE_PARM_DESC(zfs_admin_snapshot, "Enable mkdir/rmdir/mv in .zfs/snapshot"); + +module_param(zfs_expire_snapshot, int, 0644); +MODULE_PARM_DESC(zfs_expire_snapshot, "Seconds to expire .zfs/snapshot"); diff --git a/module/os/linux/zfs/zfs_debug.c b/module/os/linux/zfs/zfs_debug.c new file mode 100644 index 000000000..538533d27 --- /dev/null +++ b/module/os/linux/zfs/zfs_debug.c @@ -0,0 +1,253 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2014 by Delphix. All rights reserved. + */ + +#include <sys/zfs_context.h> + +typedef struct zfs_dbgmsg { + procfs_list_node_t zdm_node; + time_t zdm_timestamp; + int zdm_size; + char zdm_msg[1]; /* variable length allocation */ +} zfs_dbgmsg_t; + +procfs_list_t zfs_dbgmsgs; +int zfs_dbgmsg_size = 0; +int zfs_dbgmsg_maxsize = 4<<20; /* 4MB */ + +/* + * Internal ZFS debug messages are enabled by default. + * + * # Print debug messages + * cat /proc/spl/kstat/zfs/dbgmsg + * + * # Disable the kernel debug message log. + * echo 0 > /sys/module/zfs/parameters/zfs_dbgmsg_enable + * + * # Clear the kernel debug message log. 
+ * echo 0 >/proc/spl/kstat/zfs/dbgmsg + */ +int zfs_dbgmsg_enable = 1; + +static int +zfs_dbgmsg_show_header(struct seq_file *f) +{ + seq_printf(f, "%-12s %-8s\n", "timestamp", "message"); + return (0); +} + +static int +zfs_dbgmsg_show(struct seq_file *f, void *p) +{ + zfs_dbgmsg_t *zdm = (zfs_dbgmsg_t *)p; + seq_printf(f, "%-12llu %-s\n", + (u_longlong_t)zdm->zdm_timestamp, zdm->zdm_msg); + return (0); +} + +static void +zfs_dbgmsg_purge(int max_size) +{ + while (zfs_dbgmsg_size > max_size) { + zfs_dbgmsg_t *zdm = list_remove_head(&zfs_dbgmsgs.pl_list); + if (zdm == NULL) + return; + + int size = zdm->zdm_size; + kmem_free(zdm, size); + zfs_dbgmsg_size -= size; + } +} + +static int +zfs_dbgmsg_clear(procfs_list_t *procfs_list) +{ + mutex_enter(&zfs_dbgmsgs.pl_lock); + zfs_dbgmsg_purge(0); + mutex_exit(&zfs_dbgmsgs.pl_lock); + return (0); +} + +void +zfs_dbgmsg_init(void) +{ + procfs_list_install("zfs", + "dbgmsg", + 0600, + &zfs_dbgmsgs, + zfs_dbgmsg_show, + zfs_dbgmsg_show_header, + zfs_dbgmsg_clear, + offsetof(zfs_dbgmsg_t, zdm_node)); +} + +void +zfs_dbgmsg_fini(void) +{ + procfs_list_uninstall(&zfs_dbgmsgs); + zfs_dbgmsg_purge(0); + + /* + * TODO - decide how to make this permanent + */ +#ifdef _KERNEL + procfs_list_destroy(&zfs_dbgmsgs); +#endif +} + +void +__set_error(const char *file, const char *func, int line, int err) +{ + /* + * To enable this: + * + * $ echo 512 >/sys/module/zfs/parameters/zfs_flags + */ + if (zfs_flags & ZFS_DEBUG_SET_ERROR) + __dprintf(B_FALSE, file, func, line, "error %lu", err); +} + +void +__zfs_dbgmsg(char *buf) +{ + int size = sizeof (zfs_dbgmsg_t) + strlen(buf); + zfs_dbgmsg_t *zdm = kmem_zalloc(size, KM_SLEEP); + zdm->zdm_size = size; + zdm->zdm_timestamp = gethrestime_sec(); + strcpy(zdm->zdm_msg, buf); + + mutex_enter(&zfs_dbgmsgs.pl_lock); + procfs_list_add(&zfs_dbgmsgs, zdm); + zfs_dbgmsg_size += size; + zfs_dbgmsg_purge(MAX(zfs_dbgmsg_maxsize, 0)); + mutex_exit(&zfs_dbgmsgs.pl_lock); +} + +#ifdef _KERNEL + +void +__dprintf(boolean_t dprint, const char *file, const char *func, + int line, const char *fmt, ...) +{ + const char *newfile; + va_list adx; + size_t size; + char *buf; + char *nl; + int i; + char *prefix = (dprint) ? "dprintf: " : ""; + + size = 1024; + buf = kmem_alloc(size, KM_SLEEP); + + /* + * Get rid of annoying prefix to filename. + */ + newfile = strrchr(file, '/'); + if (newfile != NULL) { + newfile = newfile + 1; /* Get rid of leading / */ + } else { + newfile = file; + } + + i = snprintf(buf, size, "%s%s:%d:%s(): ", prefix, newfile, line, func); + + if (i < size) { + va_start(adx, fmt); + (void) vsnprintf(buf + i, size - i, fmt, adx); + va_end(adx); + } + + /* + * Get rid of trailing newline for dprintf logs. + */ + if (dprint && buf[0] != '\0') { + nl = &buf[strlen(buf) - 1]; + if (*nl == '\n') + *nl = '\0'; + } + + /* + * To get this data enable the zfs__dprintf trace point as shown: + * + * # Enable zfs__dprintf tracepoint, clear the tracepoint ring buffer + * $ echo 1 > /sys/kernel/debug/tracing/events/zfs/enable + * $ echo 0 > /sys/kernel/debug/tracing/trace + * + * # Dump the ring buffer. 
+ * $ cat /sys/kernel/debug/tracing/trace + */ + DTRACE_PROBE1(zfs__dprintf, char *, buf); + + /* + * To get this data: + * + * $ cat /proc/spl/kstat/zfs/dbgmsg + * + * To clear the buffer: + * $ echo 0 > /proc/spl/kstat/zfs/dbgmsg + */ + __zfs_dbgmsg(buf); + + kmem_free(buf, size); +} + +#else + +void +zfs_dbgmsg_print(const char *tag) +{ + ssize_t ret __attribute__((unused)); + + /* + * We use write() in this function instead of printf() + * so it is safe to call from a signal handler. + */ + ret = write(STDOUT_FILENO, "ZFS_DBGMSG(", 11); + ret = write(STDOUT_FILENO, tag, strlen(tag)); + ret = write(STDOUT_FILENO, ") START:\n", 9); + + mutex_enter(&zfs_dbgmsgs.pl_lock); + for (zfs_dbgmsg_t *zdm = list_head(&zfs_dbgmsgs.pl_list); zdm != NULL; + zdm = list_next(&zfs_dbgmsgs.pl_list, zdm)) { + ret = write(STDOUT_FILENO, zdm->zdm_msg, + strlen(zdm->zdm_msg)); + ret = write(STDOUT_FILENO, "\n", 1); + } + + ret = write(STDOUT_FILENO, "ZFS_DBGMSG(", 11); + ret = write(STDOUT_FILENO, tag, strlen(tag)); + ret = write(STDOUT_FILENO, ") END\n", 6); + + mutex_exit(&zfs_dbgmsgs.pl_lock); +} +#endif /* _KERNEL */ + +#ifdef _KERNEL +module_param(zfs_dbgmsg_enable, int, 0644); +MODULE_PARM_DESC(zfs_dbgmsg_enable, "Enable ZFS debug message log"); + +module_param(zfs_dbgmsg_maxsize, int, 0644); +MODULE_PARM_DESC(zfs_dbgmsg_maxsize, "Maximum ZFS debug log size"); +#endif diff --git a/module/os/linux/zfs/zfs_dir.c b/module/os/linux/zfs/zfs_dir.c new file mode 100644 index 000000000..6bdad737c --- /dev/null +++ b/module/os/linux/zfs/zfs_dir.c @@ -0,0 +1,1205 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2013, 2016 by Delphix. All rights reserved. + * Copyright 2017 Nexenta Systems, Inc. + */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/time.h> +#include <sys/sysmacros.h> +#include <sys/vfs.h> +#include <sys/vnode.h> +#include <sys/file.h> +#include <sys/mode.h> +#include <sys/kmem.h> +#include <sys/uio.h> +#include <sys/pathname.h> +#include <sys/cmn_err.h> +#include <sys/errno.h> +#include <sys/stat.h> +#include <sys/sunddi.h> +#include <sys/random.h> +#include <sys/policy.h> +#include <sys/zfs_dir.h> +#include <sys/zfs_acl.h> +#include <sys/zfs_vnops.h> +#include <sys/fs/zfs.h> +#include <sys/zap.h> +#include <sys/dmu.h> +#include <sys/atomic.h> +#include <sys/zfs_ctldir.h> +#include <sys/zfs_fuid.h> +#include <sys/sa.h> +#include <sys/zfs_sa.h> + +/* + * zfs_match_find() is used by zfs_dirent_lock() to perform zap lookups + * of names after deciding which is the appropriate lookup interface. 
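+ *
+ * For instance, on a dataset created with normalization=formD the lookup
+ * goes through zap_lookup_norm() so that differently composed UTF-8
+ * spellings of the same name match a single entry, while on a dataset
+ * with no normalization or case folding it is an ordinary zap_lookup()
+ * (illustrative note only).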
+ */ +static int +zfs_match_find(zfsvfs_t *zfsvfs, znode_t *dzp, char *name, matchtype_t mt, + boolean_t update, int *deflags, pathname_t *rpnp, uint64_t *zoid) +{ + boolean_t conflict = B_FALSE; + int error; + + if (zfsvfs->z_norm) { + size_t bufsz = 0; + char *buf = NULL; + + if (rpnp) { + buf = rpnp->pn_buf; + bufsz = rpnp->pn_bufsize; + } + + /* + * In the non-mixed case we only expect there would ever + * be one match, but we need to use the normalizing lookup. + */ + error = zap_lookup_norm(zfsvfs->z_os, dzp->z_id, name, 8, 1, + zoid, mt, buf, bufsz, &conflict); + } else { + error = zap_lookup(zfsvfs->z_os, dzp->z_id, name, 8, 1, zoid); + } + + /* + * Allow multiple entries provided the first entry is + * the object id. Non-zpl consumers may safely make + * use of the additional space. + * + * XXX: This should be a feature flag for compatibility + */ + if (error == EOVERFLOW) + error = 0; + + if (zfsvfs->z_norm && !error && deflags) + *deflags = conflict ? ED_CASE_CONFLICT : 0; + + *zoid = ZFS_DIRENT_OBJ(*zoid); + + return (error); +} + +/* + * Lock a directory entry. A dirlock on <dzp, name> protects that name + * in dzp's directory zap object. As long as you hold a dirlock, you can + * assume two things: (1) dzp cannot be reaped, and (2) no other thread + * can change the zap entry for (i.e. link or unlink) this name. + * + * Input arguments: + * dzp - znode for directory + * name - name of entry to lock + * flag - ZNEW: if the entry already exists, fail with EEXIST. + * ZEXISTS: if the entry does not exist, fail with ENOENT. + * ZSHARED: allow concurrent access with other ZSHARED callers. + * ZXATTR: we want dzp's xattr directory + * ZCILOOK: On a mixed sensitivity file system, + * this lookup should be case-insensitive. + * ZCIEXACT: On a purely case-insensitive file system, + * this lookup should be case-sensitive. + * ZRENAMING: we are locking for renaming, force narrow locks + * ZHAVELOCK: Don't grab the z_name_lock for this call. The + * current thread already holds it. + * + * Output arguments: + * zpp - pointer to the znode for the entry (NULL if there isn't one) + * dlpp - pointer to the dirlock for this entry (NULL on error) + * direntflags - (case-insensitive lookup only) + * flags if multiple case-sensitive matches exist in directory + * realpnp - (case-insensitive lookup only) + * actual name matched within the directory + * + * Return value: 0 on success or errno on failure. + * + * NOTE: Always checks for, and rejects, '.' and '..'. + * NOTE: For case-insensitive file systems we take wide locks (see below), + * but return znode pointers to a single match. + */ +int +zfs_dirent_lock(zfs_dirlock_t **dlpp, znode_t *dzp, char *name, znode_t **zpp, + int flag, int *direntflags, pathname_t *realpnp) +{ + zfsvfs_t *zfsvfs = ZTOZSB(dzp); + zfs_dirlock_t *dl; + boolean_t update; + matchtype_t mt = 0; + uint64_t zoid; + int error = 0; + int cmpflags; + + *zpp = NULL; + *dlpp = NULL; + + /* + * Verify that we are not trying to lock '.', '..', or '.zfs' + */ + if ((name[0] == '.' && + (name[1] == '\0' || (name[1] == '.' && name[2] == '\0'))) || + (zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0)) + return (SET_ERROR(EEXIST)); + + /* + * Case sensitivity and normalization preferences are set when + * the file system is created. These are stored in the + * zfsvfs->z_case and zfsvfs->z_norm fields. These choices + * affect what vnodes can be cached in the DNLC, how we + * perform zap lookups, and the "width" of our dirlocks. + * + * A normal dirlock locks a single name. 
Note that with + * normalization a name can be composed multiple ways, but + * when normalized, these names all compare equal. A wide + * dirlock locks multiple names. We need these when the file + * system is supporting mixed-mode access. It is sometimes + * necessary to lock all case permutations of file name at + * once so that simultaneous case-insensitive/case-sensitive + * behaves as rationally as possible. + */ + + /* + * When matching we may need to normalize & change case according to + * FS settings. + * + * Note that a normalized match is necessary for a case insensitive + * filesystem when the lookup request is not exact because normalization + * can fold case independent of normalizing code point sequences. + * + * See the table above zfs_dropname(). + */ + if (zfsvfs->z_norm != 0) { + mt = MT_NORMALIZE; + + /* + * Determine if the match needs to honor the case specified in + * lookup, and if so keep track of that so that during + * normalization we don't fold case. + */ + if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE && + (flag & ZCIEXACT)) || + (zfsvfs->z_case == ZFS_CASE_MIXED && !(flag & ZCILOOK))) { + mt |= MT_MATCH_CASE; + } + } + + /* + * Only look in or update the DNLC if we are looking for the + * name on a file system that does not require normalization + * or case folding. We can also look there if we happen to be + * on a non-normalizing, mixed sensitivity file system IF we + * are looking for the exact name. + * + * Maybe can add TO-UPPERed version of name to dnlc in ci-only + * case for performance improvement? + */ + update = !zfsvfs->z_norm || + (zfsvfs->z_case == ZFS_CASE_MIXED && + !(zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER) && !(flag & ZCILOOK)); + + /* + * ZRENAMING indicates we are in a situation where we should + * take narrow locks regardless of the file system's + * preferences for normalizing and case folding. This will + * prevent us deadlocking trying to grab the same wide lock + * twice if the two names happen to be case-insensitive + * matches. + */ + if (flag & ZRENAMING) + cmpflags = 0; + else + cmpflags = zfsvfs->z_norm; + + /* + * Wait until there are no locks on this name. + * + * Don't grab the lock if it is already held. However, cannot + * have both ZSHARED and ZHAVELOCK together. + */ + ASSERT(!(flag & ZSHARED) || !(flag & ZHAVELOCK)); + if (!(flag & ZHAVELOCK)) + rw_enter(&dzp->z_name_lock, RW_READER); + + mutex_enter(&dzp->z_lock); + for (;;) { + if (dzp->z_unlinked && !(flag & ZXATTR)) { + mutex_exit(&dzp->z_lock); + if (!(flag & ZHAVELOCK)) + rw_exit(&dzp->z_name_lock); + return (SET_ERROR(ENOENT)); + } + for (dl = dzp->z_dirlocks; dl != NULL; dl = dl->dl_next) { + if ((u8_strcmp(name, dl->dl_name, 0, cmpflags, + U8_UNICODE_LATEST, &error) == 0) || error != 0) + break; + } + if (error != 0) { + mutex_exit(&dzp->z_lock); + if (!(flag & ZHAVELOCK)) + rw_exit(&dzp->z_name_lock); + return (SET_ERROR(ENOENT)); + } + if (dl == NULL) { + /* + * Allocate a new dirlock and add it to the list. + */ + dl = kmem_alloc(sizeof (zfs_dirlock_t), KM_SLEEP); + cv_init(&dl->dl_cv, NULL, CV_DEFAULT, NULL); + dl->dl_name = name; + dl->dl_sharecnt = 0; + dl->dl_namelock = 0; + dl->dl_namesize = 0; + dl->dl_dzp = dzp; + dl->dl_next = dzp->z_dirlocks; + dzp->z_dirlocks = dl; + break; + } + if ((flag & ZSHARED) && dl->dl_sharecnt != 0) + break; + cv_wait(&dl->dl_cv, &dzp->z_lock); + } + + /* + * If the z_name_lock was NOT held for this dirlock record it. 
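+ * (That is, the caller passed ZHAVELOCK and already owns
+ * dzp->z_name_lock; we never took it above, so dl_namelock tells
+ * zfs_dirent_unlock() not to drop a lock we do not own. Clarifying
+ * note only.)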
+ */ + if (flag & ZHAVELOCK) + dl->dl_namelock = 1; + + if ((flag & ZSHARED) && ++dl->dl_sharecnt > 1 && dl->dl_namesize == 0) { + /* + * We're the second shared reference to dl. Make a copy of + * dl_name in case the first thread goes away before we do. + * Note that we initialize the new name before storing its + * pointer into dl_name, because the first thread may load + * dl->dl_name at any time. It'll either see the old value, + * which belongs to it, or the new shared copy; either is OK. + */ + dl->dl_namesize = strlen(dl->dl_name) + 1; + name = kmem_alloc(dl->dl_namesize, KM_SLEEP); + bcopy(dl->dl_name, name, dl->dl_namesize); + dl->dl_name = name; + } + + mutex_exit(&dzp->z_lock); + + /* + * We have a dirlock on the name. (Note that it is the dirlock, + * not the dzp's z_lock, that protects the name in the zap object.) + * See if there's an object by this name; if so, put a hold on it. + */ + if (flag & ZXATTR) { + error = sa_lookup(dzp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &zoid, + sizeof (zoid)); + if (error == 0) + error = (zoid == 0 ? SET_ERROR(ENOENT) : 0); + } else { + error = zfs_match_find(zfsvfs, dzp, name, mt, + update, direntflags, realpnp, &zoid); + } + if (error) { + if (error != ENOENT || (flag & ZEXISTS)) { + zfs_dirent_unlock(dl); + return (error); + } + } else { + if (flag & ZNEW) { + zfs_dirent_unlock(dl); + return (SET_ERROR(EEXIST)); + } + error = zfs_zget(zfsvfs, zoid, zpp); + if (error) { + zfs_dirent_unlock(dl); + return (error); + } + } + + *dlpp = dl; + + return (0); +} + +/* + * Unlock this directory entry and wake anyone who was waiting for it. + */ +void +zfs_dirent_unlock(zfs_dirlock_t *dl) +{ + znode_t *dzp = dl->dl_dzp; + zfs_dirlock_t **prev_dl, *cur_dl; + + mutex_enter(&dzp->z_lock); + + if (!dl->dl_namelock) + rw_exit(&dzp->z_name_lock); + + if (dl->dl_sharecnt > 1) { + dl->dl_sharecnt--; + mutex_exit(&dzp->z_lock); + return; + } + prev_dl = &dzp->z_dirlocks; + while ((cur_dl = *prev_dl) != dl) + prev_dl = &cur_dl->dl_next; + *prev_dl = dl->dl_next; + cv_broadcast(&dl->dl_cv); + mutex_exit(&dzp->z_lock); + + if (dl->dl_namesize != 0) + kmem_free(dl->dl_name, dl->dl_namesize); + cv_destroy(&dl->dl_cv); + kmem_free(dl, sizeof (*dl)); +} + +/* + * Look up an entry in a directory. + * + * NOTE: '.' and '..' are handled as special cases because + * no directory entries are actually stored for them. If this is + * the root of a filesystem, then '.zfs' is also treated as a + * special pseudo-directory. + */ +int +zfs_dirlook(znode_t *dzp, char *name, struct inode **ipp, int flags, + int *deflg, pathname_t *rpnp) +{ + zfs_dirlock_t *dl; + znode_t *zp; + int error = 0; + uint64_t parent; + + if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) { + *ipp = ZTOI(dzp); + igrab(*ipp); + } else if (name[0] == '.' && name[1] == '.' && name[2] == 0) { + zfsvfs_t *zfsvfs = ZTOZSB(dzp); + + /* + * If we are a snapshot mounted under .zfs, return + * the inode pointer for the snapshot directory. 
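+ * For example, looking up ".." from the root of an automounted
+ * /tank/fs/.zfs/snapshot/today/ should land back in the parent
+ * filesystem's snapshot directory rather than in a regular parent
+ * znode (hypothetical path, for illustration).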
+ */ + if ((error = sa_lookup(dzp->z_sa_hdl, + SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0) + return (error); + + if (parent == dzp->z_id && zfsvfs->z_parent != zfsvfs) { + error = zfsctl_root_lookup(zfsvfs->z_parent->z_ctldir, + "snapshot", ipp, 0, kcred, NULL, NULL); + return (error); + } + rw_enter(&dzp->z_parent_lock, RW_READER); + error = zfs_zget(zfsvfs, parent, &zp); + if (error == 0) + *ipp = ZTOI(zp); + rw_exit(&dzp->z_parent_lock); + } else if (zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0) { + *ipp = zfsctl_root(dzp); + } else { + int zf; + + zf = ZEXISTS | ZSHARED; + if (flags & FIGNORECASE) + zf |= ZCILOOK; + + error = zfs_dirent_lock(&dl, dzp, name, &zp, zf, deflg, rpnp); + if (error == 0) { + *ipp = ZTOI(zp); + zfs_dirent_unlock(dl); + dzp->z_zn_prefetch = B_TRUE; /* enable prefetching */ + } + rpnp = NULL; + } + + if ((flags & FIGNORECASE) && rpnp && !error) + (void) strlcpy(rpnp->pn_buf, name, rpnp->pn_bufsize); + + return (error); +} + +/* + * unlinked Set (formerly known as the "delete queue") Error Handling + * + * When dealing with the unlinked set, we dmu_tx_hold_zap(), but we + * don't specify the name of the entry that we will be manipulating. We + * also fib and say that we won't be adding any new entries to the + * unlinked set, even though we might (this is to lower the minimum file + * size that can be deleted in a full filesystem). So on the small + * chance that the nlink list is using a fat zap (ie. has more than + * 2000 entries), we *may* not pre-read a block that's needed. + * Therefore it is remotely possible for some of the assertions + * regarding the unlinked set below to fail due to i/o error. On a + * nondebug system, this will result in the space being leaked. + */ +void +zfs_unlinked_add(znode_t *zp, dmu_tx_t *tx) +{ + zfsvfs_t *zfsvfs = ZTOZSB(zp); + + ASSERT(zp->z_unlinked); + ASSERT(ZTOI(zp)->i_nlink == 0); + + VERIFY3U(0, ==, + zap_add_int(zfsvfs->z_os, zfsvfs->z_unlinkedobj, zp->z_id, tx)); + + dataset_kstats_update_nunlinks_kstat(&zfsvfs->z_kstat, 1); +} + +/* + * Clean up any znodes that had no links when we either crashed or + * (force) umounted the file system. + */ +static void +zfs_unlinked_drain_task(void *arg) +{ + zfsvfs_t *zfsvfs = arg; + zap_cursor_t zc; + zap_attribute_t zap; + dmu_object_info_t doi; + znode_t *zp; + int error; + + ASSERT3B(zfsvfs->z_draining, ==, B_TRUE); + + /* + * Iterate over the contents of the unlinked set. + */ + for (zap_cursor_init(&zc, zfsvfs->z_os, zfsvfs->z_unlinkedobj); + zap_cursor_retrieve(&zc, &zap) == 0 && !zfsvfs->z_drain_cancel; + zap_cursor_advance(&zc)) { + + /* + * See what kind of object we have in list + */ + + error = dmu_object_info(zfsvfs->z_os, + zap.za_first_integer, &doi); + if (error != 0) + continue; + + ASSERT((doi.doi_type == DMU_OT_PLAIN_FILE_CONTENTS) || + (doi.doi_type == DMU_OT_DIRECTORY_CONTENTS)); + /* + * We need to re-mark these list entries for deletion, + * so we pull them back into core and set zp->z_unlinked. + */ + error = zfs_zget(zfsvfs, zap.za_first_integer, &zp); + + /* + * We may pick up znodes that are already marked for deletion. + * This could happen during the purge of an extended attribute + * directory. All we need to do is skip over them, since they + * are already in the system marked z_unlinked. + */ + if (error != 0) + continue; + + zp->z_unlinked = B_TRUE; + + /* + * iput() is Linux's equivalent to illumos' VN_RELE(). It will + * decrement the inode's ref count and may cause the inode to be + * synchronously freed. 
We interrupt freeing of this inode, by + * checking the return value of dmu_objset_zfs_unmounting() in + * dmu_free_long_range(), when an unmount is requested. + */ + iput(ZTOI(zp)); + ASSERT3B(zfsvfs->z_unmounted, ==, B_FALSE); + } + zap_cursor_fini(&zc); + + zfsvfs->z_draining = B_FALSE; + zfsvfs->z_drain_task = TASKQID_INVALID; +} + +/* + * Sets z_draining then tries to dispatch async unlinked drain. + * If that fails executes synchronous unlinked drain. + */ +void +zfs_unlinked_drain(zfsvfs_t *zfsvfs) +{ + ASSERT3B(zfsvfs->z_unmounted, ==, B_FALSE); + ASSERT3B(zfsvfs->z_draining, ==, B_FALSE); + + zfsvfs->z_draining = B_TRUE; + zfsvfs->z_drain_cancel = B_FALSE; + + zfsvfs->z_drain_task = taskq_dispatch( + dsl_pool_unlinked_drain_taskq(dmu_objset_pool(zfsvfs->z_os)), + zfs_unlinked_drain_task, zfsvfs, TQ_SLEEP); + if (zfsvfs->z_drain_task == TASKQID_INVALID) { + zfs_dbgmsg("async zfs_unlinked_drain dispatch failed"); + zfs_unlinked_drain_task(zfsvfs); + } +} + +/* + * Wait for the unlinked drain taskq task to stop. This will interrupt the + * unlinked set processing if it is in progress. + */ +void +zfs_unlinked_drain_stop_wait(zfsvfs_t *zfsvfs) +{ + ASSERT3B(zfsvfs->z_unmounted, ==, B_FALSE); + + if (zfsvfs->z_draining) { + zfsvfs->z_drain_cancel = B_TRUE; + taskq_cancel_id(dsl_pool_unlinked_drain_taskq( + dmu_objset_pool(zfsvfs->z_os)), zfsvfs->z_drain_task); + zfsvfs->z_drain_task = TASKQID_INVALID; + zfsvfs->z_draining = B_FALSE; + } +} + +/* + * Delete the entire contents of a directory. Return a count + * of the number of entries that could not be deleted. If we encounter + * an error, return a count of at least one so that the directory stays + * in the unlinked set. + * + * NOTE: this function assumes that the directory is inactive, + * so there is no need to lock its entries before deletion. + * Also, it assumes the directory contents is *only* regular + * files. + */ +static int +zfs_purgedir(znode_t *dzp) +{ + zap_cursor_t zc; + zap_attribute_t zap; + znode_t *xzp; + dmu_tx_t *tx; + zfsvfs_t *zfsvfs = ZTOZSB(dzp); + zfs_dirlock_t dl; + int skipped = 0; + int error; + + for (zap_cursor_init(&zc, zfsvfs->z_os, dzp->z_id); + (error = zap_cursor_retrieve(&zc, &zap)) == 0; + zap_cursor_advance(&zc)) { + error = zfs_zget(zfsvfs, + ZFS_DIRENT_OBJ(zap.za_first_integer), &xzp); + if (error) { + skipped += 1; + continue; + } + + ASSERT(S_ISREG(ZTOI(xzp)->i_mode) || + S_ISLNK(ZTOI(xzp)->i_mode)); + + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); + dmu_tx_hold_zap(tx, dzp->z_id, FALSE, zap.za_name); + dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE); + dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); + /* Is this really needed ? 
*/ + zfs_sa_upgrade_txholds(tx, xzp); + dmu_tx_mark_netfree(tx); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + zfs_iput_async(ZTOI(xzp)); + skipped += 1; + continue; + } + bzero(&dl, sizeof (dl)); + dl.dl_dzp = dzp; + dl.dl_name = zap.za_name; + + error = zfs_link_destroy(&dl, xzp, tx, 0, NULL); + if (error) + skipped += 1; + dmu_tx_commit(tx); + + zfs_iput_async(ZTOI(xzp)); + } + zap_cursor_fini(&zc); + if (error != ENOENT) + skipped += 1; + return (skipped); +} + +void +zfs_rmnode(znode_t *zp) +{ + zfsvfs_t *zfsvfs = ZTOZSB(zp); + objset_t *os = zfsvfs->z_os; + znode_t *xzp = NULL; + dmu_tx_t *tx; + uint64_t acl_obj; + uint64_t xattr_obj; + uint64_t links; + int error; + + ASSERT(ZTOI(zp)->i_nlink == 0); + ASSERT(atomic_read(&ZTOI(zp)->i_count) == 0); + + /* + * If this is an attribute directory, purge its contents. + */ + if (S_ISDIR(ZTOI(zp)->i_mode) && (zp->z_pflags & ZFS_XATTR)) { + if (zfs_purgedir(zp) != 0) { + /* + * Not enough space to delete some xattrs. + * Leave it in the unlinked set. + */ + zfs_znode_dmu_fini(zp); + + return; + } + } + + /* + * Free up all the data in the file. We don't do this for directories + * because we need truncate and remove to be in the same tx, like in + * zfs_znode_delete(). Otherwise, if we crash here we'll end up with + * an inconsistent truncated zap object in the delete queue. Note a + * truncated file is harmless since it only contains user data. + */ + if (S_ISREG(ZTOI(zp)->i_mode)) { + error = dmu_free_long_range(os, zp->z_id, 0, DMU_OBJECT_END); + if (error) { + /* + * Not enough space or we were interrupted by unmount. + * Leave the file in the unlinked set. + */ + zfs_znode_dmu_fini(zp); + return; + } + } + + /* + * If the file has extended attributes, we're going to unlink + * the xattr dir. + */ + error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), + &xattr_obj, sizeof (xattr_obj)); + if (error == 0 && xattr_obj) { + error = zfs_zget(zfsvfs, xattr_obj, &xzp); + ASSERT(error == 0); + } + + acl_obj = zfs_external_acl(zp); + + /* + * Set up the final transaction. + */ + tx = dmu_tx_create(os); + dmu_tx_hold_free(tx, zp->z_id, 0, DMU_OBJECT_END); + dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); + if (xzp) { + dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, TRUE, NULL); + dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE); + } + if (acl_obj) + dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END); + + zfs_sa_upgrade_txholds(tx, zp); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + /* + * Not enough space to delete the file. Leave it in the + * unlinked set, leaking it until the fs is remounted (at + * which point we'll call zfs_unlinked_drain() to process it). 
+ */ + dmu_tx_abort(tx); + zfs_znode_dmu_fini(zp); + goto out; + } + + if (xzp) { + ASSERT(error == 0); + mutex_enter(&xzp->z_lock); + xzp->z_unlinked = B_TRUE; /* mark xzp for deletion */ + clear_nlink(ZTOI(xzp)); /* no more links to it */ + links = 0; + VERIFY(0 == sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs), + &links, sizeof (links), tx)); + mutex_exit(&xzp->z_lock); + zfs_unlinked_add(xzp, tx); + } + + /* Remove this znode from the unlinked set */ + VERIFY3U(0, ==, + zap_remove_int(zfsvfs->z_os, zfsvfs->z_unlinkedobj, zp->z_id, tx)); + + dataset_kstats_update_nunlinked_kstat(&zfsvfs->z_kstat, 1); + + zfs_znode_delete(zp, tx); + + dmu_tx_commit(tx); +out: + if (xzp) + zfs_iput_async(ZTOI(xzp)); +} + +static uint64_t +zfs_dirent(znode_t *zp, uint64_t mode) +{ + uint64_t de = zp->z_id; + + if (ZTOZSB(zp)->z_version >= ZPL_VERSION_DIRENT_TYPE) + de |= IFTODT(mode) << 60; + return (de); +} + +/* + * Link zp into dl. Can fail in the following cases : + * - if zp has been unlinked. + * - if the number of entries with the same hash (aka. colliding entries) + * exceed the capacity of a leaf-block of fatzap and splitting of the + * leaf-block does not help. + */ +int +zfs_link_create(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag) +{ + znode_t *dzp = dl->dl_dzp; + zfsvfs_t *zfsvfs = ZTOZSB(zp); + uint64_t value; + int zp_is_dir = S_ISDIR(ZTOI(zp)->i_mode); + sa_bulk_attr_t bulk[5]; + uint64_t mtime[2], ctime[2]; + uint64_t links; + int count = 0; + int error; + + mutex_enter(&zp->z_lock); + + if (!(flag & ZRENAMING)) { + if (zp->z_unlinked) { /* no new links to unlinked zp */ + ASSERT(!(flag & (ZNEW | ZEXISTS))); + mutex_exit(&zp->z_lock); + return (SET_ERROR(ENOENT)); + } + if (!(flag & ZNEW)) { + /* + * ZNEW nodes come from zfs_mknode() where the link + * count has already been initialised + */ + inc_nlink(ZTOI(zp)); + links = ZTOI(zp)->i_nlink; + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), + NULL, &links, sizeof (links)); + } + } + + value = zfs_dirent(zp, zp->z_mode); + error = zap_add(ZTOZSB(zp)->z_os, dzp->z_id, dl->dl_name, 8, 1, + &value, tx); + + /* + * zap_add could fail to add the entry if it exceeds the capacity of the + * leaf-block and zap_leaf_split() failed to help. + * The caller of this routine is responsible for failing the transaction + * which will rollback the SA updates done above. 
+ */ + if (error != 0) { + if (!(flag & ZRENAMING) && !(flag & ZNEW)) + drop_nlink(ZTOI(zp)); + mutex_exit(&zp->z_lock); + return (error); + } + + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL, + &dzp->z_id, sizeof (dzp->z_id)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, + &zp->z_pflags, sizeof (zp->z_pflags)); + + if (!(flag & ZNEW)) { + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, + ctime, sizeof (ctime)); + zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, + ctime); + } + error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); + ASSERT(error == 0); + + mutex_exit(&zp->z_lock); + + mutex_enter(&dzp->z_lock); + dzp->z_size++; + if (zp_is_dir) + inc_nlink(ZTOI(dzp)); + links = ZTOI(dzp)->i_nlink; + count = 0; + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, + &dzp->z_size, sizeof (dzp->z_size)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, + &links, sizeof (links)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, + mtime, sizeof (mtime)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, + ctime, sizeof (ctime)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, + &dzp->z_pflags, sizeof (dzp->z_pflags)); + zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime); + error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx); + ASSERT(error == 0); + mutex_exit(&dzp->z_lock); + + return (0); +} + +/* + * The match type in the code for this function should conform to: + * + * ------------------------------------------------------------------------ + * fs type | z_norm | lookup type | match type + * ---------|-------------|-------------|---------------------------------- + * CS !norm | 0 | 0 | 0 (exact) + * CS norm | formX | 0 | MT_NORMALIZE + * CI !norm | upper | !ZCIEXACT | MT_NORMALIZE + * CI !norm | upper | ZCIEXACT | MT_NORMALIZE | MT_MATCH_CASE + * CI norm | upper|formX | !ZCIEXACT | MT_NORMALIZE + * CI norm | upper|formX | ZCIEXACT | MT_NORMALIZE | MT_MATCH_CASE + * CM !norm | upper | !ZCILOOK | MT_NORMALIZE | MT_MATCH_CASE + * CM !norm | upper | ZCILOOK | MT_NORMALIZE + * CM norm | upper|formX | !ZCILOOK | MT_NORMALIZE | MT_MATCH_CASE + * CM norm | upper|formX | ZCILOOK | MT_NORMALIZE + * + * Abbreviations: + * CS = Case Sensitive, CI = Case Insensitive, CM = Case Mixed + * upper = case folding set by fs type on creation (U8_TEXTPREP_TOUPPER) + * formX = unicode normalization form set on fs creation + */ +static int +zfs_dropname(zfs_dirlock_t *dl, znode_t *zp, znode_t *dzp, dmu_tx_t *tx, + int flag) +{ + int error; + + if (ZTOZSB(zp)->z_norm) { + matchtype_t mt = MT_NORMALIZE; + + if ((ZTOZSB(zp)->z_case == ZFS_CASE_INSENSITIVE && + (flag & ZCIEXACT)) || + (ZTOZSB(zp)->z_case == ZFS_CASE_MIXED && + !(flag & ZCILOOK))) { + mt |= MT_MATCH_CASE; + } + + error = zap_remove_norm(ZTOZSB(zp)->z_os, dzp->z_id, + dl->dl_name, mt, tx); + } else { + error = zap_remove(ZTOZSB(zp)->z_os, dzp->z_id, dl->dl_name, + tx); + } + + return (error); +} + +/* + * Unlink zp from dl, and mark zp for deletion if this was the last link. Can + * fail if zp is a mount point (EBUSY) or a non-empty directory (ENOTEMPTY). + * If 'unlinkedp' is NULL, we put unlinked znodes on the unlinked list. + * If it's non-NULL, we use it to indicate whether the znode needs deletion, + * and it's the caller's job to do it. 
+ */ +int +zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag, + boolean_t *unlinkedp) +{ + znode_t *dzp = dl->dl_dzp; + zfsvfs_t *zfsvfs = ZTOZSB(dzp); + int zp_is_dir = S_ISDIR(ZTOI(zp)->i_mode); + boolean_t unlinked = B_FALSE; + sa_bulk_attr_t bulk[5]; + uint64_t mtime[2], ctime[2]; + uint64_t links; + int count = 0; + int error; + + if (!(flag & ZRENAMING)) { + mutex_enter(&zp->z_lock); + + if (zp_is_dir && !zfs_dirempty(zp)) { + mutex_exit(&zp->z_lock); + return (SET_ERROR(ENOTEMPTY)); + } + + /* + * If we get here, we are going to try to remove the object. + * First try removing the name from the directory; if that + * fails, return the error. + */ + error = zfs_dropname(dl, zp, dzp, tx, flag); + if (error != 0) { + mutex_exit(&zp->z_lock); + return (error); + } + + if (ZTOI(zp)->i_nlink <= zp_is_dir) { + zfs_panic_recover("zfs: link count on %lu is %u, " + "should be at least %u", zp->z_id, + (int)ZTOI(zp)->i_nlink, zp_is_dir + 1); + set_nlink(ZTOI(zp), zp_is_dir + 1); + } + drop_nlink(ZTOI(zp)); + if (ZTOI(zp)->i_nlink == zp_is_dir) { + zp->z_unlinked = B_TRUE; + clear_nlink(ZTOI(zp)); + unlinked = B_TRUE; + } else { + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), + NULL, &ctime, sizeof (ctime)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), + NULL, &zp->z_pflags, sizeof (zp->z_pflags)); + zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, + ctime); + } + links = ZTOI(zp)->i_nlink; + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), + NULL, &links, sizeof (links)); + error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); + count = 0; + ASSERT(error == 0); + mutex_exit(&zp->z_lock); + } else { + error = zfs_dropname(dl, zp, dzp, tx, flag); + if (error != 0) + return (error); + } + + mutex_enter(&dzp->z_lock); + dzp->z_size--; /* one dirent removed */ + if (zp_is_dir) + drop_nlink(ZTOI(dzp)); /* ".." link from zp */ + links = ZTOI(dzp)->i_nlink; + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), + NULL, &links, sizeof (links)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), + NULL, &dzp->z_size, sizeof (dzp->z_size)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), + NULL, ctime, sizeof (ctime)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), + NULL, mtime, sizeof (mtime)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), + NULL, &dzp->z_pflags, sizeof (dzp->z_pflags)); + zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime); + error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx); + ASSERT(error == 0); + mutex_exit(&dzp->z_lock); + + if (unlinkedp != NULL) + *unlinkedp = unlinked; + else if (unlinked) + zfs_unlinked_add(zp, tx); + + return (0); +} + +/* + * Indicate whether the directory is empty. Works with or without z_lock + * held, but can only be consider a hint in the latter case. Returns true + * if only "." and ".." remain and there's no work in progress. + * + * The internal ZAP size, rather than zp->z_size, needs to be checked since + * some consumers (Lustre) do not strictly maintain an accurate SA_ZPL_SIZE. 
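+ *
+ * Note that an empty directory reports zap_count() == 0 below, since "."
+ * and ".." are never stored as ZAP entries (see the note above
+ * zfs_dirlook()); any nonzero count means a real entry remains.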
+ */ +boolean_t +zfs_dirempty(znode_t *dzp) +{ + zfsvfs_t *zfsvfs = ZTOZSB(dzp); + uint64_t count; + int error; + + if (dzp->z_dirlocks != NULL) + return (B_FALSE); + + error = zap_count(zfsvfs->z_os, dzp->z_id, &count); + if (error != 0 || count != 0) + return (B_FALSE); + + return (B_TRUE); +} + +int +zfs_make_xattrdir(znode_t *zp, vattr_t *vap, struct inode **xipp, cred_t *cr) +{ + zfsvfs_t *zfsvfs = ZTOZSB(zp); + znode_t *xzp; + dmu_tx_t *tx; + int error; + zfs_acl_ids_t acl_ids; + boolean_t fuid_dirtied; +#ifdef DEBUG + uint64_t parent; +#endif + + *xipp = NULL; + + if ((error = zfs_zaccess(zp, ACE_WRITE_NAMED_ATTRS, 0, B_FALSE, cr))) + return (error); + + if ((error = zfs_acl_ids_create(zp, IS_XATTR, vap, cr, NULL, + &acl_ids)) != 0) + return (error); + if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, zp->z_projid)) { + zfs_acl_ids_free(&acl_ids); + return (SET_ERROR(EDQUOT)); + } + + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + + ZFS_SA_BASE_ATTR_SIZE); + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); + dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); + fuid_dirtied = zfsvfs->z_fuid_dirty; + if (fuid_dirtied) + zfs_fuid_txhold(zfsvfs, tx); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + zfs_acl_ids_free(&acl_ids); + dmu_tx_abort(tx); + return (error); + } + zfs_mknode(zp, vap, tx, cr, IS_XATTR, &xzp, &acl_ids); + + if (fuid_dirtied) + zfs_fuid_sync(zfsvfs, tx); + +#ifdef DEBUG + error = sa_lookup(xzp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), + &parent, sizeof (parent)); + ASSERT(error == 0 && parent == zp->z_id); +#endif + + VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xzp->z_id, + sizeof (xzp->z_id), tx)); + + if (!zp->z_unlinked) + (void) zfs_log_create(zfsvfs->z_log, tx, TX_MKXATTR, zp, + xzp, "", NULL, acl_ids.z_fuidp, vap); + + zfs_acl_ids_free(&acl_ids); + dmu_tx_commit(tx); + + *xipp = ZTOI(xzp); + + return (0); +} + +/* + * Return a znode for the extended attribute directory for zp. + * ** If the directory does not already exist, it is created ** + * + * IN: zp - znode to obtain attribute directory from + * cr - credentials of caller + * flags - flags from the VOP_LOOKUP call + * + * OUT: xipp - pointer to extended attribute znode + * + * RETURN: 0 on success + * error number on failure + */ +int +zfs_get_xattrdir(znode_t *zp, struct inode **xipp, cred_t *cr, int flags) +{ + zfsvfs_t *zfsvfs = ZTOZSB(zp); + znode_t *xzp; + zfs_dirlock_t *dl; + vattr_t va; + int error; +top: + error = zfs_dirent_lock(&dl, zp, "", &xzp, ZXATTR, NULL, NULL); + if (error) + return (error); + + if (xzp != NULL) { + *xipp = ZTOI(xzp); + zfs_dirent_unlock(dl); + return (0); + } + + if (!(flags & CREATE_XATTR_DIR)) { + zfs_dirent_unlock(dl); + return (SET_ERROR(ENOENT)); + } + + if (zfs_is_readonly(zfsvfs)) { + zfs_dirent_unlock(dl); + return (SET_ERROR(EROFS)); + } + + /* + * The ability to 'create' files in an attribute + * directory comes from the write_xattr permission on the base file. + * + * The ability to 'search' an attribute directory requires + * read_xattr permission on the base file. + * + * Once in a directory the ability to read/write attributes + * is controlled by the permissions on the attribute file. 
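+ *
+ * That is why the attribute directory created below is mode 0777 with
+ * the sticky bit set: the directory itself imposes no restrictions, and
+ * access is governed by write_xattr/read_xattr on the base file plus the
+ * permissions on the individual attribute files (explanatory note only).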
+ */ + va.va_mask = ATTR_MODE | ATTR_UID | ATTR_GID; + va.va_mode = S_IFDIR | S_ISVTX | 0777; + zfs_fuid_map_ids(zp, cr, &va.va_uid, &va.va_gid); + + va.va_dentry = NULL; + error = zfs_make_xattrdir(zp, &va, xipp, cr); + zfs_dirent_unlock(dl); + + if (error == ERESTART) { + /* NB: we already did dmu_tx_wait() if necessary */ + goto top; + } + + return (error); +} + +/* + * Decide whether it is okay to remove within a sticky directory. + * + * In sticky directories, write access is not sufficient; + * you can remove entries from a directory only if: + * + * you own the directory, + * you own the entry, + * you have write access to the entry, + * or you are privileged (checked in secpolicy...). + * + * The function returns 0 if remove access is granted. + */ +int +zfs_sticky_remove_access(znode_t *zdp, znode_t *zp, cred_t *cr) +{ + uid_t uid; + uid_t downer; + uid_t fowner; + zfsvfs_t *zfsvfs = ZTOZSB(zdp); + + if (zfsvfs->z_replay) + return (0); + + if ((zdp->z_mode & S_ISVTX) == 0) + return (0); + + downer = zfs_fuid_map_id(zfsvfs, KUID_TO_SUID(ZTOI(zdp)->i_uid), + cr, ZFS_OWNER); + fowner = zfs_fuid_map_id(zfsvfs, KUID_TO_SUID(ZTOI(zp)->i_uid), + cr, ZFS_OWNER); + + if ((uid = crgetuid(cr)) == downer || uid == fowner || + zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr) == 0) + return (0); + else + return (secpolicy_vnode_remove(cr)); +} diff --git a/module/os/linux/zfs/zfs_sysfs.c b/module/os/linux/zfs/zfs_sysfs.c new file mode 100644 index 000000000..bb7f3b69a --- /dev/null +++ b/module/os/linux/zfs/zfs_sysfs.c @@ -0,0 +1,661 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2018, 2019 by Delphix. All rights reserved. + */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/zfeature.h> +#include <sys/zfs_ioctl.h> +#include <sys/zfs_sysfs.h> +#include <sys/kmem.h> +#include <sys/fs/zfs.h> +#include <linux/kobject.h> + +#include "zfs_prop.h" + +#if !defined(_KERNEL) +#error kernel builds only +#endif + +/* + * ZFS Module sysfs support + * + * This extends our sysfs '/sys/module/zfs' entry to include feature + * and property attributes. The primary consumer of this information + * is user processes, like the zfs CLI, that need to know what the + * current loaded ZFS module supports. The libzfs binary will consult + * this information when instantiating the zfs|zpool property tables + * and the pool features table. 
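+ *
+ * For example, a user space consumer could probe kernel feature support
+ * like this (hypothetical shell session, output illustrative only):
+ *
+ *   $ cat /sys/module/zfs/features.kernel/org.zfsonlinux:vdev_trim/supported
+ *   yes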
+ * + * The added top-level directories are: + * /sys/module/zfs + * ├── features.kernel + * ├── features.pool + * ├── properties.dataset + * └── properties.pool + * + * The local interface for the zfs kobjects includes: + * zfs_kobj_init() + * zfs_kobj_add() + * zfs_kobj_release() + * zfs_kobj_add_attr() + * zfs_kobj_fini() + */ + +/* + * A zfs_mod_kobj_t represents a zfs kobject under '/sys/module/zfs' + */ +struct zfs_mod_kobj; +typedef struct zfs_mod_kobj zfs_mod_kobj_t; + +struct zfs_mod_kobj { + struct kobject zko_kobj; + struct kobj_type zko_kobj_type; + struct sysfs_ops zko_sysfs_ops; + size_t zko_attr_count; + struct attribute *zko_attr_list; /* allocated */ + struct attribute **zko_default_attrs; /* allocated */ + size_t zko_child_count; + zfs_mod_kobj_t *zko_children; /* allocated */ +}; + +#define ATTR_TABLE_SIZE(cnt) (sizeof (struct attribute) * (cnt)) +/* Note +1 for NULL terminator slot */ +#define DEFAULT_ATTR_SIZE(cnt) (sizeof (struct attribute *) * (cnt + 1)) +#define CHILD_TABLE_SIZE(cnt) (sizeof (zfs_mod_kobj_t) * (cnt)) + +/* + * These are the top-level kobjects under '/sys/module/zfs/' + */ +static zfs_mod_kobj_t kernel_features_kobj; +static zfs_mod_kobj_t pool_features_kobj; +static zfs_mod_kobj_t dataset_props_kobj; +static zfs_mod_kobj_t pool_props_kobj; + +/* + * The show function is used to provide the content + * of an attribute into a PAGE_SIZE buffer. + */ +typedef ssize_t (*sysfs_show_func)(struct kobject *, struct attribute *, + char *); + +static void +zfs_kobj_fini(zfs_mod_kobj_t *zkobj) +{ + /* finalize any child kobjects */ + if (zkobj->zko_child_count != 0) { + ASSERT(zkobj->zko_children); + for (int i = 0; i < zkobj->zko_child_count; i++) + zfs_kobj_fini(&zkobj->zko_children[i]); + } + + /* kobject_put() will call zfs_kobj_release() to release memory */ + kobject_del(&zkobj->zko_kobj); + kobject_put(&zkobj->zko_kobj); +} + +static void +zfs_kobj_release(struct kobject *kobj) +{ + zfs_mod_kobj_t *zkobj = container_of(kobj, zfs_mod_kobj_t, zko_kobj); + + if (zkobj->zko_attr_list != NULL) { + ASSERT3S(zkobj->zko_attr_count, !=, 0); + kmem_free(zkobj->zko_attr_list, + ATTR_TABLE_SIZE(zkobj->zko_attr_count)); + zkobj->zko_attr_list = NULL; + } + + if (zkobj->zko_default_attrs != NULL) { + kmem_free(zkobj->zko_default_attrs, + DEFAULT_ATTR_SIZE(zkobj->zko_attr_count)); + zkobj->zko_default_attrs = NULL; + } + + if (zkobj->zko_child_count != 0) { + ASSERT(zkobj->zko_children); + + kmem_free(zkobj->zko_children, + CHILD_TABLE_SIZE(zkobj->zko_child_count)); + zkobj->zko_child_count = 0; + zkobj->zko_children = NULL; + } + + zkobj->zko_attr_count = 0; +} + +#ifndef sysfs_attr_init +#define sysfs_attr_init(attr) do {} while (0) +#endif + +static void +zfs_kobj_add_attr(zfs_mod_kobj_t *zkobj, int attr_num, const char *attr_name) +{ + VERIFY3U(attr_num, <, zkobj->zko_attr_count); + ASSERT(zkobj->zko_attr_list); + ASSERT(zkobj->zko_default_attrs); + + zkobj->zko_attr_list[attr_num].name = attr_name; + zkobj->zko_attr_list[attr_num].mode = 0444; + zkobj->zko_default_attrs[attr_num] = &zkobj->zko_attr_list[attr_num]; + sysfs_attr_init(&zkobj->zko_attr_list[attr_num]); +} + +static int +zfs_kobj_init(zfs_mod_kobj_t *zkobj, int attr_cnt, int child_cnt, + sysfs_show_func show_func) +{ + /* + * Initialize object's attributes. Count can be zero. 
+ */ + if (attr_cnt > 0) { + zkobj->zko_attr_list = kmem_zalloc(ATTR_TABLE_SIZE(attr_cnt), + KM_SLEEP); + if (zkobj->zko_attr_list == NULL) + return (ENOMEM); + } + /* this will always have at least one slot for NULL termination */ + zkobj->zko_default_attrs = kmem_zalloc(DEFAULT_ATTR_SIZE(attr_cnt), + KM_SLEEP); + if (zkobj->zko_default_attrs == NULL) { + if (zkobj->zko_attr_list != NULL) { + kmem_free(zkobj->zko_attr_list, + ATTR_TABLE_SIZE(attr_cnt)); + } + return (ENOMEM); + } + zkobj->zko_attr_count = attr_cnt; + zkobj->zko_kobj_type.default_attrs = zkobj->zko_default_attrs; + + if (child_cnt > 0) { + zkobj->zko_children = kmem_zalloc(CHILD_TABLE_SIZE(child_cnt), + KM_SLEEP); + if (zkobj->zko_children == NULL) { + if (zkobj->zko_default_attrs != NULL) { + kmem_free(zkobj->zko_default_attrs, + DEFAULT_ATTR_SIZE(attr_cnt)); + } + if (zkobj->zko_attr_list != NULL) { + kmem_free(zkobj->zko_attr_list, + ATTR_TABLE_SIZE(attr_cnt)); + } + return (ENOMEM); + } + zkobj->zko_child_count = child_cnt; + } + + zkobj->zko_sysfs_ops.show = show_func; + zkobj->zko_kobj_type.sysfs_ops = &zkobj->zko_sysfs_ops; + zkobj->zko_kobj_type.release = zfs_kobj_release; + + return (0); +} + +static int +zfs_kobj_add(zfs_mod_kobj_t *zkobj, struct kobject *parent, const char *name) +{ + /* zko_default_attrs must be NULL terminated */ + ASSERT(zkobj->zko_default_attrs != NULL); + ASSERT(zkobj->zko_default_attrs[zkobj->zko_attr_count] == NULL); + + kobject_init(&zkobj->zko_kobj, &zkobj->zko_kobj_type); + return (kobject_add(&zkobj->zko_kobj, parent, name)); +} + +/* + * Each zfs property has these common attributes + */ +static const char *zprop_attrs[] = { + "type", + "readonly", + "setonce", + "visible", + "values", + "default", + "datasets" /* zfs properties only */ +}; + +#define ZFS_PROP_ATTR_COUNT ARRAY_SIZE(zprop_attrs) +#define ZPOOL_PROP_ATTR_COUNT (ZFS_PROP_ATTR_COUNT - 1) + +static const char *zprop_types[] = { + "number", + "string", + "index", +}; + +typedef struct zfs_type_map { + zfs_type_t ztm_type; + const char *ztm_name; +} zfs_type_map_t; + +static zfs_type_map_t type_map[] = { + {ZFS_TYPE_FILESYSTEM, "filesystem"}, + {ZFS_TYPE_SNAPSHOT, "snapshot"}, + {ZFS_TYPE_VOLUME, "volume"}, + {ZFS_TYPE_BOOKMARK, "bookmark"} +}; + +/* + * Show the content for a zfs property attribute + */ +static ssize_t +zprop_sysfs_show(const char *attr_name, const zprop_desc_t *property, + char *buf, size_t buflen) +{ + const char *show_str; + char number[32]; + + /* For dataset properties list the dataset types that apply */ + if (strcmp(attr_name, "datasets") == 0 && + property->pd_types != ZFS_TYPE_POOL) { + int len = 0; + + for (int i = 0; i < ARRAY_SIZE(type_map); i++) { + if (type_map[i].ztm_type & property->pd_types) { + len += snprintf(buf + len, buflen - len, "%s ", + type_map[i].ztm_name); + } + } + len += snprintf(buf + len, buflen - len, "\n"); + return (len); + } + + if (strcmp(attr_name, "type") == 0) { + show_str = zprop_types[property->pd_proptype]; + } else if (strcmp(attr_name, "readonly") == 0) { + show_str = property->pd_attr == PROP_READONLY ? "1" : "0"; + } else if (strcmp(attr_name, "setonce") == 0) { + show_str = property->pd_attr == PROP_ONETIME ? "1" : "0"; + } else if (strcmp(attr_name, "visible") == 0) { + show_str = property->pd_visible ? "1" : "0"; + } else if (strcmp(attr_name, "values") == 0) { + show_str = property->pd_values ? 
property->pd_values : ""; + } else if (strcmp(attr_name, "default") == 0) { + switch (property->pd_proptype) { + case PROP_TYPE_NUMBER: + (void) snprintf(number, sizeof (number), "%llu", + (u_longlong_t)property->pd_numdefault); + show_str = number; + break; + case PROP_TYPE_STRING: + show_str = property->pd_strdefault ? + property->pd_strdefault : ""; + break; + case PROP_TYPE_INDEX: + if (zprop_index_to_string(property->pd_propnum, + property->pd_numdefault, &show_str, + property->pd_types) != 0) { + show_str = ""; + } + break; + default: + return (0); + } + } else { + return (0); + } + + return (snprintf(buf, buflen, "%s\n", show_str)); +} + +static ssize_t +dataset_property_show(struct kobject *kobj, struct attribute *attr, char *buf) +{ + zfs_prop_t prop = zfs_name_to_prop(kobject_name(kobj)); + zprop_desc_t *prop_tbl = zfs_prop_get_table(); + ssize_t len; + + ASSERT3U(prop, <, ZFS_NUM_PROPS); + + len = zprop_sysfs_show(attr->name, &prop_tbl[prop], buf, PAGE_SIZE); + + return (len); +} + +static ssize_t +pool_property_show(struct kobject *kobj, struct attribute *attr, char *buf) +{ + zpool_prop_t prop = zpool_name_to_prop(kobject_name(kobj)); + zprop_desc_t *prop_tbl = zpool_prop_get_table(); + ssize_t len; + + ASSERT3U(prop, <, ZPOOL_NUM_PROPS); + + len = zprop_sysfs_show(attr->name, &prop_tbl[prop], buf, PAGE_SIZE); + + return (len); +} + +/* + * ZFS kernel feature attributes for '/sys/module/zfs/features.kernel' + * + * This list is intended for kernel features that don't have a pool feature + * association or that extend existing user kernel interfaces. + * + * A user processes can easily check if the running zfs kernel module + * supports the new feature. + */ +static const char *zfs_kernel_features[] = { + /* --> Add new kernel features here */ + "com.delphix:vdev_initialize", + "org.zfsonlinux:vdev_trim", +}; + +#define KERNEL_FEATURE_COUNT ARRAY_SIZE(zfs_kernel_features) + +static ssize_t +kernel_feature_show(struct kobject *kobj, struct attribute *attr, char *buf) +{ + if (strcmp(attr->name, "supported") == 0) + return (snprintf(buf, PAGE_SIZE, "yes\n")); + return (0); +} + +static void +kernel_feature_to_kobj(zfs_mod_kobj_t *parent, int slot, const char *name) +{ + zfs_mod_kobj_t *zfs_kobj = &parent->zko_children[slot]; + + ASSERT3U(slot, <, KERNEL_FEATURE_COUNT); + ASSERT(name); + + int err = zfs_kobj_init(zfs_kobj, 1, 0, kernel_feature_show); + if (err) + return; + + zfs_kobj_add_attr(zfs_kobj, 0, "supported"); + + err = zfs_kobj_add(zfs_kobj, &parent->zko_kobj, name); + if (err) + zfs_kobj_release(&zfs_kobj->zko_kobj); +} + +static int +zfs_kernel_features_init(zfs_mod_kobj_t *zfs_kobj, struct kobject *parent) +{ + /* + * Create a parent kobject to host kernel features. + * + * '/sys/module/zfs/features.kernel' + */ + int err = zfs_kobj_init(zfs_kobj, 0, KERNEL_FEATURE_COUNT, + kernel_feature_show); + if (err) + return (err); + err = zfs_kobj_add(zfs_kobj, parent, ZFS_SYSFS_KERNEL_FEATURES); + if (err) { + zfs_kobj_release(&zfs_kobj->zko_kobj); + return (err); + } + + /* + * Now create a kobject for each feature. 
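+	 * Each one exposes a single read-only 'supported' attribute that
+	 * always reads "yes", so user space only needs to check that the
+	 * feature's directory exists.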
+ * + * '/sys/module/zfs/features.kernel/<feature>' + */ + for (int f = 0; f < KERNEL_FEATURE_COUNT; f++) + kernel_feature_to_kobj(zfs_kobj, f, zfs_kernel_features[f]); + + return (0); +} + +/* + * Each pool feature has these common attributes + */ +static const char *pool_feature_attrs[] = { + "description", + "guid", + "uname", + "readonly_compatible", + "required_for_mos", + "activate_on_enable", + "per_dataset" +}; + +#define ZPOOL_FEATURE_ATTR_COUNT ARRAY_SIZE(pool_feature_attrs) + +/* + * Show the content for the given zfs pool feature attribute + */ +static ssize_t +pool_feature_show(struct kobject *kobj, struct attribute *attr, char *buf) +{ + spa_feature_t fid; + + if (zfeature_lookup_guid(kobject_name(kobj), &fid) != 0) + return (0); + + ASSERT3U(fid, <, SPA_FEATURES); + + zfeature_flags_t flags = spa_feature_table[fid].fi_flags; + const char *show_str = NULL; + + if (strcmp(attr->name, "description") == 0) { + show_str = spa_feature_table[fid].fi_desc; + } else if (strcmp(attr->name, "guid") == 0) { + show_str = spa_feature_table[fid].fi_guid; + } else if (strcmp(attr->name, "uname") == 0) { + show_str = spa_feature_table[fid].fi_uname; + } else if (strcmp(attr->name, "readonly_compatible") == 0) { + show_str = flags & ZFEATURE_FLAG_READONLY_COMPAT ? "1" : "0"; + } else if (strcmp(attr->name, "required_for_mos") == 0) { + show_str = flags & ZFEATURE_FLAG_MOS ? "1" : "0"; + } else if (strcmp(attr->name, "activate_on_enable") == 0) { + show_str = flags & ZFEATURE_FLAG_ACTIVATE_ON_ENABLE ? "1" : "0"; + } else if (strcmp(attr->name, "per_dataset") == 0) { + show_str = flags & ZFEATURE_FLAG_PER_DATASET ? "1" : "0"; + } + if (show_str == NULL) + return (0); + + return (snprintf(buf, PAGE_SIZE, "%s\n", show_str)); +} + +static void +pool_feature_to_kobj(zfs_mod_kobj_t *parent, spa_feature_t fid, + const char *name) +{ + zfs_mod_kobj_t *zfs_kobj = &parent->zko_children[fid]; + + ASSERT3U(fid, <, SPA_FEATURES); + ASSERT(name); + + int err = zfs_kobj_init(zfs_kobj, ZPOOL_FEATURE_ATTR_COUNT, 0, + pool_feature_show); + if (err) + return; + + for (int i = 0; i < ZPOOL_FEATURE_ATTR_COUNT; i++) + zfs_kobj_add_attr(zfs_kobj, i, pool_feature_attrs[i]); + + err = zfs_kobj_add(zfs_kobj, &parent->zko_kobj, name); + if (err) + zfs_kobj_release(&zfs_kobj->zko_kobj); +} + +static int +zfs_pool_features_init(zfs_mod_kobj_t *zfs_kobj, struct kobject *parent) +{ + /* + * Create a parent kobject to host pool features. + * + * '/sys/module/zfs/features.pool' + */ + int err = zfs_kobj_init(zfs_kobj, 0, SPA_FEATURES, pool_feature_show); + if (err) + return (err); + err = zfs_kobj_add(zfs_kobj, parent, ZFS_SYSFS_POOL_FEATURES); + if (err) { + zfs_kobj_release(&zfs_kobj->zko_kobj); + return (err); + } + + /* + * Now create a kobject for each feature. 
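+	 * Each directory is named by the feature's GUID and carries the
+	 * attributes listed in pool_feature_attrs[].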
+ * + * '/sys/module/zfs/features.pool/<feature>' + */ + for (spa_feature_t i = 0; i < SPA_FEATURES; i++) + pool_feature_to_kobj(zfs_kobj, i, spa_feature_table[i].fi_guid); + + return (0); +} + +typedef struct prop_to_kobj_arg { + zprop_desc_t *p2k_table; + zfs_mod_kobj_t *p2k_parent; + sysfs_show_func p2k_show_func; + int p2k_attr_count; +} prop_to_kobj_arg_t; + +static int +zprop_to_kobj(int prop, void *args) +{ + prop_to_kobj_arg_t *data = args; + zfs_mod_kobj_t *parent = data->p2k_parent; + zfs_mod_kobj_t *zfs_kobj = &parent->zko_children[prop]; + const char *name = data->p2k_table[prop].pd_name; + int err; + + ASSERT(name); + + err = zfs_kobj_init(zfs_kobj, data->p2k_attr_count, 0, + data->p2k_show_func); + if (err) + return (ZPROP_CONT); + + for (int i = 0; i < data->p2k_attr_count; i++) + zfs_kobj_add_attr(zfs_kobj, i, zprop_attrs[i]); + + err = zfs_kobj_add(zfs_kobj, &parent->zko_kobj, name); + if (err) + zfs_kobj_release(&zfs_kobj->zko_kobj); + + return (ZPROP_CONT); +} + +static int +zfs_sysfs_properties_init(zfs_mod_kobj_t *zfs_kobj, struct kobject *parent, + zfs_type_t type) +{ + prop_to_kobj_arg_t context; + const char *name; + int err; + + /* + * Create a parent kobject to host properties. + * + * '/sys/module/zfs/properties.<type>' + */ + if (type == ZFS_TYPE_POOL) { + name = ZFS_SYSFS_POOL_PROPERTIES; + context.p2k_table = zpool_prop_get_table(); + context.p2k_attr_count = ZPOOL_PROP_ATTR_COUNT; + context.p2k_parent = zfs_kobj; + context.p2k_show_func = pool_property_show; + err = zfs_kobj_init(zfs_kobj, 0, ZPOOL_NUM_PROPS, + pool_property_show); + } else { + name = ZFS_SYSFS_DATASET_PROPERTIES; + context.p2k_table = zfs_prop_get_table(); + context.p2k_attr_count = ZFS_PROP_ATTR_COUNT; + context.p2k_parent = zfs_kobj; + context.p2k_show_func = dataset_property_show; + err = zfs_kobj_init(zfs_kobj, 0, ZFS_NUM_PROPS, + dataset_property_show); + } + + if (err) + return (err); + + err = zfs_kobj_add(zfs_kobj, parent, name); + if (err) { + zfs_kobj_release(&zfs_kobj->zko_kobj); + return (err); + } + + /* + * Create a kobject for each property. 
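+	 * zprop_iter_common() invokes zprop_to_kobj() once per property;
+	 * failures for individual properties are skipped since the
+	 * callback always returns ZPROP_CONT.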
+ * + * '/sys/module/zfs/properties.<type>/<property>' + */ + (void) zprop_iter_common(zprop_to_kobj, &context, B_TRUE, + B_FALSE, type); + + return (err); +} + +void +zfs_sysfs_init(void) +{ + struct kobject *parent; +#if defined(CONFIG_ZFS) && !defined(CONFIG_ZFS_MODULE) + parent = kobject_create_and_add("zfs", fs_kobj); +#else + parent = &(((struct module *)(THIS_MODULE))->mkobj).kobj; +#endif + int err; + + if (parent == NULL) + return; + + err = zfs_kernel_features_init(&kernel_features_kobj, parent); + if (err) + return; + + err = zfs_pool_features_init(&pool_features_kobj, parent); + if (err) { + zfs_kobj_fini(&kernel_features_kobj); + return; + } + + err = zfs_sysfs_properties_init(&pool_props_kobj, parent, + ZFS_TYPE_POOL); + if (err) { + zfs_kobj_fini(&kernel_features_kobj); + zfs_kobj_fini(&pool_features_kobj); + return; + } + + err = zfs_sysfs_properties_init(&dataset_props_kobj, parent, + ZFS_TYPE_FILESYSTEM); + if (err) { + zfs_kobj_fini(&kernel_features_kobj); + zfs_kobj_fini(&pool_features_kobj); + zfs_kobj_fini(&pool_props_kobj); + return; + } +} + +void +zfs_sysfs_fini(void) +{ + /* + * Remove top-level kobjects; each will remove any children kobjects + */ + zfs_kobj_fini(&kernel_features_kobj); + zfs_kobj_fini(&pool_features_kobj); + zfs_kobj_fini(&dataset_props_kobj); + zfs_kobj_fini(&pool_props_kobj); +} diff --git a/module/os/linux/zfs/zfs_vfsops.c b/module/os/linux/zfs/zfs_vfsops.c new file mode 100644 index 000000000..0914e4b7d --- /dev/null +++ b/module/os/linux/zfs/zfs_vfsops.c @@ -0,0 +1,2562 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. 
+ */ + +/* Portions Copyright 2010 Robert Milkowski */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/sysmacros.h> +#include <sys/kmem.h> +#include <sys/pathname.h> +#include <sys/vnode.h> +#include <sys/vfs.h> +#include <sys/mntent.h> +#include <sys/cmn_err.h> +#include <sys/zfs_znode.h> +#include <sys/zfs_vnops.h> +#include <sys/zfs_dir.h> +#include <sys/zil.h> +#include <sys/fs/zfs.h> +#include <sys/dmu.h> +#include <sys/dsl_prop.h> +#include <sys/dsl_dataset.h> +#include <sys/dsl_deleg.h> +#include <sys/spa.h> +#include <sys/zap.h> +#include <sys/sa.h> +#include <sys/sa_impl.h> +#include <sys/policy.h> +#include <sys/atomic.h> +#include <sys/zfs_ioctl.h> +#include <sys/zfs_ctldir.h> +#include <sys/zfs_fuid.h> +#include <sys/sunddi.h> +#include <sys/dmu_objset.h> +#include <sys/spa_boot.h> +#include <sys/objlist.h> +#include <sys/zpl.h> +#include <linux/vfs_compat.h> +#include "zfs_comutil.h" + +enum { + TOKEN_RO, + TOKEN_RW, + TOKEN_SETUID, + TOKEN_NOSETUID, + TOKEN_EXEC, + TOKEN_NOEXEC, + TOKEN_DEVICES, + TOKEN_NODEVICES, + TOKEN_DIRXATTR, + TOKEN_SAXATTR, + TOKEN_XATTR, + TOKEN_NOXATTR, + TOKEN_ATIME, + TOKEN_NOATIME, + TOKEN_RELATIME, + TOKEN_NORELATIME, + TOKEN_NBMAND, + TOKEN_NONBMAND, + TOKEN_MNTPOINT, + TOKEN_LAST, +}; + +static const match_table_t zpl_tokens = { + { TOKEN_RO, MNTOPT_RO }, + { TOKEN_RW, MNTOPT_RW }, + { TOKEN_SETUID, MNTOPT_SETUID }, + { TOKEN_NOSETUID, MNTOPT_NOSETUID }, + { TOKEN_EXEC, MNTOPT_EXEC }, + { TOKEN_NOEXEC, MNTOPT_NOEXEC }, + { TOKEN_DEVICES, MNTOPT_DEVICES }, + { TOKEN_NODEVICES, MNTOPT_NODEVICES }, + { TOKEN_DIRXATTR, MNTOPT_DIRXATTR }, + { TOKEN_SAXATTR, MNTOPT_SAXATTR }, + { TOKEN_XATTR, MNTOPT_XATTR }, + { TOKEN_NOXATTR, MNTOPT_NOXATTR }, + { TOKEN_ATIME, MNTOPT_ATIME }, + { TOKEN_NOATIME, MNTOPT_NOATIME }, + { TOKEN_RELATIME, MNTOPT_RELATIME }, + { TOKEN_NORELATIME, MNTOPT_NORELATIME }, + { TOKEN_NBMAND, MNTOPT_NBMAND }, + { TOKEN_NONBMAND, MNTOPT_NONBMAND }, + { TOKEN_MNTPOINT, MNTOPT_MNTPOINT "=%s" }, + { TOKEN_LAST, NULL }, +}; + +static void +zfsvfs_vfs_free(vfs_t *vfsp) +{ + if (vfsp != NULL) { + if (vfsp->vfs_mntpoint != NULL) + strfree(vfsp->vfs_mntpoint); + + kmem_free(vfsp, sizeof (vfs_t)); + } +} + +static int +zfsvfs_parse_option(char *option, int token, substring_t *args, vfs_t *vfsp) +{ + switch (token) { + case TOKEN_RO: + vfsp->vfs_readonly = B_TRUE; + vfsp->vfs_do_readonly = B_TRUE; + break; + case TOKEN_RW: + vfsp->vfs_readonly = B_FALSE; + vfsp->vfs_do_readonly = B_TRUE; + break; + case TOKEN_SETUID: + vfsp->vfs_setuid = B_TRUE; + vfsp->vfs_do_setuid = B_TRUE; + break; + case TOKEN_NOSETUID: + vfsp->vfs_setuid = B_FALSE; + vfsp->vfs_do_setuid = B_TRUE; + break; + case TOKEN_EXEC: + vfsp->vfs_exec = B_TRUE; + vfsp->vfs_do_exec = B_TRUE; + break; + case TOKEN_NOEXEC: + vfsp->vfs_exec = B_FALSE; + vfsp->vfs_do_exec = B_TRUE; + break; + case TOKEN_DEVICES: + vfsp->vfs_devices = B_TRUE; + vfsp->vfs_do_devices = B_TRUE; + break; + case TOKEN_NODEVICES: + vfsp->vfs_devices = B_FALSE; + vfsp->vfs_do_devices = B_TRUE; + break; + case TOKEN_DIRXATTR: + vfsp->vfs_xattr = ZFS_XATTR_DIR; + vfsp->vfs_do_xattr = B_TRUE; + break; + case TOKEN_SAXATTR: + vfsp->vfs_xattr = ZFS_XATTR_SA; + vfsp->vfs_do_xattr = B_TRUE; + break; + case TOKEN_XATTR: + vfsp->vfs_xattr = ZFS_XATTR_DIR; + vfsp->vfs_do_xattr = B_TRUE; + break; + case TOKEN_NOXATTR: + vfsp->vfs_xattr = ZFS_XATTR_OFF; + vfsp->vfs_do_xattr = B_TRUE; + break; + case TOKEN_ATIME: + vfsp->vfs_atime = B_TRUE; + vfsp->vfs_do_atime = B_TRUE; + break; + case TOKEN_NOATIME: + 
vfsp->vfs_atime = B_FALSE; + vfsp->vfs_do_atime = B_TRUE; + break; + case TOKEN_RELATIME: + vfsp->vfs_relatime = B_TRUE; + vfsp->vfs_do_relatime = B_TRUE; + break; + case TOKEN_NORELATIME: + vfsp->vfs_relatime = B_FALSE; + vfsp->vfs_do_relatime = B_TRUE; + break; + case TOKEN_NBMAND: + vfsp->vfs_nbmand = B_TRUE; + vfsp->vfs_do_nbmand = B_TRUE; + break; + case TOKEN_NONBMAND: + vfsp->vfs_nbmand = B_FALSE; + vfsp->vfs_do_nbmand = B_TRUE; + break; + case TOKEN_MNTPOINT: + vfsp->vfs_mntpoint = match_strdup(&args[0]); + if (vfsp->vfs_mntpoint == NULL) + return (SET_ERROR(ENOMEM)); + + break; + default: + break; + } + + return (0); +} + +/* + * Parse the raw mntopts and return a vfs_t describing the options. + */ +static int +zfsvfs_parse_options(char *mntopts, vfs_t **vfsp) +{ + vfs_t *tmp_vfsp; + int error; + + tmp_vfsp = kmem_zalloc(sizeof (vfs_t), KM_SLEEP); + + if (mntopts != NULL) { + substring_t args[MAX_OPT_ARGS]; + char *tmp_mntopts, *p, *t; + int token; + + tmp_mntopts = t = strdup(mntopts); + if (tmp_mntopts == NULL) + return (SET_ERROR(ENOMEM)); + + while ((p = strsep(&t, ",")) != NULL) { + if (!*p) + continue; + + args[0].to = args[0].from = NULL; + token = match_token(p, zpl_tokens, args); + error = zfsvfs_parse_option(p, token, args, tmp_vfsp); + if (error) { + strfree(tmp_mntopts); + zfsvfs_vfs_free(tmp_vfsp); + return (error); + } + } + + strfree(tmp_mntopts); + } + + *vfsp = tmp_vfsp; + + return (0); +} + +boolean_t +zfs_is_readonly(zfsvfs_t *zfsvfs) +{ + return (!!(zfsvfs->z_sb->s_flags & SB_RDONLY)); +} + +/*ARGSUSED*/ +int +zfs_sync(struct super_block *sb, int wait, cred_t *cr) +{ + zfsvfs_t *zfsvfs = sb->s_fs_info; + + /* + * Semantically, the only requirement is that the sync be initiated. + * The DMU syncs out txgs frequently, so there's nothing to do. + */ + if (!wait) + return (0); + + if (zfsvfs != NULL) { + /* + * Sync a specific filesystem. + */ + dsl_pool_t *dp; + + ZFS_ENTER(zfsvfs); + dp = dmu_objset_pool(zfsvfs->z_os); + + /* + * If the system is shutting down, then skip any + * filesystems which may exist on a suspended pool. + */ + if (spa_suspended(dp->dp_spa)) { + ZFS_EXIT(zfsvfs); + return (0); + } + + if (zfsvfs->z_log != NULL) + zil_commit(zfsvfs->z_log, 0); + + ZFS_EXIT(zfsvfs); + } else { + /* + * Sync all ZFS filesystems. This is what happens when you + * run sync(1M). Unlike other filesystems, ZFS honors the + * request by waiting for all pools to commit all dirty data. + */ + spa_sync_allpools(); + } + + return (0); +} + +static void +atime_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + struct super_block *sb = zfsvfs->z_sb; + + if (sb == NULL) + return; + /* + * Update SB_NOATIME bit in VFS super block. Since atime update is + * determined by atime_needs_update(), atime_needs_update() needs to + * return false if atime is turned off, and not unconditionally return + * false if atime is turned on. 
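+	 * All we do here is toggle SB_NOATIME on the super block; the
+	 * generic VFS code consults that flag when deciding whether an
+	 * atime update is needed.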
+ */ + if (newval) + sb->s_flags &= ~SB_NOATIME; + else + sb->s_flags |= SB_NOATIME; +} + +static void +relatime_changed_cb(void *arg, uint64_t newval) +{ + ((zfsvfs_t *)arg)->z_relatime = newval; +} + +static void +xattr_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + + if (newval == ZFS_XATTR_OFF) { + zfsvfs->z_flags &= ~ZSB_XATTR; + } else { + zfsvfs->z_flags |= ZSB_XATTR; + + if (newval == ZFS_XATTR_SA) + zfsvfs->z_xattr_sa = B_TRUE; + else + zfsvfs->z_xattr_sa = B_FALSE; + } +} + +static void +acltype_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + + switch (newval) { + case ZFS_ACLTYPE_OFF: + zfsvfs->z_acl_type = ZFS_ACLTYPE_OFF; + zfsvfs->z_sb->s_flags &= ~SB_POSIXACL; + break; + case ZFS_ACLTYPE_POSIXACL: +#ifdef CONFIG_FS_POSIX_ACL + zfsvfs->z_acl_type = ZFS_ACLTYPE_POSIXACL; + zfsvfs->z_sb->s_flags |= SB_POSIXACL; +#else + zfsvfs->z_acl_type = ZFS_ACLTYPE_OFF; + zfsvfs->z_sb->s_flags &= ~SB_POSIXACL; +#endif /* CONFIG_FS_POSIX_ACL */ + break; + default: + break; + } +} + +static void +blksz_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + ASSERT3U(newval, <=, spa_maxblocksize(dmu_objset_spa(zfsvfs->z_os))); + ASSERT3U(newval, >=, SPA_MINBLOCKSIZE); + ASSERT(ISP2(newval)); + + zfsvfs->z_max_blksz = newval; +} + +static void +readonly_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + struct super_block *sb = zfsvfs->z_sb; + + if (sb == NULL) + return; + + if (newval) + sb->s_flags |= SB_RDONLY; + else + sb->s_flags &= ~SB_RDONLY; +} + +static void +devices_changed_cb(void *arg, uint64_t newval) +{ +} + +static void +setuid_changed_cb(void *arg, uint64_t newval) +{ +} + +static void +exec_changed_cb(void *arg, uint64_t newval) +{ +} + +static void +nbmand_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + struct super_block *sb = zfsvfs->z_sb; + + if (sb == NULL) + return; + + if (newval == TRUE) + sb->s_flags |= SB_MANDLOCK; + else + sb->s_flags &= ~SB_MANDLOCK; +} + +static void +snapdir_changed_cb(void *arg, uint64_t newval) +{ + ((zfsvfs_t *)arg)->z_show_ctldir = newval; +} + +static void +vscan_changed_cb(void *arg, uint64_t newval) +{ + ((zfsvfs_t *)arg)->z_vscan = newval; +} + +static void +acl_inherit_changed_cb(void *arg, uint64_t newval) +{ + ((zfsvfs_t *)arg)->z_acl_inherit = newval; +} + +static int +zfs_register_callbacks(vfs_t *vfsp) +{ + struct dsl_dataset *ds = NULL; + objset_t *os = NULL; + zfsvfs_t *zfsvfs = NULL; + int error = 0; + + ASSERT(vfsp); + zfsvfs = vfsp->vfs_data; + ASSERT(zfsvfs); + os = zfsvfs->z_os; + + /* + * The act of registering our callbacks will destroy any mount + * options we may have. In order to enable temporary overrides + * of mount options, we stash away the current values and + * restore them after we register the callbacks. + */ + if (zfs_is_readonly(zfsvfs) || !spa_writeable(dmu_objset_spa(os))) { + vfsp->vfs_do_readonly = B_TRUE; + vfsp->vfs_readonly = B_TRUE; + } + + /* + * Register property callbacks. + * + * It would probably be fine to just check for i/o error from + * the first prop_register(), but I guess I like to go + * overboard... + */ + ds = dmu_objset_ds(os); + dsl_pool_config_enter(dmu_objset_pool(os), FTAG); + error = dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_ATIME), atime_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_RELATIME), relatime_changed_cb, zfsvfs); + error = error ? 
error : dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_XATTR), xattr_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_RECORDSIZE), blksz_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_READONLY), readonly_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_DEVICES), devices_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_SETUID), setuid_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_EXEC), exec_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_SNAPDIR), snapdir_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_ACLTYPE), acltype_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_ACLINHERIT), acl_inherit_changed_cb, + zfsvfs); + error = error ? error : dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_VSCAN), vscan_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_NBMAND), nbmand_changed_cb, zfsvfs); + dsl_pool_config_exit(dmu_objset_pool(os), FTAG); + if (error) + goto unregister; + + /* + * Invoke our callbacks to restore temporary mount options. + */ + if (vfsp->vfs_do_readonly) + readonly_changed_cb(zfsvfs, vfsp->vfs_readonly); + if (vfsp->vfs_do_setuid) + setuid_changed_cb(zfsvfs, vfsp->vfs_setuid); + if (vfsp->vfs_do_exec) + exec_changed_cb(zfsvfs, vfsp->vfs_exec); + if (vfsp->vfs_do_devices) + devices_changed_cb(zfsvfs, vfsp->vfs_devices); + if (vfsp->vfs_do_xattr) + xattr_changed_cb(zfsvfs, vfsp->vfs_xattr); + if (vfsp->vfs_do_atime) + atime_changed_cb(zfsvfs, vfsp->vfs_atime); + if (vfsp->vfs_do_relatime) + relatime_changed_cb(zfsvfs, vfsp->vfs_relatime); + if (vfsp->vfs_do_nbmand) + nbmand_changed_cb(zfsvfs, vfsp->vfs_nbmand); + + return (0); + +unregister: + dsl_prop_unregister_all(ds, zfsvfs); + return (error); +} + +static int +zfs_space_delta_cb(dmu_object_type_t bonustype, void *data, + uint64_t *userp, uint64_t *groupp, uint64_t *projectp) +{ + sa_hdr_phys_t sa; + sa_hdr_phys_t *sap = data; + uint64_t flags; + int hdrsize; + boolean_t swap = B_FALSE; + + /* + * Is it a valid type of object to track? + */ + if (bonustype != DMU_OT_ZNODE && bonustype != DMU_OT_SA) + return (SET_ERROR(ENOENT)); + + /* + * If we have a NULL data pointer + * then assume the id's aren't changing and + * return EEXIST to the dmu to let it know to + * use the same ids + */ + if (data == NULL) + return (SET_ERROR(EEXIST)); + + if (bonustype == DMU_OT_ZNODE) { + znode_phys_t *znp = data; + *userp = znp->zp_uid; + *groupp = znp->zp_gid; + *projectp = ZFS_DEFAULT_PROJID; + return (0); + } + + if (sap->sa_magic == 0) { + /* + * This should only happen for newly created files + * that haven't had the znode data filled in yet. 
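+		 * Charge them to uid/gid 0 and the default project until
+		 * the SA header has been written.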
+ */ + *userp = 0; + *groupp = 0; + *projectp = ZFS_DEFAULT_PROJID; + return (0); + } + + sa = *sap; + if (sa.sa_magic == BSWAP_32(SA_MAGIC)) { + sa.sa_magic = SA_MAGIC; + sa.sa_layout_info = BSWAP_16(sa.sa_layout_info); + swap = B_TRUE; + } else { + VERIFY3U(sa.sa_magic, ==, SA_MAGIC); + } + + hdrsize = sa_hdrsize(&sa); + VERIFY3U(hdrsize, >=, sizeof (sa_hdr_phys_t)); + + *userp = *((uint64_t *)((uintptr_t)data + hdrsize + SA_UID_OFFSET)); + *groupp = *((uint64_t *)((uintptr_t)data + hdrsize + SA_GID_OFFSET)); + flags = *((uint64_t *)((uintptr_t)data + hdrsize + SA_FLAGS_OFFSET)); + if (swap) + flags = BSWAP_64(flags); + + if (flags & ZFS_PROJID) + *projectp = *((uint64_t *)((uintptr_t)data + hdrsize + + SA_PROJID_OFFSET)); + else + *projectp = ZFS_DEFAULT_PROJID; + + if (swap) { + *userp = BSWAP_64(*userp); + *groupp = BSWAP_64(*groupp); + *projectp = BSWAP_64(*projectp); + } + return (0); +} + +static void +fuidstr_to_sid(zfsvfs_t *zfsvfs, const char *fuidstr, + char *domainbuf, int buflen, uid_t *ridp) +{ + uint64_t fuid; + const char *domain; + + fuid = zfs_strtonum(fuidstr, NULL); + + domain = zfs_fuid_find_by_idx(zfsvfs, FUID_INDEX(fuid)); + if (domain) + (void) strlcpy(domainbuf, domain, buflen); + else + domainbuf[0] = '\0'; + *ridp = FUID_RID(fuid); +} + +static uint64_t +zfs_userquota_prop_to_obj(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type) +{ + switch (type) { + case ZFS_PROP_USERUSED: + case ZFS_PROP_USEROBJUSED: + return (DMU_USERUSED_OBJECT); + case ZFS_PROP_GROUPUSED: + case ZFS_PROP_GROUPOBJUSED: + return (DMU_GROUPUSED_OBJECT); + case ZFS_PROP_PROJECTUSED: + case ZFS_PROP_PROJECTOBJUSED: + return (DMU_PROJECTUSED_OBJECT); + case ZFS_PROP_USERQUOTA: + return (zfsvfs->z_userquota_obj); + case ZFS_PROP_GROUPQUOTA: + return (zfsvfs->z_groupquota_obj); + case ZFS_PROP_USEROBJQUOTA: + return (zfsvfs->z_userobjquota_obj); + case ZFS_PROP_GROUPOBJQUOTA: + return (zfsvfs->z_groupobjquota_obj); + case ZFS_PROP_PROJECTQUOTA: + return (zfsvfs->z_projectquota_obj); + case ZFS_PROP_PROJECTOBJQUOTA: + return (zfsvfs->z_projectobjquota_obj); + default: + return (ZFS_NO_OBJECT); + } +} + +int +zfs_userspace_many(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, + uint64_t *cookiep, void *vbuf, uint64_t *bufsizep) +{ + int error; + zap_cursor_t zc; + zap_attribute_t za; + zfs_useracct_t *buf = vbuf; + uint64_t obj; + int offset = 0; + + if (!dmu_objset_userspace_present(zfsvfs->z_os)) + return (SET_ERROR(ENOTSUP)); + + if ((type == ZFS_PROP_PROJECTQUOTA || type == ZFS_PROP_PROJECTUSED || + type == ZFS_PROP_PROJECTOBJQUOTA || + type == ZFS_PROP_PROJECTOBJUSED) && + !dmu_objset_projectquota_present(zfsvfs->z_os)) + return (SET_ERROR(ENOTSUP)); + + if ((type == ZFS_PROP_USEROBJUSED || type == ZFS_PROP_GROUPOBJUSED || + type == ZFS_PROP_USEROBJQUOTA || type == ZFS_PROP_GROUPOBJQUOTA || + type == ZFS_PROP_PROJECTOBJUSED || + type == ZFS_PROP_PROJECTOBJQUOTA) && + !dmu_objset_userobjspace_present(zfsvfs->z_os)) + return (SET_ERROR(ENOTSUP)); + + obj = zfs_userquota_prop_to_obj(zfsvfs, type); + if (obj == ZFS_NO_OBJECT) { + *bufsizep = 0; + return (0); + } + + if (type == ZFS_PROP_USEROBJUSED || type == ZFS_PROP_GROUPOBJUSED || + type == ZFS_PROP_PROJECTOBJUSED) + offset = DMU_OBJACCT_PREFIX_LEN; + + for (zap_cursor_init_serialized(&zc, zfsvfs->z_os, obj, *cookiep); + (error = zap_cursor_retrieve(&zc, &za)) == 0; + zap_cursor_advance(&zc)) { + if ((uintptr_t)buf - (uintptr_t)vbuf + sizeof (zfs_useracct_t) > + *bufsizep) + break; + + /* + * skip object quota (with zap name prefix DMU_OBJACCT_PREFIX) + 
* when dealing with block quota and vice versa. + */ + if ((offset > 0) != (strncmp(za.za_name, DMU_OBJACCT_PREFIX, + DMU_OBJACCT_PREFIX_LEN) == 0)) + continue; + + fuidstr_to_sid(zfsvfs, za.za_name + offset, + buf->zu_domain, sizeof (buf->zu_domain), &buf->zu_rid); + + buf->zu_space = za.za_first_integer; + buf++; + } + if (error == ENOENT) + error = 0; + + ASSERT3U((uintptr_t)buf - (uintptr_t)vbuf, <=, *bufsizep); + *bufsizep = (uintptr_t)buf - (uintptr_t)vbuf; + *cookiep = zap_cursor_serialize(&zc); + zap_cursor_fini(&zc); + return (error); +} + +/* + * buf must be big enough (eg, 32 bytes) + */ +static int +id_to_fuidstr(zfsvfs_t *zfsvfs, const char *domain, uid_t rid, + char *buf, boolean_t addok) +{ + uint64_t fuid; + int domainid = 0; + + if (domain && domain[0]) { + domainid = zfs_fuid_find_by_domain(zfsvfs, domain, NULL, addok); + if (domainid == -1) + return (SET_ERROR(ENOENT)); + } + fuid = FUID_ENCODE(domainid, rid); + (void) sprintf(buf, "%llx", (longlong_t)fuid); + return (0); +} + +int +zfs_userspace_one(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, + const char *domain, uint64_t rid, uint64_t *valp) +{ + char buf[20 + DMU_OBJACCT_PREFIX_LEN]; + int offset = 0; + int err; + uint64_t obj; + + *valp = 0; + + if (!dmu_objset_userspace_present(zfsvfs->z_os)) + return (SET_ERROR(ENOTSUP)); + + if ((type == ZFS_PROP_USEROBJUSED || type == ZFS_PROP_GROUPOBJUSED || + type == ZFS_PROP_USEROBJQUOTA || type == ZFS_PROP_GROUPOBJQUOTA || + type == ZFS_PROP_PROJECTOBJUSED || + type == ZFS_PROP_PROJECTOBJQUOTA) && + !dmu_objset_userobjspace_present(zfsvfs->z_os)) + return (SET_ERROR(ENOTSUP)); + + if (type == ZFS_PROP_PROJECTQUOTA || type == ZFS_PROP_PROJECTUSED || + type == ZFS_PROP_PROJECTOBJQUOTA || + type == ZFS_PROP_PROJECTOBJUSED) { + if (!dmu_objset_projectquota_present(zfsvfs->z_os)) + return (SET_ERROR(ENOTSUP)); + if (!zpl_is_valid_projid(rid)) + return (SET_ERROR(EINVAL)); + } + + obj = zfs_userquota_prop_to_obj(zfsvfs, type); + if (obj == ZFS_NO_OBJECT) + return (0); + + if (type == ZFS_PROP_USEROBJUSED || type == ZFS_PROP_GROUPOBJUSED || + type == ZFS_PROP_PROJECTOBJUSED) { + strlcpy(buf, DMU_OBJACCT_PREFIX, DMU_OBJACCT_PREFIX_LEN + 1); + offset = DMU_OBJACCT_PREFIX_LEN; + } + + err = id_to_fuidstr(zfsvfs, domain, rid, buf + offset, B_FALSE); + if (err) + return (err); + + err = zap_lookup(zfsvfs->z_os, obj, buf, 8, 1, valp); + if (err == ENOENT) + err = 0; + return (err); +} + +int +zfs_set_userquota(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, + const char *domain, uint64_t rid, uint64_t quota) +{ + char buf[32]; + int err; + dmu_tx_t *tx; + uint64_t *objp; + boolean_t fuid_dirtied; + + if (zfsvfs->z_version < ZPL_VERSION_USERSPACE) + return (SET_ERROR(ENOTSUP)); + + switch (type) { + case ZFS_PROP_USERQUOTA: + objp = &zfsvfs->z_userquota_obj; + break; + case ZFS_PROP_GROUPQUOTA: + objp = &zfsvfs->z_groupquota_obj; + break; + case ZFS_PROP_USEROBJQUOTA: + objp = &zfsvfs->z_userobjquota_obj; + break; + case ZFS_PROP_GROUPOBJQUOTA: + objp = &zfsvfs->z_groupobjquota_obj; + break; + case ZFS_PROP_PROJECTQUOTA: + if (!dmu_objset_projectquota_enabled(zfsvfs->z_os)) + return (SET_ERROR(ENOTSUP)); + if (!zpl_is_valid_projid(rid)) + return (SET_ERROR(EINVAL)); + + objp = &zfsvfs->z_projectquota_obj; + break; + case ZFS_PROP_PROJECTOBJQUOTA: + if (!dmu_objset_projectquota_enabled(zfsvfs->z_os)) + return (SET_ERROR(ENOTSUP)); + if (!zpl_is_valid_projid(rid)) + return (SET_ERROR(EINVAL)); + + objp = &zfsvfs->z_projectobjquota_obj; + break; + default: + return (SET_ERROR(EINVAL)); + } + + 
err = id_to_fuidstr(zfsvfs, domain, rid, buf, B_TRUE); + if (err) + return (err); + fuid_dirtied = zfsvfs->z_fuid_dirty; + + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_zap(tx, *objp ? *objp : DMU_NEW_OBJECT, B_TRUE, NULL); + if (*objp == 0) { + dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE, + zfs_userquota_prop_prefixes[type]); + } + if (fuid_dirtied) + zfs_fuid_txhold(zfsvfs, tx); + err = dmu_tx_assign(tx, TXG_WAIT); + if (err) { + dmu_tx_abort(tx); + return (err); + } + + mutex_enter(&zfsvfs->z_lock); + if (*objp == 0) { + *objp = zap_create(zfsvfs->z_os, DMU_OT_USERGROUP_QUOTA, + DMU_OT_NONE, 0, tx); + VERIFY(0 == zap_add(zfsvfs->z_os, MASTER_NODE_OBJ, + zfs_userquota_prop_prefixes[type], 8, 1, objp, tx)); + } + mutex_exit(&zfsvfs->z_lock); + + if (quota == 0) { + err = zap_remove(zfsvfs->z_os, *objp, buf, tx); + if (err == ENOENT) + err = 0; + } else { + err = zap_update(zfsvfs->z_os, *objp, buf, 8, 1, "a, tx); + } + ASSERT(err == 0); + if (fuid_dirtied) + zfs_fuid_sync(zfsvfs, tx); + dmu_tx_commit(tx); + return (err); +} + +boolean_t +zfs_id_overobjquota(zfsvfs_t *zfsvfs, uint64_t usedobj, uint64_t id) +{ + char buf[20 + DMU_OBJACCT_PREFIX_LEN]; + uint64_t used, quota, quotaobj; + int err; + + if (!dmu_objset_userobjspace_present(zfsvfs->z_os)) { + if (dmu_objset_userobjspace_upgradable(zfsvfs->z_os)) { + dsl_pool_config_enter( + dmu_objset_pool(zfsvfs->z_os), FTAG); + dmu_objset_id_quota_upgrade(zfsvfs->z_os); + dsl_pool_config_exit( + dmu_objset_pool(zfsvfs->z_os), FTAG); + } + return (B_FALSE); + } + + if (usedobj == DMU_PROJECTUSED_OBJECT) { + if (!dmu_objset_projectquota_present(zfsvfs->z_os)) { + if (dmu_objset_projectquota_upgradable(zfsvfs->z_os)) { + dsl_pool_config_enter( + dmu_objset_pool(zfsvfs->z_os), FTAG); + dmu_objset_id_quota_upgrade(zfsvfs->z_os); + dsl_pool_config_exit( + dmu_objset_pool(zfsvfs->z_os), FTAG); + } + return (B_FALSE); + } + quotaobj = zfsvfs->z_projectobjquota_obj; + } else if (usedobj == DMU_USERUSED_OBJECT) { + quotaobj = zfsvfs->z_userobjquota_obj; + } else if (usedobj == DMU_GROUPUSED_OBJECT) { + quotaobj = zfsvfs->z_groupobjquota_obj; + } else { + return (B_FALSE); + } + if (quotaobj == 0 || zfsvfs->z_replay) + return (B_FALSE); + + (void) sprintf(buf, "%llx", (longlong_t)id); + err = zap_lookup(zfsvfs->z_os, quotaobj, buf, 8, 1, "a); + if (err != 0) + return (B_FALSE); + + (void) sprintf(buf, DMU_OBJACCT_PREFIX "%llx", (longlong_t)id); + err = zap_lookup(zfsvfs->z_os, usedobj, buf, 8, 1, &used); + if (err != 0) + return (B_FALSE); + return (used >= quota); +} + +boolean_t +zfs_id_overblockquota(zfsvfs_t *zfsvfs, uint64_t usedobj, uint64_t id) +{ + char buf[20]; + uint64_t used, quota, quotaobj; + int err; + + if (usedobj == DMU_PROJECTUSED_OBJECT) { + if (!dmu_objset_projectquota_present(zfsvfs->z_os)) { + if (dmu_objset_projectquota_upgradable(zfsvfs->z_os)) { + dsl_pool_config_enter( + dmu_objset_pool(zfsvfs->z_os), FTAG); + dmu_objset_id_quota_upgrade(zfsvfs->z_os); + dsl_pool_config_exit( + dmu_objset_pool(zfsvfs->z_os), FTAG); + } + return (B_FALSE); + } + quotaobj = zfsvfs->z_projectquota_obj; + } else if (usedobj == DMU_USERUSED_OBJECT) { + quotaobj = zfsvfs->z_userquota_obj; + } else if (usedobj == DMU_GROUPUSED_OBJECT) { + quotaobj = zfsvfs->z_groupquota_obj; + } else { + return (B_FALSE); + } + if (quotaobj == 0 || zfsvfs->z_replay) + return (B_FALSE); + + (void) sprintf(buf, "%llx", (longlong_t)id); + err = zap_lookup(zfsvfs->z_os, quotaobj, buf, 8, 1, "a); + if (err != 0) + return (B_FALSE); + + err = zap_lookup(zfsvfs->z_os, 
usedobj, buf, 8, 1, &used); + if (err != 0) + return (B_FALSE); + return (used >= quota); +} + +boolean_t +zfs_id_overquota(zfsvfs_t *zfsvfs, uint64_t usedobj, uint64_t id) +{ + return (zfs_id_overblockquota(zfsvfs, usedobj, id) || + zfs_id_overobjquota(zfsvfs, usedobj, id)); +} + +/* + * Associate this zfsvfs with the given objset, which must be owned. + * This will cache a bunch of on-disk state from the objset in the + * zfsvfs. + */ +static int +zfsvfs_init(zfsvfs_t *zfsvfs, objset_t *os) +{ + int error; + uint64_t val; + + zfsvfs->z_max_blksz = SPA_OLD_MAXBLOCKSIZE; + zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE; + zfsvfs->z_os = os; + + error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version); + if (error != 0) + return (error); + if (zfsvfs->z_version > + zfs_zpl_version_map(spa_version(dmu_objset_spa(os)))) { + (void) printk("Can't mount a version %lld file system " + "on a version %lld pool\n. Pool must be upgraded to mount " + "this file system.\n", (u_longlong_t)zfsvfs->z_version, + (u_longlong_t)spa_version(dmu_objset_spa(os))); + return (SET_ERROR(ENOTSUP)); + } + error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &val); + if (error != 0) + return (error); + zfsvfs->z_norm = (int)val; + + error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &val); + if (error != 0) + return (error); + zfsvfs->z_utf8 = (val != 0); + + error = zfs_get_zplprop(os, ZFS_PROP_CASE, &val); + if (error != 0) + return (error); + zfsvfs->z_case = (uint_t)val; + + if ((error = zfs_get_zplprop(os, ZFS_PROP_ACLTYPE, &val)) != 0) + return (error); + zfsvfs->z_acl_type = (uint_t)val; + + /* + * Fold case on file systems that are always or sometimes case + * insensitive. + */ + if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE || + zfsvfs->z_case == ZFS_CASE_MIXED) + zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER; + + zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os); + zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os); + + uint64_t sa_obj = 0; + if (zfsvfs->z_use_sa) { + /* should either have both of these objects or none */ + error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, + &sa_obj); + if (error != 0) + return (error); + + error = zfs_get_zplprop(os, ZFS_PROP_XATTR, &val); + if ((error == 0) && (val == ZFS_XATTR_SA)) + zfsvfs->z_xattr_sa = B_TRUE; + } + + error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, + &zfsvfs->z_root); + if (error != 0) + return (error); + ASSERT(zfsvfs->z_root != 0); + + error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1, + &zfsvfs->z_unlinkedobj); + if (error != 0) + return (error); + + error = zap_lookup(os, MASTER_NODE_OBJ, + zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA], + 8, 1, &zfsvfs->z_userquota_obj); + if (error == ENOENT) + zfsvfs->z_userquota_obj = 0; + else if (error != 0) + return (error); + + error = zap_lookup(os, MASTER_NODE_OBJ, + zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA], + 8, 1, &zfsvfs->z_groupquota_obj); + if (error == ENOENT) + zfsvfs->z_groupquota_obj = 0; + else if (error != 0) + return (error); + + error = zap_lookup(os, MASTER_NODE_OBJ, + zfs_userquota_prop_prefixes[ZFS_PROP_PROJECTQUOTA], + 8, 1, &zfsvfs->z_projectquota_obj); + if (error == ENOENT) + zfsvfs->z_projectquota_obj = 0; + else if (error != 0) + return (error); + + error = zap_lookup(os, MASTER_NODE_OBJ, + zfs_userquota_prop_prefixes[ZFS_PROP_USEROBJQUOTA], + 8, 1, &zfsvfs->z_userobjquota_obj); + if (error == ENOENT) + zfsvfs->z_userobjquota_obj = 0; + else if (error != 0) + return (error); + + error = zap_lookup(os, MASTER_NODE_OBJ, + 
zfs_userquota_prop_prefixes[ZFS_PROP_GROUPOBJQUOTA], + 8, 1, &zfsvfs->z_groupobjquota_obj); + if (error == ENOENT) + zfsvfs->z_groupobjquota_obj = 0; + else if (error != 0) + return (error); + + error = zap_lookup(os, MASTER_NODE_OBJ, + zfs_userquota_prop_prefixes[ZFS_PROP_PROJECTOBJQUOTA], + 8, 1, &zfsvfs->z_projectobjquota_obj); + if (error == ENOENT) + zfsvfs->z_projectobjquota_obj = 0; + else if (error != 0) + return (error); + + error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1, + &zfsvfs->z_fuid_obj); + if (error == ENOENT) + zfsvfs->z_fuid_obj = 0; + else if (error != 0) + return (error); + + error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SHARES_DIR, 8, 1, + &zfsvfs->z_shares_dir); + if (error == ENOENT) + zfsvfs->z_shares_dir = 0; + else if (error != 0) + return (error); + + error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END, + &zfsvfs->z_attr_table); + if (error != 0) + return (error); + + if (zfsvfs->z_version >= ZPL_VERSION_SA) + sa_register_update_callback(os, zfs_sa_upgrade); + + return (0); +} + +int +zfsvfs_create(const char *osname, boolean_t readonly, zfsvfs_t **zfvp) +{ + objset_t *os; + zfsvfs_t *zfsvfs; + int error; + boolean_t ro = (readonly || (strchr(osname, '@') != NULL)); + + zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP); + + error = dmu_objset_own(osname, DMU_OST_ZFS, ro, B_TRUE, zfsvfs, &os); + if (error != 0) { + kmem_free(zfsvfs, sizeof (zfsvfs_t)); + return (error); + } + + error = zfsvfs_create_impl(zfvp, zfsvfs, os); + if (error != 0) { + dmu_objset_disown(os, B_TRUE, zfsvfs); + } + return (error); +} + + +/* + * Note: zfsvfs is assumed to be malloc'd, and will be freed by this function + * on a failure. Do not pass in a statically allocated zfsvfs. + */ +int +zfsvfs_create_impl(zfsvfs_t **zfvp, zfsvfs_t *zfsvfs, objset_t *os) +{ + int error; + + zfsvfs->z_vfs = NULL; + zfsvfs->z_sb = NULL; + zfsvfs->z_parent = zfsvfs; + + mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL); + list_create(&zfsvfs->z_all_znodes, sizeof (znode_t), + offsetof(znode_t, z_link_node)); + rrm_init(&zfsvfs->z_teardown_lock, B_FALSE); + rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL); + rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL); + + int size = MIN(1 << (highbit64(zfs_object_mutex_size) - 1), + ZFS_OBJ_MTX_MAX); + zfsvfs->z_hold_size = size; + zfsvfs->z_hold_trees = vmem_zalloc(sizeof (avl_tree_t) * size, + KM_SLEEP); + zfsvfs->z_hold_locks = vmem_zalloc(sizeof (kmutex_t) * size, KM_SLEEP); + for (int i = 0; i != size; i++) { + avl_create(&zfsvfs->z_hold_trees[i], zfs_znode_hold_compare, + sizeof (znode_hold_t), offsetof(znode_hold_t, zh_node)); + mutex_init(&zfsvfs->z_hold_locks[i], NULL, MUTEX_DEFAULT, NULL); + } + + error = zfsvfs_init(zfsvfs, os); + if (error != 0) { + *zfvp = NULL; + zfsvfs_free(zfsvfs); + return (error); + } + + zfsvfs->z_drain_task = TASKQID_INVALID; + zfsvfs->z_draining = B_FALSE; + zfsvfs->z_drain_cancel = B_TRUE; + + *zfvp = zfsvfs; + return (0); +} + +static int +zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting) +{ + int error; + boolean_t readonly = zfs_is_readonly(zfsvfs); + + error = zfs_register_callbacks(zfsvfs->z_vfs); + if (error) + return (error); + + zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data); + + /* + * If we are not mounting (ie: online recv), then we don't + * have to worry about replaying the log as we blocked all + * operations out since we closed the ZIL. 
+ */ + if (mounting) { + ASSERT3P(zfsvfs->z_kstat.dk_kstats, ==, NULL); + dataset_kstats_create(&zfsvfs->z_kstat, zfsvfs->z_os); + + /* + * During replay we remove the read only flag to + * allow replays to succeed. + */ + if (readonly != 0) { + readonly_changed_cb(zfsvfs, B_FALSE); + } else { + zap_stats_t zs; + if (zap_get_stats(zfsvfs->z_os, zfsvfs->z_unlinkedobj, + &zs) == 0) { + dataset_kstats_update_nunlinks_kstat( + &zfsvfs->z_kstat, zs.zs_num_entries); + } + dprintf_ds(zfsvfs->z_os->os_dsl_dataset, + "num_entries in unlinked set: %llu", + zs.zs_num_entries); + zfs_unlinked_drain(zfsvfs); + } + + /* + * Parse and replay the intent log. + * + * Because of ziltest, this must be done after + * zfs_unlinked_drain(). (Further note: ziltest + * doesn't use readonly mounts, where + * zfs_unlinked_drain() isn't called.) This is because + * ziltest causes spa_sync() to think it's committed, + * but actually it is not, so the intent log contains + * many txg's worth of changes. + * + * In particular, if object N is in the unlinked set in + * the last txg to actually sync, then it could be + * actually freed in a later txg and then reallocated + * in a yet later txg. This would write a "create + * object N" record to the intent log. Normally, this + * would be fine because the spa_sync() would have + * written out the fact that object N is free, before + * we could write the "create object N" intent log + * record. + * + * But when we are in ziltest mode, we advance the "open + * txg" without actually spa_sync()-ing the changes to + * disk. So we would see that object N is still + * allocated and in the unlinked set, and there is an + * intent log record saying to allocate it. + */ + if (spa_writeable(dmu_objset_spa(zfsvfs->z_os))) { + if (zil_replay_disable) { + zil_destroy(zfsvfs->z_log, B_FALSE); + } else { + zfsvfs->z_replay = B_TRUE; + zil_replay(zfsvfs->z_os, zfsvfs, + zfs_replay_vector); + zfsvfs->z_replay = B_FALSE; + } + } + + /* restore readonly bit */ + if (readonly != 0) + readonly_changed_cb(zfsvfs, B_TRUE); + } + + /* + * Set the objset user_ptr to track its zfsvfs. + */ + mutex_enter(&zfsvfs->z_os->os_user_ptr_lock); + dmu_objset_set_user(zfsvfs->z_os, zfsvfs); + mutex_exit(&zfsvfs->z_os->os_user_ptr_lock); + + return (0); +} + +void +zfsvfs_free(zfsvfs_t *zfsvfs) +{ + int i, size = zfsvfs->z_hold_size; + + zfs_fuid_destroy(zfsvfs); + + mutex_destroy(&zfsvfs->z_znodes_lock); + mutex_destroy(&zfsvfs->z_lock); + list_destroy(&zfsvfs->z_all_znodes); + rrm_destroy(&zfsvfs->z_teardown_lock); + rw_destroy(&zfsvfs->z_teardown_inactive_lock); + rw_destroy(&zfsvfs->z_fuid_lock); + for (i = 0; i != size; i++) { + avl_destroy(&zfsvfs->z_hold_trees[i]); + mutex_destroy(&zfsvfs->z_hold_locks[i]); + } + vmem_free(zfsvfs->z_hold_trees, sizeof (avl_tree_t) * size); + vmem_free(zfsvfs->z_hold_locks, sizeof (kmutex_t) * size); + zfsvfs_vfs_free(zfsvfs->z_vfs); + dataset_kstats_destroy(&zfsvfs->z_kstat); + kmem_free(zfsvfs, sizeof (zfsvfs_t)); +} + +static void +zfs_set_fuid_feature(zfsvfs_t *zfsvfs) +{ + zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os); + zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os); +} + +void +zfs_unregister_callbacks(zfsvfs_t *zfsvfs) +{ + objset_t *os = zfsvfs->z_os; + + if (!dmu_objset_is_snapshot(os)) + dsl_prop_unregister_all(dmu_objset_ds(os), zfsvfs); +} + +#ifdef HAVE_MLSLABEL +/* + * Check that the hex label string is appropriate for the dataset being + * mounted into the global_zone proper. 
+ * + * Return an error if the hex label string is not default or + * admin_low/admin_high. For admin_low labels, the corresponding + * dataset must be readonly. + */ +int +zfs_check_global_label(const char *dsname, const char *hexsl) +{ + if (strcasecmp(hexsl, ZFS_MLSLABEL_DEFAULT) == 0) + return (0); + if (strcasecmp(hexsl, ADMIN_HIGH) == 0) + return (0); + if (strcasecmp(hexsl, ADMIN_LOW) == 0) { + /* must be readonly */ + uint64_t rdonly; + + if (dsl_prop_get_integer(dsname, + zfs_prop_to_name(ZFS_PROP_READONLY), &rdonly, NULL)) + return (SET_ERROR(EACCES)); + return (rdonly ? 0 : EACCES); + } + return (SET_ERROR(EACCES)); +} +#endif /* HAVE_MLSLABEL */ + +static int +zfs_statfs_project(zfsvfs_t *zfsvfs, znode_t *zp, struct kstatfs *statp, + uint32_t bshift) +{ + char buf[20 + DMU_OBJACCT_PREFIX_LEN]; + uint64_t offset = DMU_OBJACCT_PREFIX_LEN; + uint64_t quota; + uint64_t used; + int err; + + strlcpy(buf, DMU_OBJACCT_PREFIX, DMU_OBJACCT_PREFIX_LEN + 1); + err = id_to_fuidstr(zfsvfs, NULL, zp->z_projid, buf + offset, B_FALSE); + if (err) + return (err); + + if (zfsvfs->z_projectquota_obj == 0) + goto objs; + + err = zap_lookup(zfsvfs->z_os, zfsvfs->z_projectquota_obj, + buf + offset, 8, 1, "a); + if (err == ENOENT) + goto objs; + else if (err) + return (err); + + err = zap_lookup(zfsvfs->z_os, DMU_PROJECTUSED_OBJECT, + buf + offset, 8, 1, &used); + if (unlikely(err == ENOENT)) { + uint32_t blksize; + u_longlong_t nblocks; + + /* + * Quota accounting is async, so it is possible race case. + * There is at least one object with the given project ID. + */ + sa_object_size(zp->z_sa_hdl, &blksize, &nblocks); + if (unlikely(zp->z_blksz == 0)) + blksize = zfsvfs->z_max_blksz; + + used = blksize * nblocks; + } else if (err) { + return (err); + } + + statp->f_blocks = quota >> bshift; + statp->f_bfree = (quota > used) ? ((quota - used) >> bshift) : 0; + statp->f_bavail = statp->f_bfree; + +objs: + if (zfsvfs->z_projectobjquota_obj == 0) + return (0); + + err = zap_lookup(zfsvfs->z_os, zfsvfs->z_projectobjquota_obj, + buf + offset, 8, 1, "a); + if (err == ENOENT) + return (0); + else if (err) + return (err); + + err = zap_lookup(zfsvfs->z_os, DMU_PROJECTUSED_OBJECT, + buf, 8, 1, &used); + if (unlikely(err == ENOENT)) { + /* + * Quota accounting is async, so it is possible race case. + * There is at least one object with the given project ID. + */ + used = 1; + } else if (err) { + return (err); + } + + statp->f_files = quota; + statp->f_ffree = (quota > used) ? (quota - used) : 0; + + return (0); +} + +int +zfs_statvfs(struct dentry *dentry, struct kstatfs *statp) +{ + zfsvfs_t *zfsvfs = dentry->d_sb->s_fs_info; + uint64_t refdbytes, availbytes, usedobjs, availobjs; + int err = 0; + + ZFS_ENTER(zfsvfs); + + dmu_objset_space(zfsvfs->z_os, + &refdbytes, &availbytes, &usedobjs, &availobjs); + + uint64_t fsid = dmu_objset_fsid_guid(zfsvfs->z_os); + /* + * The underlying storage pool actually uses multiple block + * size. Under Solaris frsize (fragment size) is reported as + * the smallest block size we support, and bsize (block size) + * as the filesystem's maximum block size. Unfortunately, + * under Linux the fragment size and block size are often used + * interchangeably. Thus we are forced to report both of them + * as the filesystem's maximum block size. 
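+	 * With the default 128K recordsize, for example, both f_bsize and
+	 * f_frsize are reported as 131072.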
+ */ + statp->f_frsize = zfsvfs->z_max_blksz; + statp->f_bsize = zfsvfs->z_max_blksz; + uint32_t bshift = fls(statp->f_bsize) - 1; + + /* + * The following report "total" blocks of various kinds in + * the file system, but reported in terms of f_bsize - the + * "preferred" size. + */ + + /* Round up so we never have a filesystem using 0 blocks. */ + refdbytes = P2ROUNDUP(refdbytes, statp->f_bsize); + statp->f_blocks = (refdbytes + availbytes) >> bshift; + statp->f_bfree = availbytes >> bshift; + statp->f_bavail = statp->f_bfree; /* no root reservation */ + + /* + * statvfs() should really be called statufs(), because it assumes + * static metadata. ZFS doesn't preallocate files, so the best + * we can do is report the max that could possibly fit in f_files, + * and that minus the number actually used in f_ffree. + * For f_ffree, report the smaller of the number of objects available + * and the number of blocks (each object will take at least a block). + */ + statp->f_ffree = MIN(availobjs, availbytes >> DNODE_SHIFT); + statp->f_files = statp->f_ffree + usedobjs; + statp->f_fsid.val[0] = (uint32_t)fsid; + statp->f_fsid.val[1] = (uint32_t)(fsid >> 32); + statp->f_type = ZFS_SUPER_MAGIC; + statp->f_namelen = MAXNAMELEN - 1; + + /* + * We have all of 40 characters to stuff a string here. + * Is there anything useful we could/should provide? + */ + bzero(statp->f_spare, sizeof (statp->f_spare)); + + if (dmu_objset_projectquota_enabled(zfsvfs->z_os) && + dmu_objset_projectquota_present(zfsvfs->z_os)) { + znode_t *zp = ITOZ(dentry->d_inode); + + if (zp->z_pflags & ZFS_PROJINHERIT && zp->z_projid && + zpl_is_valid_projid(zp->z_projid)) + err = zfs_statfs_project(zfsvfs, zp, statp, bshift); + } + + ZFS_EXIT(zfsvfs); + return (err); +} + +int +zfs_root(zfsvfs_t *zfsvfs, struct inode **ipp) +{ + znode_t *rootzp; + int error; + + ZFS_ENTER(zfsvfs); + + error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp); + if (error == 0) + *ipp = ZTOI(rootzp); + + ZFS_EXIT(zfsvfs); + return (error); +} + +#ifdef HAVE_D_PRUNE_ALIASES +/* + * Linux kernels older than 3.1 do not support a per-filesystem shrinker. + * To accommodate this we must improvise and manually walk the list of znodes + * attempting to prune dentries in order to be able to drop the inodes. + * + * To avoid scanning the same znodes multiple times they are always rotated + * to the end of the z_all_znodes list. New znodes are inserted at the + * end of the list so we're always scanning the oldest znodes first. 
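+ *
+ * The return value is the number of inodes that were left with only our
+ * temporary reference after d_prune_aliases(), i.e. those that the final
+ * iput() can actually free.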
+ */ +static int +zfs_prune_aliases(zfsvfs_t *zfsvfs, unsigned long nr_to_scan) +{ + znode_t **zp_array, *zp; + int max_array = MIN(nr_to_scan, PAGE_SIZE * 8 / sizeof (znode_t *)); + int objects = 0; + int i = 0, j = 0; + + zp_array = kmem_zalloc(max_array * sizeof (znode_t *), KM_SLEEP); + + mutex_enter(&zfsvfs->z_znodes_lock); + while ((zp = list_head(&zfsvfs->z_all_znodes)) != NULL) { + + if ((i++ > nr_to_scan) || (j >= max_array)) + break; + + ASSERT(list_link_active(&zp->z_link_node)); + list_remove(&zfsvfs->z_all_znodes, zp); + list_insert_tail(&zfsvfs->z_all_znodes, zp); + + /* Skip active znodes and .zfs entries */ + if (MUTEX_HELD(&zp->z_lock) || zp->z_is_ctldir) + continue; + + if (igrab(ZTOI(zp)) == NULL) + continue; + + zp_array[j] = zp; + j++; + } + mutex_exit(&zfsvfs->z_znodes_lock); + + for (i = 0; i < j; i++) { + zp = zp_array[i]; + + ASSERT3P(zp, !=, NULL); + d_prune_aliases(ZTOI(zp)); + + if (atomic_read(&ZTOI(zp)->i_count) == 1) + objects++; + + iput(ZTOI(zp)); + } + + kmem_free(zp_array, max_array * sizeof (znode_t *)); + + return (objects); +} +#endif /* HAVE_D_PRUNE_ALIASES */ + +/* + * The ARC has requested that the filesystem drop entries from the dentry + * and inode caches. This can occur when the ARC needs to free meta data + * blocks but can't because they are all pinned by entries in these caches. + */ +int +zfs_prune(struct super_block *sb, unsigned long nr_to_scan, int *objects) +{ + zfsvfs_t *zfsvfs = sb->s_fs_info; + int error = 0; +#if defined(HAVE_SHRINK) || defined(HAVE_SPLIT_SHRINKER_CALLBACK) + struct shrinker *shrinker = &sb->s_shrink; + struct shrink_control sc = { + .nr_to_scan = nr_to_scan, + .gfp_mask = GFP_KERNEL, + }; +#endif + + ZFS_ENTER(zfsvfs); + +#if defined(HAVE_SPLIT_SHRINKER_CALLBACK) && \ + defined(SHRINK_CONTROL_HAS_NID) && \ + defined(SHRINKER_NUMA_AWARE) + if (sb->s_shrink.flags & SHRINKER_NUMA_AWARE) { + *objects = 0; + for_each_online_node(sc.nid) { + *objects += (*shrinker->scan_objects)(shrinker, &sc); + } + } else { + *objects = (*shrinker->scan_objects)(shrinker, &sc); + } + +#elif defined(HAVE_SPLIT_SHRINKER_CALLBACK) + *objects = (*shrinker->scan_objects)(shrinker, &sc); +#elif defined(HAVE_SHRINK) + *objects = (*shrinker->shrink)(shrinker, &sc); +#elif defined(HAVE_D_PRUNE_ALIASES) +#define D_PRUNE_ALIASES_IS_DEFAULT + *objects = zfs_prune_aliases(zfsvfs, nr_to_scan); +#else +#error "No available dentry and inode cache pruning mechanism." +#endif + +#if defined(HAVE_D_PRUNE_ALIASES) && !defined(D_PRUNE_ALIASES_IS_DEFAULT) +#undef D_PRUNE_ALIASES_IS_DEFAULT + /* + * Fall back to zfs_prune_aliases if the kernel's per-superblock + * shrinker couldn't free anything, possibly due to the inodes being + * allocated in a different memcg. + */ + if (*objects == 0) + *objects = zfs_prune_aliases(zfsvfs, nr_to_scan); +#endif + + ZFS_EXIT(zfsvfs); + + dprintf_ds(zfsvfs->z_os->os_dsl_dataset, + "pruning, nr_to_scan=%lu objects=%d error=%d\n", + nr_to_scan, *objects, error); + + return (error); +} + +/* + * Teardown the zfsvfs_t. + * + * Note, if 'unmounting' is FALSE, we return with the 'z_teardown_lock' + * and 'z_teardown_inactive_lock' held. + */ +static int +zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting) +{ + znode_t *zp; + + zfs_unlinked_drain_stop_wait(zfsvfs); + + /* + * If someone has not already unmounted this file system, + * drain the iput_taskq to ensure all active references to the + * zfsvfs_t have been handled only then can it be safely destroyed. 
+ */ + if (zfsvfs->z_os) { + /* + * If we're unmounting we have to wait for the list to + * drain completely. + * + * If we're not unmounting there's no guarantee the list + * will drain completely, but iputs run from the taskq + * may add the parents of dir-based xattrs to the taskq + * so we want to wait for these. + * + * We can safely read z_nr_znodes without locking because the + * VFS has already blocked operations which add to the + * z_all_znodes list and thus increment z_nr_znodes. + */ + int round = 0; + while (zfsvfs->z_nr_znodes > 0) { + taskq_wait_outstanding(dsl_pool_iput_taskq( + dmu_objset_pool(zfsvfs->z_os)), 0); + if (++round > 1 && !unmounting) + break; + } + } + + rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG); + + if (!unmounting) { + /* + * We purge the parent filesystem's super block as the + * parent filesystem and all of its snapshots have their + * inode's super block set to the parent's filesystem's + * super block. Note, 'z_parent' is self referential + * for non-snapshots. + */ + shrink_dcache_sb(zfsvfs->z_parent->z_sb); + } + + /* + * Close the zil. NB: Can't close the zil while zfs_inactive + * threads are blocked as zil_close can call zfs_inactive. + */ + if (zfsvfs->z_log) { + zil_close(zfsvfs->z_log); + zfsvfs->z_log = NULL; + } + + rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_WRITER); + + /* + * If we are not unmounting (ie: online recv) and someone already + * unmounted this file system while we were doing the switcheroo, + * or a reopen of z_os failed then just bail out now. + */ + if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) { + rw_exit(&zfsvfs->z_teardown_inactive_lock); + rrm_exit(&zfsvfs->z_teardown_lock, FTAG); + return (SET_ERROR(EIO)); + } + + /* + * At this point there are no VFS ops active, and any new VFS ops + * will fail with EIO since we have z_teardown_lock for writer (only + * relevant for forced unmount). + * + * Release all holds on dbufs. We also grab an extra reference to all + * the remaining inodes so that the kernel does not attempt to free + * any inodes of a suspended fs. This can cause deadlocks since the + * zfs_resume_fs() process may involve starting threads, which might + * attempt to free unreferenced inodes to free up memory for the new + * thread. + */ + if (!unmounting) { + mutex_enter(&zfsvfs->z_znodes_lock); + for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL; + zp = list_next(&zfsvfs->z_all_znodes, zp)) { + if (zp->z_sa_hdl) + zfs_znode_dmu_fini(zp); + if (igrab(ZTOI(zp)) != NULL) + zp->z_suspended = B_TRUE; + + } + mutex_exit(&zfsvfs->z_znodes_lock); + } + + /* + * If we are unmounting, set the unmounted flag and let new VFS ops + * unblock. zfs_inactive will have the unmounted behavior, and all + * other VFS ops will fail with EIO. + */ + if (unmounting) { + zfsvfs->z_unmounted = B_TRUE; + rw_exit(&zfsvfs->z_teardown_inactive_lock); + rrm_exit(&zfsvfs->z_teardown_lock, FTAG); + } + + /* + * z_os will be NULL if there was an error in attempting to reopen + * zfsvfs, so just return as the properties had already been + * + * unregistered and cached data had been evicted before. + */ + if (zfsvfs->z_os == NULL) + return (0); + + /* + * Unregister properties. + */ + zfs_unregister_callbacks(zfsvfs); + + /* + * Evict cached data. We must write out any dirty data before + * disowning the dataset. 
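+	 * Only wait for a txg sync when the objset is dirty and the
+	 * filesystem is writable; otherwise go straight to evicting the
+	 * dbufs.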
+ */ + objset_t *os = zfsvfs->z_os; + boolean_t os_dirty = B_FALSE; + for (int t = 0; t < TXG_SIZE; t++) { + if (dmu_objset_is_dirty(os, t)) { + os_dirty = B_TRUE; + break; + } + } + if (!zfs_is_readonly(zfsvfs) && os_dirty) { + txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0); + } + dmu_objset_evict_dbufs(zfsvfs->z_os); + + return (0); +} + +#if !defined(HAVE_2ARGS_BDI_SETUP_AND_REGISTER) && \ + !defined(HAVE_3ARGS_BDI_SETUP_AND_REGISTER) +atomic_long_t zfs_bdi_seq = ATOMIC_LONG_INIT(0); +#endif + +int +zfs_domount(struct super_block *sb, zfs_mnt_t *zm, int silent) +{ + const char *osname = zm->mnt_osname; + struct inode *root_inode; + uint64_t recordsize; + int error = 0; + zfsvfs_t *zfsvfs = NULL; + vfs_t *vfs = NULL; + + ASSERT(zm); + ASSERT(osname); + + error = zfsvfs_parse_options(zm->mnt_data, &vfs); + if (error) + return (error); + + error = zfsvfs_create(osname, vfs->vfs_readonly, &zfsvfs); + if (error) { + zfsvfs_vfs_free(vfs); + goto out; + } + + if ((error = dsl_prop_get_integer(osname, "recordsize", + &recordsize, NULL))) { + zfsvfs_vfs_free(vfs); + goto out; + } + + vfs->vfs_data = zfsvfs; + zfsvfs->z_vfs = vfs; + zfsvfs->z_sb = sb; + sb->s_fs_info = zfsvfs; + sb->s_magic = ZFS_SUPER_MAGIC; + sb->s_maxbytes = MAX_LFS_FILESIZE; + sb->s_time_gran = 1; + sb->s_blocksize = recordsize; + sb->s_blocksize_bits = ilog2(recordsize); + + error = -zpl_bdi_setup(sb, "zfs"); + if (error) + goto out; + + sb->s_bdi->ra_pages = 0; + + /* Set callback operations for the file system. */ + sb->s_op = &zpl_super_operations; + sb->s_xattr = zpl_xattr_handlers; + sb->s_export_op = &zpl_export_operations; +#ifdef HAVE_S_D_OP + sb->s_d_op = &zpl_dentry_operations; +#endif /* HAVE_S_D_OP */ + + /* Set features for file system. */ + zfs_set_fuid_feature(zfsvfs); + + if (dmu_objset_is_snapshot(zfsvfs->z_os)) { + uint64_t pval; + + atime_changed_cb(zfsvfs, B_FALSE); + readonly_changed_cb(zfsvfs, B_TRUE); + if ((error = dsl_prop_get_integer(osname, + "xattr", &pval, NULL))) + goto out; + xattr_changed_cb(zfsvfs, pval); + if ((error = dsl_prop_get_integer(osname, + "acltype", &pval, NULL))) + goto out; + acltype_changed_cb(zfsvfs, pval); + zfsvfs->z_issnap = B_TRUE; + zfsvfs->z_os->os_sync = ZFS_SYNC_DISABLED; + zfsvfs->z_snap_defer_time = jiffies; + + mutex_enter(&zfsvfs->z_os->os_user_ptr_lock); + dmu_objset_set_user(zfsvfs->z_os, zfsvfs); + mutex_exit(&zfsvfs->z_os->os_user_ptr_lock); + } else { + if ((error = zfsvfs_setup(zfsvfs, B_TRUE))) + goto out; + } + + /* Allocate a root inode for the filesystem. */ + error = zfs_root(zfsvfs, &root_inode); + if (error) { + (void) zfs_umount(sb); + goto out; + } + + /* Allocate a root dentry for the filesystem */ + sb->s_root = d_make_root(root_inode); + if (sb->s_root == NULL) { + (void) zfs_umount(sb); + error = SET_ERROR(ENOMEM); + goto out; + } + + if (!zfsvfs->z_issnap) + zfsctl_create(zfsvfs); + + zfsvfs->z_arc_prune = arc_add_prune_callback(zpl_prune_sb, sb); +out: + if (error) { + if (zfsvfs != NULL) { + dmu_objset_disown(zfsvfs->z_os, B_TRUE, zfsvfs); + zfsvfs_free(zfsvfs); + } + /* + * make sure we don't have dangling sb->s_fs_info which + * zfs_preumount will use. + */ + sb->s_fs_info = NULL; + } + + return (error); +} + +/* + * Called when an unmount is requested and certain sanity checks have + * already passed. At this point no dentries or inodes have been reclaimed + * from their respective caches. We drop the extra reference on the .zfs + * control directory to allow everything to be reclaimed. 
All snapshots + * must already have been unmounted to reach this point. + */ +void +zfs_preumount(struct super_block *sb) +{ + zfsvfs_t *zfsvfs = sb->s_fs_info; + + /* zfsvfs is NULL when zfs_domount fails during mount */ + if (zfsvfs) { + zfs_unlinked_drain_stop_wait(zfsvfs); + zfsctl_destroy(sb->s_fs_info); + /* + * Wait for iput_async before entering evict_inodes in + * generic_shutdown_super. The reason we must finish before + * evict_inodes is when lazytime is on, or when zfs_purgedir + * calls zfs_zget, iput would bump i_count from 0 to 1. This + * would race with the i_count check in evict_inodes. This means + * it could destroy the inode while we are still using it. + * + * We wait for two passes. xattr directories in the first pass + * may add xattr entries in zfs_purgedir, so in the second pass + * we wait for them. We don't use taskq_wait here because it is + * a pool wide taskq. Other mounted filesystems can constantly + * do iput_async and there's no guarantee when taskq will be + * empty. + */ + taskq_wait_outstanding(dsl_pool_iput_taskq( + dmu_objset_pool(zfsvfs->z_os)), 0); + taskq_wait_outstanding(dsl_pool_iput_taskq( + dmu_objset_pool(zfsvfs->z_os)), 0); + } +} + +/* + * Called once all other unmount released tear down has occurred. + * It is our responsibility to release any remaining infrastructure. + */ +/*ARGSUSED*/ +int +zfs_umount(struct super_block *sb) +{ + zfsvfs_t *zfsvfs = sb->s_fs_info; + objset_t *os; + + if (zfsvfs->z_arc_prune != NULL) + arc_remove_prune_callback(zfsvfs->z_arc_prune); + VERIFY(zfsvfs_teardown(zfsvfs, B_TRUE) == 0); + os = zfsvfs->z_os; + zpl_bdi_destroy(sb); + + /* + * z_os will be NULL if there was an error in + * attempting to reopen zfsvfs. + */ + if (os != NULL) { + /* + * Unset the objset user_ptr. 
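
The double taskq_wait_outstanding() in zfs_preumount() a little further up illustrates a general rule: if draining a queue can itself enqueue follow-up work (here, xattr directory iputs queued by zfs_purgedir), one drain pass is not enough. A rough userspace model, assuming each item spawns at most one generation of follow-up work; queue_t, process and drain are hypothetical names:

#include <stdio.h>

#define	QMAX	64

typedef struct {
	int items[QMAX];	/* 1 = processing this spawns follow-up work */
	int n;
} queue_t;

void
process(queue_t *q, int spawns_followup)
{
	if (spawns_followup && q->n < QMAX)
		q->items[q->n++] = 0;	/* queue the follow-up item */
	printf("processed item (spawns follow-up: %d)\n", spawns_followup);
}

/* One drain pass handles what was queued at entry, not what it adds. */
void
drain(queue_t *q)
{
	queue_t batch = *q;	/* snapshot the current contents */

	q->n = 0;
	for (int i = 0; i < batch.n; i++)
		process(q, batch.items[i]);
}

int
main(void)
{
	queue_t q = { .items = { 1 }, .n = 1 };

	drain(&q);	/* may leave follow-up items behind */
	drain(&q);	/* picks up what the first pass queued */
	printf("still pending: %d\n", q.n);
	return (0);
}
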
+ */ + mutex_enter(&os->os_user_ptr_lock); + dmu_objset_set_user(os, NULL); + mutex_exit(&os->os_user_ptr_lock); + + /* + * Finally release the objset + */ + dmu_objset_disown(os, B_TRUE, zfsvfs); + } + + zfsvfs_free(zfsvfs); + return (0); +} + +int +zfs_remount(struct super_block *sb, int *flags, zfs_mnt_t *zm) +{ + zfsvfs_t *zfsvfs = sb->s_fs_info; + vfs_t *vfsp; + boolean_t issnap = dmu_objset_is_snapshot(zfsvfs->z_os); + int error; + + if ((issnap || !spa_writeable(dmu_objset_spa(zfsvfs->z_os))) && + !(*flags & SB_RDONLY)) { + *flags |= SB_RDONLY; + return (EROFS); + } + + error = zfsvfs_parse_options(zm->mnt_data, &vfsp); + if (error) + return (error); + + if (!zfs_is_readonly(zfsvfs) && (*flags & SB_RDONLY)) + txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0); + + zfs_unregister_callbacks(zfsvfs); + zfsvfs_vfs_free(zfsvfs->z_vfs); + + vfsp->vfs_data = zfsvfs; + zfsvfs->z_vfs = vfsp; + if (!issnap) + (void) zfs_register_callbacks(vfsp); + + return (error); +} + +int +zfs_vget(struct super_block *sb, struct inode **ipp, fid_t *fidp) +{ + zfsvfs_t *zfsvfs = sb->s_fs_info; + znode_t *zp; + uint64_t object = 0; + uint64_t fid_gen = 0; + uint64_t gen_mask; + uint64_t zp_gen; + int i, err; + + *ipp = NULL; + + if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) { + zfid_short_t *zfid = (zfid_short_t *)fidp; + + for (i = 0; i < sizeof (zfid->zf_object); i++) + object |= ((uint64_t)zfid->zf_object[i]) << (8 * i); + + for (i = 0; i < sizeof (zfid->zf_gen); i++) + fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i); + } else { + return (SET_ERROR(EINVAL)); + } + + /* LONG_FID_LEN means snapdirs */ + if (fidp->fid_len == LONG_FID_LEN) { + zfid_long_t *zlfid = (zfid_long_t *)fidp; + uint64_t objsetid = 0; + uint64_t setgen = 0; + + for (i = 0; i < sizeof (zlfid->zf_setid); i++) + objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i); + + for (i = 0; i < sizeof (zlfid->zf_setgen); i++) + setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i); + + if (objsetid != ZFSCTL_INO_SNAPDIRS - object) { + dprintf("snapdir fid: objsetid (%llu) != " + "ZFSCTL_INO_SNAPDIRS (%llu) - object (%llu)\n", + objsetid, ZFSCTL_INO_SNAPDIRS, object); + + return (SET_ERROR(EINVAL)); + } + + if (fid_gen > 1 || setgen != 0) { + dprintf("snapdir fid: fid_gen (%llu) and setgen " + "(%llu)\n", fid_gen, setgen); + return (SET_ERROR(EINVAL)); + } + + return (zfsctl_snapdir_vget(sb, objsetid, fid_gen, ipp)); + } + + ZFS_ENTER(zfsvfs); + /* A zero fid_gen means we are in the .zfs control directories */ + if (fid_gen == 0 && + (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) { + *ipp = zfsvfs->z_ctldir; + ASSERT(*ipp != NULL); + if (object == ZFSCTL_INO_SNAPDIR) { + VERIFY(zfsctl_root_lookup(*ipp, "snapshot", ipp, + 0, kcred, NULL, NULL) == 0); + } else { + igrab(*ipp); + } + ZFS_EXIT(zfsvfs); + return (0); + } + + gen_mask = -1ULL >> (64 - 8 * i); + + dprintf("getting %llu [%llu mask %llx]\n", object, fid_gen, gen_mask); + if ((err = zfs_zget(zfsvfs, object, &zp))) { + ZFS_EXIT(zfsvfs); + return (err); + } + + /* Don't export xattr stuff */ + if (zp->z_pflags & ZFS_XATTR) { + iput(ZTOI(zp)); + ZFS_EXIT(zfsvfs); + return (SET_ERROR(ENOENT)); + } + + (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen, + sizeof (uint64_t)); + zp_gen = zp_gen & gen_mask; + if (zp_gen == 0) + zp_gen = 1; + if ((fid_gen == 0) && (zfsvfs->z_root == object)) + fid_gen = zp_gen; + if (zp->z_unlinked || zp_gen != fid_gen) { + dprintf("znode gen (%llu) != fid gen (%llu)\n", zp_gen, + fid_gen); + iput(ZTOI(zp)); + 
ZFS_EXIT(zfsvfs); + return (SET_ERROR(ENOENT)); + } + + *ipp = ZTOI(zp); + if (*ipp) + zfs_inode_update(ITOZ(*ipp)); + + ZFS_EXIT(zfsvfs); + return (0); +} + +/* + * Block out VFS ops and close zfsvfs_t + * + * Note, if successful, then we return with the 'z_teardown_lock' and + * 'z_teardown_inactive_lock' write held. We leave ownership of the underlying + * dataset and objset intact so that they can be atomically handed off during + * a subsequent rollback or recv operation and the resume thereafter. + */ +int +zfs_suspend_fs(zfsvfs_t *zfsvfs) +{ + int error; + + if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0) + return (error); + + return (0); +} + +/* + * Rebuild SA and release VOPs. Note that ownership of the underlying dataset + * is an invariant across any of the operations that can be performed while the + * filesystem was suspended. Whether it succeeded or failed, the preconditions + * are the same: the relevant objset and associated dataset are owned by + * zfsvfs, held, and long held on entry. + */ +int +zfs_resume_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds) +{ + int err, err2; + znode_t *zp; + + ASSERT(RRM_WRITE_HELD(&zfsvfs->z_teardown_lock)); + ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock)); + + /* + * We already own this, so just update the objset_t, as the one we + * had before may have been evicted. + */ + objset_t *os; + VERIFY3P(ds->ds_owner, ==, zfsvfs); + VERIFY(dsl_dataset_long_held(ds)); + VERIFY0(dmu_objset_from_ds(ds, &os)); + + err = zfsvfs_init(zfsvfs, os); + if (err != 0) + goto bail; + + VERIFY(zfsvfs_setup(zfsvfs, B_FALSE) == 0); + + zfs_set_fuid_feature(zfsvfs); + zfsvfs->z_rollback_time = jiffies; + + /* + * Attempt to re-establish all the active inodes with their + * dbufs. If a zfs_rezget() fails, then we unhash the inode + * and mark it stale. This prevents a collision if a new + * inode/object is created which must use the same inode + * number. The stale inode will be be released when the + * VFS prunes the dentry holding the remaining references + * on the stale inode. + */ + mutex_enter(&zfsvfs->z_znodes_lock); + for (zp = list_head(&zfsvfs->z_all_znodes); zp; + zp = list_next(&zfsvfs->z_all_znodes, zp)) { + err2 = zfs_rezget(zp); + if (err2) { + remove_inode_hash(ZTOI(zp)); + zp->z_is_stale = B_TRUE; + } + + /* see comment in zfs_suspend_fs() */ + if (zp->z_suspended) { + zfs_iput_async(ZTOI(zp)); + zp->z_suspended = B_FALSE; + } + } + mutex_exit(&zfsvfs->z_znodes_lock); + + if (!zfs_is_readonly(zfsvfs) && !zfsvfs->z_unmounted) { + /* + * zfs_suspend_fs() could have interrupted freeing + * of dnodes. We need to restart this freeing so + * that we don't "leak" the space. + */ + zfs_unlinked_drain(zfsvfs); + } + +bail: + if (err != 0) + zfsvfs->z_unmounted = B_TRUE; + + /* release the VFS ops */ + rw_exit(&zfsvfs->z_teardown_inactive_lock); + rrm_exit(&zfsvfs->z_teardown_lock, FTAG); + + if (err != 0) { + /* + * Since we couldn't setup the sa framework, try to force + * unmount this file system. + */ + if (zfsvfs->z_os) + (void) zfs_umount(zfsvfs->z_sb); + } + return (err); +} + +/* + * Release VOPs and unmount a suspended filesystem. + */ +int +zfs_end_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds) +{ + ASSERT(RRM_WRITE_HELD(&zfsvfs->z_teardown_lock)); + ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock)); + + /* + * We already own this, so just hold and rele it to update the + * objset_t, as the one we had before may have been evicted. 
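
Back in zfs_vget(), the object and generation numbers are reassembled from the FID one byte at a time, and the generation comparison is masked to however many bytes the FID actually carries. A standalone sketch of that decode-and-mask arithmetic; fid_decode is an illustrative helper, and the 6-byte object / 4-byte generation widths are assumed here purely for the example:

#include <stdint.h>
#include <stdio.h>

/* Reassemble a little-endian byte array into a 64-bit value. */
uint64_t
fid_decode(const uint8_t *bytes, int nbytes)
{
	uint64_t v = 0;

	for (int i = 0; i < nbytes; i++)
		v |= (uint64_t)bytes[i] << (8 * i);
	return (v);
}

int
main(void)
{
	uint8_t obj_bytes[6] = { 0x39, 0x30, 0, 0, 0, 0 };	/* 12345 */
	uint8_t gen_bytes[4] = { 0x07, 0, 0, 0 };		/* 7 */

	uint64_t object = fid_decode(obj_bytes, 6);
	uint64_t fid_gen = fid_decode(gen_bytes, 4);

	/* Only compare as many generation bits as the FID can store. */
	uint64_t gen_mask = -1ULL >> (64 - 8 * 4);
	uint64_t zp_gen = 0x100000007ULL & gen_mask;	/* stored gen, masked */

	printf("object=%llu fid_gen=%llu match=%d\n",
	    (unsigned long long)object, (unsigned long long)fid_gen,
	    zp_gen == fid_gen);
	return (0);
}
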
+ */ + objset_t *os; + VERIFY3P(ds->ds_owner, ==, zfsvfs); + VERIFY(dsl_dataset_long_held(ds)); + VERIFY0(dmu_objset_from_ds(ds, &os)); + zfsvfs->z_os = os; + + /* release the VOPs */ + rw_exit(&zfsvfs->z_teardown_inactive_lock); + rrm_exit(&zfsvfs->z_teardown_lock, FTAG); + + /* + * Try to force unmount this file system. + */ + (void) zfs_umount(zfsvfs->z_sb); + zfsvfs->z_unmounted = B_TRUE; + return (0); +} + +int +zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers) +{ + int error; + objset_t *os = zfsvfs->z_os; + dmu_tx_t *tx; + + if (newvers < ZPL_VERSION_INITIAL || newvers > ZPL_VERSION) + return (SET_ERROR(EINVAL)); + + if (newvers < zfsvfs->z_version) + return (SET_ERROR(EINVAL)); + + if (zfs_spa_version_map(newvers) > + spa_version(dmu_objset_spa(zfsvfs->z_os))) + return (SET_ERROR(ENOTSUP)); + + tx = dmu_tx_create(os); + dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_FALSE, ZPL_VERSION_STR); + if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) { + dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE, + ZFS_SA_ATTRS); + dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); + } + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + return (error); + } + + error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR, + 8, 1, &newvers, tx); + + if (error) { + dmu_tx_commit(tx); + return (error); + } + + if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) { + uint64_t sa_obj; + + ASSERT3U(spa_version(dmu_objset_spa(zfsvfs->z_os)), >=, + SPA_VERSION_SA); + sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE, + DMU_OT_NONE, 0, tx); + + error = zap_add(os, MASTER_NODE_OBJ, + ZFS_SA_ATTRS, 8, 1, &sa_obj, tx); + ASSERT0(error); + + VERIFY(0 == sa_set_sa_object(os, sa_obj)); + sa_register_update_callback(os, zfs_sa_upgrade); + } + + spa_history_log_internal_ds(dmu_objset_ds(os), "upgrade", tx, + "from %llu to %llu", zfsvfs->z_version, newvers); + + dmu_tx_commit(tx); + + zfsvfs->z_version = newvers; + os->os_version = newvers; + + zfs_set_fuid_feature(zfsvfs); + + return (0); +} + +/* + * Read a property stored within the master node. + */ +int +zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value) +{ + uint64_t *cached_copy = NULL; + + /* + * Figure out where in the objset_t the cached copy would live, if it + * is available for the requested property. + */ + if (os != NULL) { + switch (prop) { + case ZFS_PROP_VERSION: + cached_copy = &os->os_version; + break; + case ZFS_PROP_NORMALIZE: + cached_copy = &os->os_normalization; + break; + case ZFS_PROP_UTF8ONLY: + cached_copy = &os->os_utf8only; + break; + case ZFS_PROP_CASE: + cached_copy = &os->os_casesensitivity; + break; + default: + break; + } + } + if (cached_copy != NULL && *cached_copy != OBJSET_PROP_UNINITIALIZED) { + *value = *cached_copy; + return (0); + } + + /* + * If the property wasn't cached, look up the file system's value for + * the property. For the version property, we look up a slightly + * different string. 
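
The lookup order in zfs_get_zplprop() just below — in-memory cached copy, then the stored value, then a hard-coded default, refreshing the cache on the way out — is a small pattern of its own. A simplified userspace sketch; PROP_UNINITIALIZED, lookup_on_disk and get_prop are stand-ins rather than real ZFS symbols, and any lookup failure is treated as "not set":

#include <stdint.h>
#include <stdio.h>

#define	PROP_UNINITIALIZED	((uint64_t)-1)	/* sentinel for "not cached" */

/* Pretend on-disk lookup; nonzero means no value was ever stored. */
int
lookup_on_disk(const char *name, uint64_t *value)
{
	(void) name;
	(void) value;
	return (2);
}

int
get_prop(uint64_t *cached, const char *name, uint64_t def, uint64_t *value)
{
	/* 1. Fast path: a previously cached value. */
	if (cached != NULL && *cached != PROP_UNINITIALIZED) {
		*value = *cached;
		return (0);
	}

	/* 2. Slow path: read the stored value. */
	int error = lookup_on_disk(name, value);

	/* 3. Nothing stored: fall back to the documented default. */
	if (error != 0) {
		*value = def;
		error = 0;
	}

	/* Refresh the cache so the next caller takes the fast path. */
	if (error == 0 && cached != NULL)
		*cached = *value;
	return (error);
}

int
main(void)
{
	uint64_t cache = PROP_UNINITIALIZED, v;

	(void) get_prop(&cache, "version", 5, &v);
	printf("value=%llu cached=%llu\n",
	    (unsigned long long)v, (unsigned long long)cache);
	return (0);
}
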
+ */ + const char *pname; + int error = ENOENT; + if (prop == ZFS_PROP_VERSION) + pname = ZPL_VERSION_STR; + else + pname = zfs_prop_to_name(prop); + + if (os != NULL) { + ASSERT3U(os->os_phys->os_type, ==, DMU_OST_ZFS); + error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value); + } + + if (error == ENOENT) { + /* No value set, use the default value */ + switch (prop) { + case ZFS_PROP_VERSION: + *value = ZPL_VERSION; + break; + case ZFS_PROP_NORMALIZE: + case ZFS_PROP_UTF8ONLY: + *value = 0; + break; + case ZFS_PROP_CASE: + *value = ZFS_CASE_SENSITIVE; + break; + case ZFS_PROP_ACLTYPE: + *value = ZFS_ACLTYPE_OFF; + break; + default: + return (error); + } + error = 0; + } + + /* + * If one of the methods for getting the property value above worked, + * copy it into the objset_t's cache. + */ + if (error == 0 && cached_copy != NULL) { + *cached_copy = *value; + } + + return (error); +} + +/* + * Return true if the corresponding vfs's unmounted flag is set. + * Otherwise return false. + * If this function returns true we know VFS unmount has been initiated. + */ +boolean_t +zfs_get_vfs_flag_unmounted(objset_t *os) +{ + zfsvfs_t *zfvp; + boolean_t unmounted = B_FALSE; + + ASSERT(dmu_objset_type(os) == DMU_OST_ZFS); + + mutex_enter(&os->os_user_ptr_lock); + zfvp = dmu_objset_get_user(os); + if (zfvp != NULL && zfvp->z_unmounted) + unmounted = B_TRUE; + mutex_exit(&os->os_user_ptr_lock); + + return (unmounted); +} + +struct objnode { + avl_node_t node; + uint64_t obj; +}; + +static int +objnode_compare(const void *o1, const void *o2) +{ + const struct objnode *obj1 = o1; + const struct objnode *obj2 = o2; + if (obj1->obj < obj2->obj) + return (-1); + if (obj1->obj > obj2->obj) + return (1); + return (0); +} + +objlist_t * +zfs_get_deleteq(objset_t *os) +{ + objlist_t *deleteq_objlist = objlist_create(); + uint64_t deleteq_obj; + zap_cursor_t zc; + zap_attribute_t za; + dmu_object_info_t doi; + + ASSERT3U(os->os_phys->os_type, ==, DMU_OST_ZFS); + VERIFY0(dmu_object_info(os, MASTER_NODE_OBJ, &doi)); + ASSERT3U(doi.doi_type, ==, DMU_OT_MASTER_NODE); + + VERIFY0(zap_lookup(os, MASTER_NODE_OBJ, + ZFS_UNLINKED_SET, sizeof (uint64_t), 1, &deleteq_obj)); + + /* + * In order to insert objects into the objlist, they must be in sorted + * order. We don't know what order we'll get them out of the ZAP in, so + * we insert them into and remove them from an avl_tree_t to sort them. + */ + avl_tree_t at; + avl_create(&at, objnode_compare, sizeof (struct objnode), + offsetof(struct objnode, node)); + + for (zap_cursor_init(&zc, os, deleteq_obj); + zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { + struct objnode *obj = kmem_zalloc(sizeof (*obj), KM_SLEEP); + obj->obj = za.za_first_integer; + avl_add(&at, obj); + } + zap_cursor_fini(&zc); + + struct objnode *next, *found = avl_first(&at); + while (found != NULL) { + next = AVL_NEXT(&at, found); + objlist_insert(deleteq_objlist, found->obj); + found = next; + } + + void *cookie = NULL; + while ((found = avl_destroy_nodes(&at, &cookie)) != NULL) + kmem_free(found, sizeof (*found)); + avl_destroy(&at); + return (deleteq_objlist); +} + + +void +zfs_init(void) +{ + zfsctl_init(); + zfs_znode_init(); + dmu_objset_register_type(DMU_OST_ZFS, zfs_space_delta_cb); + register_filesystem(&zpl_fs_type); +} + +void +zfs_fini(void) +{ + /* + * we don't use outstanding because zpl_posix_acl_free might add more. 
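
zfs_get_deleteq() above needs its object numbers in sorted order, but the ZAP cursor returns them in hash order, so the kernel code routes them through an AVL tree before inserting into the objlist. A userspace model of the same collect-then-sort step; obj_cmp and collect_sorted are illustrative, and qsort stands in for avl_create/avl_add:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

int
obj_cmp(const void *a, const void *b)
{
	uint64_t x = *(const uint64_t *)a, y = *(const uint64_t *)b;

	return (x < y ? -1 : (x > y ? 1 : 0));
}

/* Collect entries in whatever order they arrive, then emit them sorted. */
void
collect_sorted(const uint64_t *unordered, size_t n)
{
	uint64_t *objs = malloc(n * sizeof (uint64_t));

	if (objs == NULL)
		return;
	for (size_t i = 0; i < n; i++)
		objs[i] = unordered[i];
	qsort(objs, n, sizeof (uint64_t), obj_cmp);
	for (size_t i = 0; i < n; i++)
		printf("insert object %llu\n", (unsigned long long)objs[i]);
	free(objs);
}

int
main(void)
{
	uint64_t hash_order[] = { 42, 7, 1031, 9 };	/* as a ZAP might return them */

	collect_sorted(hash_order, sizeof (hash_order) / sizeof (hash_order[0]));
	return (0);
}
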
+ */ + taskq_wait(system_delay_taskq); + taskq_wait(system_taskq); + unregister_filesystem(&zpl_fs_type); + zfs_znode_fini(); + zfsctl_fini(); +} + +#if defined(_KERNEL) +EXPORT_SYMBOL(zfs_suspend_fs); +EXPORT_SYMBOL(zfs_resume_fs); +EXPORT_SYMBOL(zfs_userspace_one); +EXPORT_SYMBOL(zfs_userspace_many); +EXPORT_SYMBOL(zfs_set_userquota); +EXPORT_SYMBOL(zfs_id_overblockquota); +EXPORT_SYMBOL(zfs_id_overobjquota); +EXPORT_SYMBOL(zfs_id_overquota); +EXPORT_SYMBOL(zfs_set_version); +EXPORT_SYMBOL(zfsvfs_create); +EXPORT_SYMBOL(zfsvfs_free); +EXPORT_SYMBOL(zfs_is_readonly); +EXPORT_SYMBOL(zfs_domount); +EXPORT_SYMBOL(zfs_preumount); +EXPORT_SYMBOL(zfs_umount); +EXPORT_SYMBOL(zfs_remount); +EXPORT_SYMBOL(zfs_statvfs); +EXPORT_SYMBOL(zfs_vget); +EXPORT_SYMBOL(zfs_prune); +#endif diff --git a/module/os/linux/zfs/zfs_vnops.c b/module/os/linux/zfs/zfs_vnops.c new file mode 100644 index 000000000..de7b59935 --- /dev/null +++ b/module/os/linux/zfs/zfs_vnops.c @@ -0,0 +1,5275 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. + * Copyright (c) 2015 by Chunwei Chen. All rights reserved. + * Copyright 2017 Nexenta Systems, Inc. + */ + +/* Portions Copyright 2007 Jeremy Teo */ +/* Portions Copyright 2010 Robert Milkowski */ + + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/time.h> +#include <sys/sysmacros.h> +#include <sys/vfs.h> +#include <sys/file.h> +#include <sys/stat.h> +#include <sys/kmem.h> +#include <sys/taskq.h> +#include <sys/uio.h> +#include <sys/vmsystm.h> +#include <sys/atomic.h> +#include <sys/pathname.h> +#include <sys/cmn_err.h> +#include <sys/errno.h> +#include <sys/zfs_dir.h> +#include <sys/zfs_acl.h> +#include <sys/zfs_ioctl.h> +#include <sys/fs/zfs.h> +#include <sys/dmu.h> +#include <sys/dmu_objset.h> +#include <sys/spa.h> +#include <sys/txg.h> +#include <sys/dbuf.h> +#include <sys/zap.h> +#include <sys/sa.h> +#include <sys/policy.h> +#include <sys/sunddi.h> +#include <sys/sid.h> +#include <sys/mode.h> +#include <sys/zfs_ctldir.h> +#include <sys/zfs_fuid.h> +#include <sys/zfs_sa.h> +#include <sys/zfs_vnops.h> +#include <sys/zfs_rlock.h> +#include <sys/cred.h> +#include <sys/zpl.h> +#include <sys/zil.h> +#include <sys/sa_impl.h> + +/* + * Programming rules. + * + * Each vnode op performs some logical unit of work. To do this, the ZPL must + * properly lock its in-core state, create a DMU transaction, do the work, + * record this work in the intent log (ZIL), commit the DMU transaction, + * and wait for the intent log to commit if it is a synchronous operation. + * Moreover, the vnode ops must work in both normal and log replay context. 
+ * The ordering of events is important to avoid deadlocks and references + * to freed memory. The example below illustrates the following Big Rules: + * + * (1) A check must be made in each zfs thread for a mounted file system. + * This is done avoiding races using ZFS_ENTER(zfsvfs). + * A ZFS_EXIT(zfsvfs) is needed before all returns. Any znodes + * must be checked with ZFS_VERIFY_ZP(zp). Both of these macros + * can return EIO from the calling function. + * + * (2) iput() should always be the last thing except for zil_commit() + * (if necessary) and ZFS_EXIT(). This is for 3 reasons: + * First, if it's the last reference, the vnode/znode + * can be freed, so the zp may point to freed memory. Second, the last + * reference will call zfs_zinactive(), which may induce a lot of work -- + * pushing cached pages (which acquires range locks) and syncing out + * cached atime changes. Third, zfs_zinactive() may require a new tx, + * which could deadlock the system if you were already holding one. + * If you must call iput() within a tx then use zfs_iput_async(). + * + * (3) All range locks must be grabbed before calling dmu_tx_assign(), + * as they can span dmu_tx_assign() calls. + * + * (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to + * dmu_tx_assign(). This is critical because we don't want to block + * while holding locks. + * + * If no ZPL locks are held (aside from ZFS_ENTER()), use TXG_WAIT. This + * reduces lock contention and CPU usage when we must wait (note that if + * throughput is constrained by the storage, nearly every transaction + * must wait). + * + * Note, in particular, that if a lock is sometimes acquired before + * the tx assigns, and sometimes after (e.g. z_lock), then failing + * to use a non-blocking assign can deadlock the system. The scenario: + * + * Thread A has grabbed a lock before calling dmu_tx_assign(). + * Thread B is in an already-assigned tx, and blocks for this lock. + * Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open() + * forever, because the previous txg can't quiesce until B's tx commits. + * + * If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT, + * then drop all locks, call dmu_tx_wait(), and try again. On subsequent + * calls to dmu_tx_assign(), pass TXG_NOTHROTTLE in addition to TXG_NOWAIT, + * to indicate that this operation has already called dmu_tx_wait(). + * This will ensure that we don't retry forever, waiting a short bit + * each time. + * + * (5) If the operation succeeded, generate the intent log entry for it + * before dropping locks. This ensures that the ordering of events + * in the intent log matches the order in which they actually occurred. + * During ZIL replay the zfs_log_* functions will update the sequence + * number to indicate the zil transaction has replayed. + * + * (6) At the end of each vnode op, the DMU tx must always commit, + * regardless of whether there were any errors. + * + * (7) After dropping all locks, invoke zil_commit(zilog, foid) + * to ensure that synchronous semantics are provided when necessary. + * + * In general, this is how things should be ordered in each vnode op: + * + * ZFS_ENTER(zfsvfs); // exit if unmounted + * top: + * zfs_dirent_lock(&dl, ...) // lock directory entry (may igrab()) + * rw_enter(...); // grab any other locks you need + * tx = dmu_tx_create(...); // get DMU tx + * dmu_tx_hold_*(); // hold each object you might modify + * error = dmu_tx_assign(tx, (waited ? 
TXG_NOTHROTTLE : 0) | TXG_NOWAIT); + * if (error) { + * rw_exit(...); // drop locks + * zfs_dirent_unlock(dl); // unlock directory entry + * iput(...); // release held vnodes + * if (error == ERESTART) { + * waited = B_TRUE; + * dmu_tx_wait(tx); + * dmu_tx_abort(tx); + * goto top; + * } + * dmu_tx_abort(tx); // abort DMU tx + * ZFS_EXIT(zfsvfs); // finished in zfs + * return (error); // really out of space + * } + * error = do_real_work(); // do whatever this VOP does + * if (error == 0) + * zfs_log_*(...); // on success, make ZIL entry + * dmu_tx_commit(tx); // commit DMU tx -- error or not + * rw_exit(...); // drop locks + * zfs_dirent_unlock(dl); // unlock directory entry + * iput(...); // release held vnodes + * zil_commit(zilog, foid); // synchronous when necessary + * ZFS_EXIT(zfsvfs); // finished in zfs + * return (error); // done, report error + */ + +/* + * Virus scanning is unsupported. It would be possible to add a hook + * here to performance the required virus scan. This could be done + * entirely in the kernel or potentially as an update to invoke a + * scanning utility. + */ +static int +zfs_vscan(struct inode *ip, cred_t *cr, int async) +{ + return (0); +} + +/* ARGSUSED */ +int +zfs_open(struct inode *ip, int mode, int flag, cred_t *cr) +{ + znode_t *zp = ITOZ(ip); + zfsvfs_t *zfsvfs = ITOZSB(ip); + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + /* Honor ZFS_APPENDONLY file attribute */ + if ((mode & FMODE_WRITE) && (zp->z_pflags & ZFS_APPENDONLY) && + ((flag & O_APPEND) == 0)) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EPERM)); + } + + /* Virus scan eligible files on open */ + if (!zfs_has_ctldir(zp) && zfsvfs->z_vscan && S_ISREG(ip->i_mode) && + !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) { + if (zfs_vscan(ip, cr, 0) != 0) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EACCES)); + } + } + + /* Keep a count of the synchronous opens in the znode */ + if (flag & O_SYNC) + atomic_inc_32(&zp->z_sync_cnt); + + ZFS_EXIT(zfsvfs); + return (0); +} + +/* ARGSUSED */ +int +zfs_close(struct inode *ip, int flag, cred_t *cr) +{ + znode_t *zp = ITOZ(ip); + zfsvfs_t *zfsvfs = ITOZSB(ip); + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + /* Decrement the synchronous opens in the znode */ + if (flag & O_SYNC) + atomic_dec_32(&zp->z_sync_cnt); + + if (!zfs_has_ctldir(zp) && zfsvfs->z_vscan && S_ISREG(ip->i_mode) && + !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) + VERIFY(zfs_vscan(ip, cr, 1) == 0); + + ZFS_EXIT(zfsvfs); + return (0); +} + +#if defined(SEEK_HOLE) && defined(SEEK_DATA) +/* + * Lseek support for finding holes (cmd == SEEK_HOLE) and + * data (cmd == SEEK_DATA). "off" is an in/out parameter. + */ +static int +zfs_holey_common(struct inode *ip, int cmd, loff_t *off) +{ + znode_t *zp = ITOZ(ip); + uint64_t noff = (uint64_t)*off; /* new offset */ + uint64_t file_sz; + int error; + boolean_t hole; + + file_sz = zp->z_size; + if (noff >= file_sz) { + return (SET_ERROR(ENXIO)); + } + + if (cmd == SEEK_HOLE) + hole = B_TRUE; + else + hole = B_FALSE; + + error = dmu_offset_next(ZTOZSB(zp)->z_os, zp->z_id, hole, &noff); + + if (error == ESRCH) + return (SET_ERROR(ENXIO)); + + /* file was dirty, so fall back to using generic logic */ + if (error == EBUSY) { + if (hole) + *off = file_sz; + + return (0); + } + + /* + * We could find a hole that begins after the logical end-of-file, + * because dmu_offset_next() only works on whole blocks. 
If the + * EOF falls mid-block, then indicate that the "virtual hole" + * at the end of the file begins at the logical EOF, rather than + * at the end of the last block. + */ + if (noff > file_sz) { + ASSERT(hole); + noff = file_sz; + } + + if (noff < *off) + return (error); + *off = noff; + return (error); +} + +int +zfs_holey(struct inode *ip, int cmd, loff_t *off) +{ + znode_t *zp = ITOZ(ip); + zfsvfs_t *zfsvfs = ITOZSB(ip); + int error; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + error = zfs_holey_common(ip, cmd, off); + + ZFS_EXIT(zfsvfs); + return (error); +} +#endif /* SEEK_HOLE && SEEK_DATA */ + +#if defined(_KERNEL) +/* + * When a file is memory mapped, we must keep the IO data synchronized + * between the DMU cache and the memory mapped pages. What this means: + * + * On Write: If we find a memory mapped page, we write to *both* + * the page and the dmu buffer. + */ +static void +update_pages(struct inode *ip, int64_t start, int len, + objset_t *os, uint64_t oid) +{ + struct address_space *mp = ip->i_mapping; + struct page *pp; + uint64_t nbytes; + int64_t off; + void *pb; + + off = start & (PAGE_SIZE-1); + for (start &= PAGE_MASK; len > 0; start += PAGE_SIZE) { + nbytes = MIN(PAGE_SIZE - off, len); + + pp = find_lock_page(mp, start >> PAGE_SHIFT); + if (pp) { + if (mapping_writably_mapped(mp)) + flush_dcache_page(pp); + + pb = kmap(pp); + (void) dmu_read(os, oid, start+off, nbytes, pb+off, + DMU_READ_PREFETCH); + kunmap(pp); + + if (mapping_writably_mapped(mp)) + flush_dcache_page(pp); + + mark_page_accessed(pp); + SetPageUptodate(pp); + ClearPageError(pp); + unlock_page(pp); + put_page(pp); + } + + len -= nbytes; + off = 0; + } +} + +/* + * When a file is memory mapped, we must keep the IO data synchronized + * between the DMU cache and the memory mapped pages. What this means: + * + * On Read: We "read" preferentially from memory mapped pages, + * else we default from the dmu buffer. + * + * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when + * the file is memory mapped. + */ +static int +mappedread(struct inode *ip, int nbytes, uio_t *uio) +{ + struct address_space *mp = ip->i_mapping; + struct page *pp; + znode_t *zp = ITOZ(ip); + int64_t start, off; + uint64_t bytes; + int len = nbytes; + int error = 0; + void *pb; + + start = uio->uio_loffset; + off = start & (PAGE_SIZE-1); + for (start &= PAGE_MASK; len > 0; start += PAGE_SIZE) { + bytes = MIN(PAGE_SIZE - off, len); + + pp = find_lock_page(mp, start >> PAGE_SHIFT); + if (pp) { + ASSERT(PageUptodate(pp)); + unlock_page(pp); + + pb = kmap(pp); + error = uiomove(pb + off, bytes, UIO_READ, uio); + kunmap(pp); + + if (mapping_writably_mapped(mp)) + flush_dcache_page(pp); + + mark_page_accessed(pp); + put_page(pp); + } else { + error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), + uio, bytes); + } + + len -= bytes; + off = 0; + if (error) + break; + } + return (error); +} +#endif /* _KERNEL */ + +unsigned long zfs_read_chunk_size = 1024 * 1024; /* Tunable */ +unsigned long zfs_delete_blocks = DMU_MAX_DELETEBLKCNT; + +/* + * Read bytes from specified file into supplied buffer. + * + * IN: ip - inode of file to be read from. + * uio - structure supplying read location, range info, + * and return buffer. + * ioflag - FSYNC flags; used to provide FRSYNC semantics. + * O_DIRECT flag; used to bypass page cache. + * cr - credentials of caller. + * + * OUT: uio - updated offset and range, buffer filled. + * + * RETURN: 0 on success, error code on failure. 
+ * + * Side Effects: + * inode - atime updated if byte count > 0 + */ +/* ARGSUSED */ +int +zfs_read(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr) +{ + int error = 0; + boolean_t frsync = B_FALSE; + + znode_t *zp = ITOZ(ip); + zfsvfs_t *zfsvfs = ITOZSB(ip); + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + if (zp->z_pflags & ZFS_AV_QUARANTINED) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EACCES)); + } + + /* + * Validate file offset + */ + if (uio->uio_loffset < (offset_t)0) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EINVAL)); + } + + /* + * Fasttrack empty reads + */ + if (uio->uio_resid == 0) { + ZFS_EXIT(zfsvfs); + return (0); + } + +#ifdef FRSYNC + /* + * If we're in FRSYNC mode, sync out this znode before reading it. + * Only do this for non-snapshots. + * + * Some platforms do not support FRSYNC and instead map it + * to FSYNC, which results in unnecessary calls to zil_commit. We + * only honor FRSYNC requests on platforms which support it. + */ + frsync = !!(ioflag & FRSYNC); +#endif + if (zfsvfs->z_log && + (frsync || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)) + zil_commit(zfsvfs->z_log, zp->z_id); + + /* + * Lock the range against changes. + */ + locked_range_t *lr = rangelock_enter(&zp->z_rangelock, + uio->uio_loffset, uio->uio_resid, RL_READER); + + /* + * If we are reading past end-of-file we can skip + * to the end; but we might still need to set atime. + */ + if (uio->uio_loffset >= zp->z_size) { + error = 0; + goto out; + } + + ASSERT(uio->uio_loffset < zp->z_size); + ssize_t n = MIN(uio->uio_resid, zp->z_size - uio->uio_loffset); + ssize_t start_resid = n; + +#ifdef HAVE_UIO_ZEROCOPY + xuio_t *xuio = NULL; + if ((uio->uio_extflg == UIO_XUIO) && + (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) { + int nblk; + int blksz = zp->z_blksz; + uint64_t offset = uio->uio_loffset; + + xuio = (xuio_t *)uio; + if ((ISP2(blksz))) { + nblk = (P2ROUNDUP(offset + n, blksz) - P2ALIGN(offset, + blksz)) / blksz; + } else { + ASSERT(offset + n <= blksz); + nblk = 1; + } + (void) dmu_xuio_init(xuio, nblk); + + if (vn_has_cached_data(ip)) { + /* + * For simplicity, we always allocate a full buffer + * even if we only expect to read a portion of a block. + */ + while (--nblk >= 0) { + (void) dmu_xuio_add(xuio, + dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), + blksz), 0, blksz); + } + } + } +#endif /* HAVE_UIO_ZEROCOPY */ + + while (n > 0) { + ssize_t nbytes = MIN(n, zfs_read_chunk_size - + P2PHASE(uio->uio_loffset, zfs_read_chunk_size)); + + if (zp->z_is_mapped && !(ioflag & O_DIRECT)) { + error = mappedread(ip, nbytes, uio); + } else { + error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), + uio, nbytes); + } + + if (error) { + /* convert checksum errors into IO errors */ + if (error == ECKSUM) + error = SET_ERROR(EIO); + break; + } + + n -= nbytes; + } + + int64_t nread = start_resid - n; + dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, nread); + task_io_account_read(nread); +out: + rangelock_exit(lr); + + ZFS_EXIT(zfsvfs); + return (error); +} + +/* + * Write the bytes to a file. + * + * IN: ip - inode of file to be written to. + * uio - structure supplying write location, range info, + * and data buffer. + * ioflag - FAPPEND flag set if in append mode. + * O_DIRECT flag; used to bypass page cache. + * cr - credentials of caller. + * + * OUT: uio - updated offset and range. 
+ * + * RETURN: 0 if success + * error code if failure + * + * Timestamps: + * ip - ctime|mtime updated if byte count > 0 + */ + +/* ARGSUSED */ +int +zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr) +{ + int error = 0; + ssize_t start_resid = uio->uio_resid; + + /* + * Fasttrack empty write + */ + ssize_t n = start_resid; + if (n == 0) + return (0); + + rlim64_t limit = uio->uio_limit; + if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T) + limit = MAXOFFSET_T; + + znode_t *zp = ITOZ(ip); + zfsvfs_t *zfsvfs = ZTOZSB(zp); + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + sa_bulk_attr_t bulk[4]; + int count = 0; + uint64_t mtime[2], ctime[2]; + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, + &zp->z_size, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, + &zp->z_pflags, 8); + + /* + * Callers might not be able to detect properly that we are read-only, + * so check it explicitly here. + */ + if (zfs_is_readonly(zfsvfs)) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EROFS)); + } + + /* + * If immutable or not appending then return EPERM + */ + if ((zp->z_pflags & (ZFS_IMMUTABLE | ZFS_READONLY)) || + ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) && + (uio->uio_loffset < zp->z_size))) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EPERM)); + } + + /* + * Validate file offset + */ + offset_t woff = ioflag & FAPPEND ? zp->z_size : uio->uio_loffset; + if (woff < 0) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EINVAL)); + } + + int max_blksz = zfsvfs->z_max_blksz; + xuio_t *xuio = NULL; + + /* + * Pre-fault the pages to ensure slow (eg NFS) pages + * don't hold up txg. + * Skip this if uio contains loaned arc_buf. + */ +#ifdef HAVE_UIO_ZEROCOPY + if ((uio->uio_extflg == UIO_XUIO) && + (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) + xuio = (xuio_t *)uio; + else +#endif + if (uio_prefaultpages(MIN(n, max_blksz), uio)) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EFAULT)); + } + + /* + * If in append mode, set the io offset pointer to eof. + */ + locked_range_t *lr; + if (ioflag & FAPPEND) { + /* + * Obtain an appending range lock to guarantee file append + * semantics. We reset the write offset once we have the lock. + */ + lr = rangelock_enter(&zp->z_rangelock, 0, n, RL_APPEND); + woff = lr->lr_offset; + if (lr->lr_length == UINT64_MAX) { + /* + * We overlocked the file because this write will cause + * the file block size to increase. + * Note that zp_size cannot change with this lock held. + */ + woff = zp->z_size; + } + uio->uio_loffset = woff; + } else { + /* + * Note that if the file block size will change as a result of + * this write, then this range lock will lock the entire file + * so that we can re-write the block safely. + */ + lr = rangelock_enter(&zp->z_rangelock, woff, n, RL_WRITER); + } + + if (woff >= limit) { + rangelock_exit(lr); + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EFBIG)); + } + + if ((woff + n) > limit || woff > (limit - n)) + n = limit - woff; + + /* Will this write extend the file length? */ + int write_eof = (woff + n > zp->z_size); + + uint64_t end_size = MAX(zp->z_size, woff + n); + zilog_t *zilog = zfsvfs->z_log; +#ifdef HAVE_UIO_ZEROCOPY + int i_iov = 0; + const iovec_t *iovp = uio->uio_iov; + ASSERTV(int iovcnt = uio->uio_iovcnt); +#endif + + + /* + * Write the file in reasonable size chunks. 
Each chunk is written + * in a separate transaction; this keeps the intent log records small + * and allows us to do more fine-grained space accounting. + */ + while (n > 0) { + woff = uio->uio_loffset; + + if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT, + KUID_TO_SUID(ip->i_uid)) || + zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT, + KGID_TO_SGID(ip->i_gid)) || + (zp->z_projid != ZFS_DEFAULT_PROJID && + zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT, + zp->z_projid))) { + error = SET_ERROR(EDQUOT); + break; + } + + arc_buf_t *abuf = NULL; + const iovec_t *aiov = NULL; + if (xuio) { +#ifdef HAVE_UIO_ZEROCOPY + ASSERT(i_iov < iovcnt); + ASSERT3U(uio->uio_segflg, !=, UIO_BVEC); + aiov = &iovp[i_iov]; + abuf = dmu_xuio_arcbuf(xuio, i_iov); + dmu_xuio_clear(xuio, i_iov); + ASSERT((aiov->iov_base == abuf->b_data) || + ((char *)aiov->iov_base - (char *)abuf->b_data + + aiov->iov_len == arc_buf_size(abuf))); + i_iov++; +#endif + } else if (n >= max_blksz && woff >= zp->z_size && + P2PHASE(woff, max_blksz) == 0 && + zp->z_blksz == max_blksz) { + /* + * This write covers a full block. "Borrow" a buffer + * from the dmu so that we can fill it before we enter + * a transaction. This avoids the possibility of + * holding up the transaction if the data copy hangs + * up on a pagefault (e.g., from an NFS server mapping). + */ + size_t cbytes; + + abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), + max_blksz); + ASSERT(abuf != NULL); + ASSERT(arc_buf_size(abuf) == max_blksz); + if ((error = uiocopy(abuf->b_data, max_blksz, + UIO_WRITE, uio, &cbytes))) { + dmu_return_arcbuf(abuf); + break; + } + ASSERT(cbytes == max_blksz); + } + + /* + * Start a transaction. + */ + dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + dmu_buf_impl_t *db = (dmu_buf_impl_t *)sa_get_db(zp->z_sa_hdl); + DB_DNODE_ENTER(db); + dmu_tx_hold_write_by_dnode(tx, DB_DNODE(db), woff, + MIN(n, max_blksz)); + DB_DNODE_EXIT(db); + zfs_sa_upgrade_txholds(tx, zp); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + if (abuf != NULL) + dmu_return_arcbuf(abuf); + break; + } + + /* + * If rangelock_enter() over-locked we grow the blocksize + * and then reduce the lock range. This will only happen + * on the first iteration since rangelock_reduce() will + * shrink down lr_length to the appropriate size. + */ + if (lr->lr_length == UINT64_MAX) { + uint64_t new_blksz; + + if (zp->z_blksz > max_blksz) { + /* + * File's blocksize is already larger than the + * "recordsize" property. Only let it grow to + * the next power of 2. + */ + ASSERT(!ISP2(zp->z_blksz)); + new_blksz = MIN(end_size, + 1 << highbit64(zp->z_blksz)); + } else { + new_blksz = MIN(end_size, max_blksz); + } + zfs_grow_blocksize(zp, new_blksz, tx); + rangelock_reduce(lr, woff, n); + } + + /* + * XXX - should we really limit each write to z_max_blksz? + * Perhaps we should use SPA_MAXBLOCKSIZE chunks? 
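
The chunk size chosen just below — MIN(n, max_blksz - P2PHASE(woff, max_blksz)) — simply trims each iteration so a chunk never crosses a block boundary. A tiny model of that arithmetic, assuming a power-of-two block size; chunk_len is an illustrative name:

#include <stdint.h>
#include <stdio.h>

/* Bytes we may write this iteration without crossing a block boundary. */
uint64_t
chunk_len(uint64_t off, uint64_t remaining, uint64_t blksz)
{
	uint64_t in_block = off & (blksz - 1);	/* P2PHASE(off, blksz) */
	uint64_t to_boundary = blksz - in_block;

	return (remaining < to_boundary ? remaining : to_boundary);
}

int
main(void)
{
	uint64_t off = 130048, remaining = 300000, blksz = 131072;

	while (remaining > 0) {
		uint64_t nbytes = chunk_len(off, remaining, blksz);

		printf("write %llu bytes at %llu\n",
		    (unsigned long long)nbytes, (unsigned long long)off);
		off += nbytes;
		remaining -= nbytes;
	}
	return (0);
}

The first chunk only reaches the next block boundary (1024 bytes in this example); every later chunk is a full block until the tail, which mirrors why each chunk can live in its own transaction.
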
+ */ + ssize_t nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz)); + + ssize_t tx_bytes; + if (abuf == NULL) { + tx_bytes = uio->uio_resid; + uio->uio_fault_disable = B_TRUE; + error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl), + uio, nbytes, tx); + uio->uio_fault_disable = B_FALSE; + if (error == EFAULT) { + dmu_tx_commit(tx); + if (uio_prefaultpages(MIN(n, max_blksz), uio)) { + break; + } + continue; + } else if (error != 0) { + dmu_tx_commit(tx); + break; + } + tx_bytes -= uio->uio_resid; + } else { + tx_bytes = nbytes; + ASSERT(xuio == NULL || tx_bytes == aiov->iov_len); + /* + * If this is not a full block write, but we are + * extending the file past EOF and this data starts + * block-aligned, use assign_arcbuf(). Otherwise, + * write via dmu_write(). + */ + if (tx_bytes < max_blksz && (!write_eof || + aiov->iov_base != abuf->b_data)) { + ASSERT(xuio); + dmu_write(zfsvfs->z_os, zp->z_id, woff, + /* cppcheck-suppress nullPointer */ + aiov->iov_len, aiov->iov_base, tx); + dmu_return_arcbuf(abuf); + xuio_stat_wbuf_copied(); + } else { + ASSERT(xuio || tx_bytes == max_blksz); + error = dmu_assign_arcbuf_by_dbuf( + sa_get_db(zp->z_sa_hdl), woff, abuf, tx); + if (error != 0) { + dmu_return_arcbuf(abuf); + dmu_tx_commit(tx); + break; + } + } + ASSERT(tx_bytes <= uio->uio_resid); + uioskip(uio, tx_bytes); + } + if (tx_bytes && zp->z_is_mapped && !(ioflag & O_DIRECT)) { + update_pages(ip, woff, + tx_bytes, zfsvfs->z_os, zp->z_id); + } + + /* + * If we made no progress, we're done. If we made even + * partial progress, update the znode and ZIL accordingly. + */ + if (tx_bytes == 0) { + (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs), + (void *)&zp->z_size, sizeof (uint64_t), tx); + dmu_tx_commit(tx); + ASSERT(error != 0); + break; + } + + /* + * Clear Set-UID/Set-GID bits on successful write if not + * privileged and at least one of the execute bits is set. + * + * It would be nice to do this after all writes have + * been done, but that would still expose the ISUID/ISGID + * to another app after the partial write is committed. + * + * Note: we don't call zfs_fuid_map_id() here because + * user 0 is not an ephemeral uid. + */ + mutex_enter(&zp->z_acl_lock); + uint32_t uid = KUID_TO_SUID(ip->i_uid); + if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) | + (S_IXUSR >> 6))) != 0 && + (zp->z_mode & (S_ISUID | S_ISGID)) != 0 && + secpolicy_vnode_setid_retain(cr, + ((zp->z_mode & S_ISUID) != 0 && uid == 0)) != 0) { + uint64_t newmode; + zp->z_mode &= ~(S_ISUID | S_ISGID); + ip->i_mode = newmode = zp->z_mode; + (void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), + (void *)&newmode, sizeof (uint64_t), tx); + } + mutex_exit(&zp->z_acl_lock); + + zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime); + + /* + * Update the file size (zp_size) if it has changed; + * account for possible concurrent updates. + */ + while ((end_size = zp->z_size) < uio->uio_loffset) { + (void) atomic_cas_64(&zp->z_size, end_size, + uio->uio_loffset); + ASSERT(error == 0); + } + /* + * If we are replaying and eof is non zero then force + * the file size to the specified eof. Note, there's no + * concurrency during replay. 
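
The size update a few lines above — while ((end_size = zp->z_size) < uio->uio_loffset) atomic_cas_64(...) — is a lock-free "raise this value monotonically" loop that tolerates concurrent writers. The same idea written with C11 atomics for illustration; grow_to is a hypothetical helper, not a ZFS function:

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

/* Raise *size to at least new_size, tolerating concurrent updates. */
void
grow_to(_Atomic uint64_t *size, uint64_t new_size)
{
	uint64_t cur = atomic_load(size);

	while (cur < new_size &&
	    !atomic_compare_exchange_weak(size, &cur, new_size)) {
		/*
		 * The failed exchange reloaded cur for us; if another
		 * writer already pushed the size past new_size we stop.
		 */
	}
}

int
main(void)
{
	_Atomic uint64_t size = 1000;

	grow_to(&size, 4096);
	grow_to(&size, 2048);	/* no effect: the size only ever grows */
	printf("size=%llu\n", (unsigned long long)atomic_load(&size));
	return (0);
}
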
+ */ + if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0) + zp->z_size = zfsvfs->z_replay_eof; + + error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); + + zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag, + NULL, NULL); + dmu_tx_commit(tx); + + if (error != 0) + break; + ASSERT(tx_bytes == nbytes); + n -= nbytes; + + if (!xuio && n > 0) { + if (uio_prefaultpages(MIN(n, max_blksz), uio)) { + error = EFAULT; + break; + } + } + } + + zfs_inode_update(zp); + rangelock_exit(lr); + + /* + * If we're in replay mode, or we made no progress, return error. + * Otherwise, it's at least a partial write, so it's successful. + */ + if (zfsvfs->z_replay || uio->uio_resid == start_resid) { + ZFS_EXIT(zfsvfs); + return (error); + } + + if (ioflag & (FSYNC | FDSYNC) || + zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, zp->z_id); + + int64_t nwritten = start_resid - uio->uio_resid; + dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, nwritten); + task_io_account_write(nwritten); + + ZFS_EXIT(zfsvfs); + return (0); +} + +/* + * Drop a reference on the passed inode asynchronously. This ensures + * that the caller will never drop the last reference on an inode in + * the current context. Doing so while holding open a tx could result + * in a deadlock if iput_final() re-enters the filesystem code. + */ +void +zfs_iput_async(struct inode *ip) +{ + objset_t *os = ITOZSB(ip)->z_os; + + ASSERT(atomic_read(&ip->i_count) > 0); + ASSERT(os != NULL); + + if (atomic_read(&ip->i_count) == 1) + VERIFY(taskq_dispatch(dsl_pool_iput_taskq(dmu_objset_pool(os)), + (task_func_t *)iput, ip, TQ_SLEEP) != TASKQID_INVALID); + else + iput(ip); +} + +/* ARGSUSED */ +void +zfs_get_done(zgd_t *zgd, int error) +{ + znode_t *zp = zgd->zgd_private; + + if (zgd->zgd_db) + dmu_buf_rele(zgd->zgd_db, zgd); + + rangelock_exit(zgd->zgd_lr); + + /* + * Release the vnode asynchronously as we currently have the + * txg stopped from syncing. + */ + zfs_iput_async(ZTOI(zp)); + + kmem_free(zgd, sizeof (zgd_t)); +} + +#ifdef DEBUG +static int zil_fault_io = 0; +#endif + +/* + * Get data to generate a TX_WRITE intent log record. + */ +int +zfs_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio) +{ + zfsvfs_t *zfsvfs = arg; + objset_t *os = zfsvfs->z_os; + znode_t *zp; + uint64_t object = lr->lr_foid; + uint64_t offset = lr->lr_offset; + uint64_t size = lr->lr_length; + dmu_buf_t *db; + zgd_t *zgd; + int error = 0; + + ASSERT3P(lwb, !=, NULL); + ASSERT3P(zio, !=, NULL); + ASSERT3U(size, !=, 0); + + /* + * Nothing to do if the file has been removed + */ + if (zfs_zget(zfsvfs, object, &zp) != 0) + return (SET_ERROR(ENOENT)); + if (zp->z_unlinked) { + /* + * Release the vnode asynchronously as we currently have the + * txg stopped from syncing. + */ + zfs_iput_async(ZTOI(zp)); + return (SET_ERROR(ENOENT)); + } + + zgd = kmem_zalloc(sizeof (zgd_t), KM_SLEEP); + zgd->zgd_lwb = lwb; + zgd->zgd_private = zp; + + /* + * Write records come in two flavors: immediate and indirect. + * For small writes it's cheaper to store the data with the + * log record (immediate); for large writes it's cheaper to + * sync the data and get a pointer to it (indirect) so that + * we don't have to write the data twice. 
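
zfs_iput_async(), shown a little earlier in this hunk, embodies another small pattern: if you might hold the last reference, hand the drop to deferred context instead of doing it inline, because the final put can trigger heavyweight teardown or re-enter the filesystem in a context where that is unsafe. A compact sketch with C11 atomics; obj_t, put_async and the defer callback are illustrative only:

#include <stdatomic.h>
#include <stdio.h>

typedef struct obj {
	_Atomic int refcount;
	const char *name;
} obj_t;

typedef void (*defer_fn_t)(obj_t *);

/* Stand-in for dispatching the final put to a worker thread / taskq. */
void
defer_final_put(obj_t *o)
{
	printf("deferring final put of %s\n", o->name);
}

void
put_async(obj_t *o, defer_fn_t defer)
{
	/*
	 * Possibly the last reference: let deferred context drop it so
	 * the expensive release never runs here.  Otherwise a plain
	 * decrement is all that is needed.
	 */
	if (atomic_load(&o->refcount) == 1)
		defer(o);
	else
		atomic_fetch_sub(&o->refcount, 1);
}

int
main(void)
{
	obj_t o = { .refcount = 1, .name = "inode#42" };

	put_async(&o, defer_final_put);
	return (0);
}
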
+ */ + if (buf != NULL) { /* immediate write */ + zgd->zgd_lr = rangelock_enter(&zp->z_rangelock, + offset, size, RL_READER); + /* test for truncation needs to be done while range locked */ + if (offset >= zp->z_size) { + error = SET_ERROR(ENOENT); + } else { + error = dmu_read(os, object, offset, size, buf, + DMU_READ_NO_PREFETCH); + } + ASSERT(error == 0 || error == ENOENT); + } else { /* indirect write */ + /* + * Have to lock the whole block to ensure when it's + * written out and its checksum is being calculated + * that no one can change the data. We need to re-check + * blocksize after we get the lock in case it's changed! + */ + for (;;) { + uint64_t blkoff; + size = zp->z_blksz; + blkoff = ISP2(size) ? P2PHASE(offset, size) : offset; + offset -= blkoff; + zgd->zgd_lr = rangelock_enter(&zp->z_rangelock, + offset, size, RL_READER); + if (zp->z_blksz == size) + break; + offset += blkoff; + rangelock_exit(zgd->zgd_lr); + } + /* test for truncation needs to be done while range locked */ + if (lr->lr_offset >= zp->z_size) + error = SET_ERROR(ENOENT); +#ifdef DEBUG + if (zil_fault_io) { + error = SET_ERROR(EIO); + zil_fault_io = 0; + } +#endif + if (error == 0) + error = dmu_buf_hold(os, object, offset, zgd, &db, + DMU_READ_NO_PREFETCH); + + if (error == 0) { + blkptr_t *bp = &lr->lr_blkptr; + + zgd->zgd_db = db; + zgd->zgd_bp = bp; + + ASSERT(db->db_offset == offset); + ASSERT(db->db_size == size); + + error = dmu_sync(zio, lr->lr_common.lrc_txg, + zfs_get_done, zgd); + ASSERT(error || lr->lr_length <= size); + + /* + * On success, we need to wait for the write I/O + * initiated by dmu_sync() to complete before we can + * release this dbuf. We will finish everything up + * in the zfs_get_done() callback. + */ + if (error == 0) + return (0); + + if (error == EALREADY) { + lr->lr_common.lrc_txtype = TX_WRITE2; + /* + * TX_WRITE2 relies on the data previously + * written by the TX_WRITE that caused + * EALREADY. We zero out the BP because + * it is the old, currently-on-disk BP. + */ + zgd->zgd_bp = NULL; + BP_ZERO(bp); + error = 0; + } + } + } + + zfs_get_done(zgd, error); + + return (error); +} + +/*ARGSUSED*/ +int +zfs_access(struct inode *ip, int mode, int flag, cred_t *cr) +{ + znode_t *zp = ITOZ(ip); + zfsvfs_t *zfsvfs = ITOZSB(ip); + int error; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + if (flag & V_ACE_MASK) + error = zfs_zaccess(zp, mode, flag, B_FALSE, cr); + else + error = zfs_zaccess_rwx(zp, mode, flag, cr); + + ZFS_EXIT(zfsvfs); + return (error); +} + +/* + * Lookup an entry in a directory, or an extended attribute directory. + * If it exists, return a held inode reference for it. + * + * IN: dip - inode of directory to search. + * nm - name of entry to lookup. + * flags - LOOKUP_XATTR set if looking for an attribute. + * cr - credentials of caller. + * direntflags - directory lookup flags + * realpnp - returned pathname. + * + * OUT: ipp - inode of located entry, NULL if not found. + * + * RETURN: 0 on success, error code on failure. + * + * Timestamps: + * NA + */ +/* ARGSUSED */ +int +zfs_lookup(struct inode *dip, char *nm, struct inode **ipp, int flags, + cred_t *cr, int *direntflags, pathname_t *realpnp) +{ + znode_t *zdp = ITOZ(dip); + zfsvfs_t *zfsvfs = ITOZSB(dip); + int error = 0; + + /* + * Fast path lookup, however we must skip DNLC lookup + * for case folding or normalizing lookups because the + * DNLC code only stores the passed in name. 
This means + * creating 'a' and removing 'A' on a case insensitive + * file system would work, but DNLC still thinks 'a' + * exists and won't let you create it again on the next + * pass through fast path. + */ + if (!(flags & (LOOKUP_XATTR | FIGNORECASE))) { + + if (!S_ISDIR(dip->i_mode)) { + return (SET_ERROR(ENOTDIR)); + } else if (zdp->z_sa_hdl == NULL) { + return (SET_ERROR(EIO)); + } + + if (nm[0] == 0 || (nm[0] == '.' && nm[1] == '\0')) { + error = zfs_fastaccesschk_execute(zdp, cr); + if (!error) { + *ipp = dip; + igrab(*ipp); + return (0); + } + return (error); + } + } + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zdp); + + *ipp = NULL; + + if (flags & LOOKUP_XATTR) { + /* + * We don't allow recursive attributes.. + * Maybe someday we will. + */ + if (zdp->z_pflags & ZFS_XATTR) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EINVAL)); + } + + if ((error = zfs_get_xattrdir(zdp, ipp, cr, flags))) { + ZFS_EXIT(zfsvfs); + return (error); + } + + /* + * Do we have permission to get into attribute directory? + */ + + if ((error = zfs_zaccess(ITOZ(*ipp), ACE_EXECUTE, 0, + B_FALSE, cr))) { + iput(*ipp); + *ipp = NULL; + } + + ZFS_EXIT(zfsvfs); + return (error); + } + + if (!S_ISDIR(dip->i_mode)) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(ENOTDIR)); + } + + /* + * Check accessibility of directory. + */ + + if ((error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr))) { + ZFS_EXIT(zfsvfs); + return (error); + } + + if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm), + NULL, U8_VALIDATE_ENTIRE, &error) < 0) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EILSEQ)); + } + + error = zfs_dirlook(zdp, nm, ipp, flags, direntflags, realpnp); + if ((error == 0) && (*ipp)) + zfs_inode_update(ITOZ(*ipp)); + + ZFS_EXIT(zfsvfs); + return (error); +} + +/* + * Attempt to create a new entry in a directory. If the entry + * already exists, truncate the file if permissible, else return + * an error. Return the ip of the created or trunc'd file. + * + * IN: dip - inode of directory to put new file entry in. + * name - name of new file entry. + * vap - attributes of new file. + * excl - flag indicating exclusive or non-exclusive mode. + * mode - mode to open file with. + * cr - credentials of caller. + * flag - file flag. + * vsecp - ACL to be set + * + * OUT: ipp - inode of created or trunc'd entry. + * + * RETURN: 0 on success, error code on failure. 
+ * + * Timestamps: + * dip - ctime|mtime updated if new entry created + * ip - ctime|mtime always, atime if new + */ + +/* ARGSUSED */ +int +zfs_create(struct inode *dip, char *name, vattr_t *vap, int excl, + int mode, struct inode **ipp, cred_t *cr, int flag, vsecattr_t *vsecp) +{ + znode_t *zp, *dzp = ITOZ(dip); + zfsvfs_t *zfsvfs = ITOZSB(dip); + zilog_t *zilog; + objset_t *os; + zfs_dirlock_t *dl; + dmu_tx_t *tx; + int error; + uid_t uid; + gid_t gid; + zfs_acl_ids_t acl_ids; + boolean_t fuid_dirtied; + boolean_t have_acl = B_FALSE; + boolean_t waited = B_FALSE; + + /* + * If we have an ephemeral id, ACL, or XVATTR then + * make sure file system is at proper version + */ + + gid = crgetgid(cr); + uid = crgetuid(cr); + + if (zfsvfs->z_use_fuids == B_FALSE && + (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) + return (SET_ERROR(EINVAL)); + + if (name == NULL) + return (SET_ERROR(EINVAL)); + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(dzp); + os = zfsvfs->z_os; + zilog = zfsvfs->z_log; + + if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), + NULL, U8_VALIDATE_ENTIRE, &error) < 0) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EILSEQ)); + } + + if (vap->va_mask & ATTR_XVATTR) { + if ((error = secpolicy_xvattr((xvattr_t *)vap, + crgetuid(cr), cr, vap->va_mode)) != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + } + +top: + *ipp = NULL; + if (*name == '\0') { + /* + * Null component name refers to the directory itself. + */ + igrab(dip); + zp = dzp; + dl = NULL; + error = 0; + } else { + /* possible igrab(zp) */ + int zflg = 0; + + if (flag & FIGNORECASE) + zflg |= ZCILOOK; + + error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, + NULL, NULL); + if (error) { + if (have_acl) + zfs_acl_ids_free(&acl_ids); + if (strcmp(name, "..") == 0) + error = SET_ERROR(EISDIR); + ZFS_EXIT(zfsvfs); + return (error); + } + } + + if (zp == NULL) { + uint64_t txtype; + uint64_t projid = ZFS_DEFAULT_PROJID; + + /* + * Create a new file object and update the directory + * to reference it. + */ + if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) { + if (have_acl) + zfs_acl_ids_free(&acl_ids); + goto out; + } + + /* + * We only support the creation of regular files in + * extended attribute directories. + */ + + if ((dzp->z_pflags & ZFS_XATTR) && !S_ISREG(vap->va_mode)) { + if (have_acl) + zfs_acl_ids_free(&acl_ids); + error = SET_ERROR(EINVAL); + goto out; + } + + if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap, + cr, vsecp, &acl_ids)) != 0) + goto out; + have_acl = B_TRUE; + + if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode)) + projid = zfs_inherit_projid(dzp); + if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, projid)) { + zfs_acl_ids_free(&acl_ids); + error = SET_ERROR(EDQUOT); + goto out; + } + + tx = dmu_tx_create(os); + + dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + + ZFS_SA_BASE_ATTR_SIZE); + + fuid_dirtied = zfsvfs->z_fuid_dirty; + if (fuid_dirtied) + zfs_fuid_txhold(zfsvfs, tx); + dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); + dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); + if (!zfsvfs->z_use_sa && + acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, + 0, acl_ids.z_aclp->z_acl_bytes); + } + + error = dmu_tx_assign(tx, + (waited ? 
TXG_NOTHROTTLE : 0) | TXG_NOWAIT); + if (error) { + zfs_dirent_unlock(dl); + if (error == ERESTART) { + waited = B_TRUE; + dmu_tx_wait(tx); + dmu_tx_abort(tx); + goto top; + } + zfs_acl_ids_free(&acl_ids); + dmu_tx_abort(tx); + ZFS_EXIT(zfsvfs); + return (error); + } + zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); + + error = zfs_link_create(dl, zp, tx, ZNEW); + if (error != 0) { + /* + * Since, we failed to add the directory entry for it, + * delete the newly created dnode. + */ + zfs_znode_delete(zp, tx); + remove_inode_hash(ZTOI(zp)); + zfs_acl_ids_free(&acl_ids); + dmu_tx_commit(tx); + goto out; + } + + if (fuid_dirtied) + zfs_fuid_sync(zfsvfs, tx); + + txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap); + if (flag & FIGNORECASE) + txtype |= TX_CI; + zfs_log_create(zilog, tx, txtype, dzp, zp, name, + vsecp, acl_ids.z_fuidp, vap); + zfs_acl_ids_free(&acl_ids); + dmu_tx_commit(tx); + } else { + int aflags = (flag & FAPPEND) ? V_APPEND : 0; + + if (have_acl) + zfs_acl_ids_free(&acl_ids); + have_acl = B_FALSE; + + /* + * A directory entry already exists for this name. + */ + /* + * Can't truncate an existing file if in exclusive mode. + */ + if (excl) { + error = SET_ERROR(EEXIST); + goto out; + } + /* + * Can't open a directory for writing. + */ + if (S_ISDIR(ZTOI(zp)->i_mode)) { + error = SET_ERROR(EISDIR); + goto out; + } + /* + * Verify requested access to file. + */ + if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr))) { + goto out; + } + + mutex_enter(&dzp->z_lock); + dzp->z_seq++; + mutex_exit(&dzp->z_lock); + + /* + * Truncate regular files if requested. + */ + if (S_ISREG(ZTOI(zp)->i_mode) && + (vap->va_mask & ATTR_SIZE) && (vap->va_size == 0)) { + /* we can't hold any locks when calling zfs_freesp() */ + if (dl) { + zfs_dirent_unlock(dl); + dl = NULL; + } + error = zfs_freesp(zp, 0, 0, mode, TRUE); + } + } +out: + + if (dl) + zfs_dirent_unlock(dl); + + if (error) { + if (zp) + iput(ZTOI(zp)); + } else { + zfs_inode_update(dzp); + zfs_inode_update(zp); + *ipp = ZTOI(zp); + } + + if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, 0); + + ZFS_EXIT(zfsvfs); + return (error); +} + +/* ARGSUSED */ +int +zfs_tmpfile(struct inode *dip, vattr_t *vap, int excl, + int mode, struct inode **ipp, cred_t *cr, int flag, vsecattr_t *vsecp) +{ + znode_t *zp = NULL, *dzp = ITOZ(dip); + zfsvfs_t *zfsvfs = ITOZSB(dip); + objset_t *os; + dmu_tx_t *tx; + int error; + uid_t uid; + gid_t gid; + zfs_acl_ids_t acl_ids; + uint64_t projid = ZFS_DEFAULT_PROJID; + boolean_t fuid_dirtied; + boolean_t have_acl = B_FALSE; + boolean_t waited = B_FALSE; + + /* + * If we have an ephemeral id, ACL, or XVATTR then + * make sure file system is at proper version + */ + + gid = crgetgid(cr); + uid = crgetuid(cr); + + if (zfsvfs->z_use_fuids == B_FALSE && + (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) + return (SET_ERROR(EINVAL)); + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(dzp); + os = zfsvfs->z_os; + + if (vap->va_mask & ATTR_XVATTR) { + if ((error = secpolicy_xvattr((xvattr_t *)vap, + crgetuid(cr), cr, vap->va_mode)) != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + } + +top: + *ipp = NULL; + + /* + * Create a new file object and update the directory + * to reference it. 
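The transaction retry idiom used by zfs_create() above (and repeated by zfs_tmpfile() just below and by most of the remaining operations) deserves a note: the first dmu_tx_assign() attempt passes TXG_NOWAIT, and only after an ERESTART failure does the caller drop its directory locks, call dmu_tx_wait(), abort the failed transaction, and rebuild it from the top with TXG_NOTHROTTLE added so it is not write-throttled a second time. A minimal user-space model of that control flow follows; try_assign(), tx_wait() and tx_abort() are hypothetical stand-ins for the DMU calls, not the real API.

    #include <errno.h>
    #include <stdbool.h>
    #include <stdio.h>

    #define TXG_NOWAIT      0x1
    #define TXG_NOTHROTTLE  0x2

    /* Hypothetical stand-ins for dmu_tx_assign()/dmu_tx_wait()/dmu_tx_abort(). */
    static int attempts;
    static int try_assign(int flags) { (void)flags; return (attempts++ ? 0 : ERESTART); }
    static void tx_wait(void) { }    /* block until the next open txg */
    static void tx_abort(void) { }   /* throw away the failed transaction */

    int
    main(void)
    {
        bool waited = false;
        int error;
    top:
        error = try_assign((waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
        if (error) {
            /* the kernel code drops its dirent lock before waiting */
            if (error == ERESTART) {
                waited = true;
                tx_wait();
                tx_abort();
                goto top;   /* rebuild the transaction and try again */
            }
            tx_abort();
            return (error);
        }
        printf("assigned after %d attempt(s)\n", attempts);
        return (0);
    }

The key point is that the failed transaction is always aborted and rebuilt after the wait, never reused.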
+ */ + if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) { + if (have_acl) + zfs_acl_ids_free(&acl_ids); + goto out; + } + + if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap, + cr, vsecp, &acl_ids)) != 0) + goto out; + have_acl = B_TRUE; + + if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode)) + projid = zfs_inherit_projid(dzp); + if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, projid)) { + zfs_acl_ids_free(&acl_ids); + error = SET_ERROR(EDQUOT); + goto out; + } + + tx = dmu_tx_create(os); + + dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + + ZFS_SA_BASE_ATTR_SIZE); + dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); + + fuid_dirtied = zfsvfs->z_fuid_dirty; + if (fuid_dirtied) + zfs_fuid_txhold(zfsvfs, tx); + if (!zfsvfs->z_use_sa && + acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, + 0, acl_ids.z_aclp->z_acl_bytes); + } + error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); + if (error) { + if (error == ERESTART) { + waited = B_TRUE; + dmu_tx_wait(tx); + dmu_tx_abort(tx); + goto top; + } + zfs_acl_ids_free(&acl_ids); + dmu_tx_abort(tx); + ZFS_EXIT(zfsvfs); + return (error); + } + zfs_mknode(dzp, vap, tx, cr, IS_TMPFILE, &zp, &acl_ids); + + if (fuid_dirtied) + zfs_fuid_sync(zfsvfs, tx); + + /* Add to unlinked set */ + zp->z_unlinked = B_TRUE; + zfs_unlinked_add(zp, tx); + zfs_acl_ids_free(&acl_ids); + dmu_tx_commit(tx); +out: + + if (error) { + if (zp) + iput(ZTOI(zp)); + } else { + zfs_inode_update(dzp); + zfs_inode_update(zp); + *ipp = ZTOI(zp); + } + + ZFS_EXIT(zfsvfs); + return (error); +} + +/* + * Remove an entry from a directory. + * + * IN: dip - inode of directory to remove entry from. + * name - name of entry to remove. + * cr - credentials of caller. + * flags - case flags. + * + * RETURN: 0 if success + * error code if failure + * + * Timestamps: + * dip - ctime|mtime + * ip - ctime (if nlink > 0) + */ + +uint64_t null_xattr = 0; + +/*ARGSUSED*/ +int +zfs_remove(struct inode *dip, char *name, cred_t *cr, int flags) +{ + znode_t *zp, *dzp = ITOZ(dip); + znode_t *xzp; + struct inode *ip; + zfsvfs_t *zfsvfs = ITOZSB(dip); + zilog_t *zilog; + uint64_t acl_obj, xattr_obj; + uint64_t xattr_obj_unlinked = 0; + uint64_t obj = 0; + uint64_t links; + zfs_dirlock_t *dl; + dmu_tx_t *tx; + boolean_t may_delete_now, delete_now = FALSE; + boolean_t unlinked, toobig = FALSE; + uint64_t txtype; + pathname_t *realnmp = NULL; + pathname_t realnm; + int error; + int zflg = ZEXISTS; + boolean_t waited = B_FALSE; + + if (name == NULL) + return (SET_ERROR(EINVAL)); + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(dzp); + zilog = zfsvfs->z_log; + + if (flags & FIGNORECASE) { + zflg |= ZCILOOK; + pn_alloc(&realnm); + realnmp = &realnm; + } + +top: + xattr_obj = 0; + xzp = NULL; + /* + * Attempt to lock directory; fail if entry doesn't exist. + */ + if ((error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, + NULL, realnmp))) { + if (realnmp) + pn_free(realnmp); + ZFS_EXIT(zfsvfs); + return (error); + } + + ip = ZTOI(zp); + + if ((error = zfs_zaccess_delete(dzp, zp, cr))) { + goto out; + } + + /* + * Need to use rmdir for removing directories. + */ + if (S_ISDIR(ip->i_mode)) { + error = SET_ERROR(EPERM); + goto out; + } + + mutex_enter(&zp->z_lock); + may_delete_now = atomic_read(&ip->i_count) == 1 && !(zp->z_is_mapped); + mutex_exit(&zp->z_lock); + + /* + * We may delete the znode now, or we may put it in the unlinked set; + * it depends on whether we're the last link, and on whether there are + * other holds on the inode. 
So we dmu_tx_hold() the right things to + * allow for either case. + */ + obj = zp->z_id; + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + zfs_sa_upgrade_txholds(tx, zp); + zfs_sa_upgrade_txholds(tx, dzp); + if (may_delete_now) { + toobig = zp->z_size > zp->z_blksz * zfs_delete_blocks; + /* if the file is too big, only hold_free a token amount */ + dmu_tx_hold_free(tx, zp->z_id, 0, + (toobig ? DMU_MAX_ACCESS : DMU_OBJECT_END)); + } + + /* are there any extended attributes? */ + error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), + &xattr_obj, sizeof (xattr_obj)); + if (error == 0 && xattr_obj) { + error = zfs_zget(zfsvfs, xattr_obj, &xzp); + ASSERT0(error); + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); + dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE); + } + + mutex_enter(&zp->z_lock); + if ((acl_obj = zfs_external_acl(zp)) != 0 && may_delete_now) + dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END); + mutex_exit(&zp->z_lock); + + /* charge as an update -- would be nice not to charge at all */ + dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); + + /* + * Mark this transaction as typically resulting in a net free of space + */ + dmu_tx_mark_netfree(tx); + + error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); + if (error) { + zfs_dirent_unlock(dl); + if (error == ERESTART) { + waited = B_TRUE; + dmu_tx_wait(tx); + dmu_tx_abort(tx); + iput(ip); + if (xzp) + iput(ZTOI(xzp)); + goto top; + } + if (realnmp) + pn_free(realnmp); + dmu_tx_abort(tx); + iput(ip); + if (xzp) + iput(ZTOI(xzp)); + ZFS_EXIT(zfsvfs); + return (error); + } + + /* + * Remove the directory entry. + */ + error = zfs_link_destroy(dl, zp, tx, zflg, &unlinked); + + if (error) { + dmu_tx_commit(tx); + goto out; + } + + if (unlinked) { + /* + * Hold z_lock so that we can make sure that the ACL obj + * hasn't changed. Could have been deleted due to + * zfs_sa_upgrade(). + */ + mutex_enter(&zp->z_lock); + (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), + &xattr_obj_unlinked, sizeof (xattr_obj_unlinked)); + delete_now = may_delete_now && !toobig && + atomic_read(&ip->i_count) == 1 && !(zp->z_is_mapped) && + xattr_obj == xattr_obj_unlinked && zfs_external_acl(zp) == + acl_obj; + } + + if (delete_now) { + if (xattr_obj_unlinked) { + ASSERT3U(ZTOI(xzp)->i_nlink, ==, 2); + mutex_enter(&xzp->z_lock); + xzp->z_unlinked = B_TRUE; + clear_nlink(ZTOI(xzp)); + links = 0; + error = sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs), + &links, sizeof (links), tx); + ASSERT3U(error, ==, 0); + mutex_exit(&xzp->z_lock); + zfs_unlinked_add(xzp, tx); + + if (zp->z_is_sa) + error = sa_remove(zp->z_sa_hdl, + SA_ZPL_XATTR(zfsvfs), tx); + else + error = sa_update(zp->z_sa_hdl, + SA_ZPL_XATTR(zfsvfs), &null_xattr, + sizeof (uint64_t), tx); + ASSERT0(error); + } + /* + * Add to the unlinked set because a new reference could be + * taken concurrently resulting in a deferred destruction. 
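Whether zfs_remove() destroys the object inside this same transaction or defers it comes down to the delete_now test computed above: the removed entry must have been the last link, the in-core inode must hold exactly one reference and have no mappings, the file must not be too large to free in one transaction, and the ACL and xattr objects must not have changed between the unlocked pre-check and the locked re-check. A simplified model of that decision, with a hypothetical node_state structure standing in for the znode/inode fields:

    #include <stdbool.h>

    /* Hypothetical, simplified view of the state consulted by zfs_remove(). */
    struct node_state {
        unsigned long holds;        /* i_count: references on the in-core inode */
        bool mapped;                /* file has live mmap() mappings            */
        bool last_link;             /* the link we removed was the final one    */
        bool toobig;                /* too large to free in a single transaction */
        bool metadata_changed;      /* ACL or xattr object changed under us     */
    };

    /* true  -> destroy the object in this transaction
     * false -> put it on the unlinked set for deferred destruction */
    static bool
    delete_now(const struct node_state *s)
    {
        return (s->last_link && s->holds == 1 && !s->mapped &&
            !s->toobig && !s->metadata_changed);
    }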
+ */ + zfs_unlinked_add(zp, tx); + mutex_exit(&zp->z_lock); + } else if (unlinked) { + mutex_exit(&zp->z_lock); + zfs_unlinked_add(zp, tx); + } + + txtype = TX_REMOVE; + if (flags & FIGNORECASE) + txtype |= TX_CI; + zfs_log_remove(zilog, tx, txtype, dzp, name, obj, unlinked); + + dmu_tx_commit(tx); +out: + if (realnmp) + pn_free(realnmp); + + zfs_dirent_unlock(dl); + zfs_inode_update(dzp); + zfs_inode_update(zp); + + if (delete_now) + iput(ip); + else + zfs_iput_async(ip); + + if (xzp) { + zfs_inode_update(xzp); + zfs_iput_async(ZTOI(xzp)); + } + + if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, 0); + + ZFS_EXIT(zfsvfs); + return (error); +} + +/* + * Create a new directory and insert it into dip using the name + * provided. Return a pointer to the inserted directory. + * + * IN: dip - inode of directory to add subdir to. + * dirname - name of new directory. + * vap - attributes of new directory. + * cr - credentials of caller. + * flags - case flags. + * vsecp - ACL to be set + * + * OUT: ipp - inode of created directory. + * + * RETURN: 0 if success + * error code if failure + * + * Timestamps: + * dip - ctime|mtime updated + * ipp - ctime|mtime|atime updated + */ +/*ARGSUSED*/ +int +zfs_mkdir(struct inode *dip, char *dirname, vattr_t *vap, struct inode **ipp, + cred_t *cr, int flags, vsecattr_t *vsecp) +{ + znode_t *zp, *dzp = ITOZ(dip); + zfsvfs_t *zfsvfs = ITOZSB(dip); + zilog_t *zilog; + zfs_dirlock_t *dl; + uint64_t txtype; + dmu_tx_t *tx; + int error; + int zf = ZNEW; + uid_t uid; + gid_t gid = crgetgid(cr); + zfs_acl_ids_t acl_ids; + boolean_t fuid_dirtied; + boolean_t waited = B_FALSE; + + ASSERT(S_ISDIR(vap->va_mode)); + + /* + * If we have an ephemeral id, ACL, or XVATTR then + * make sure file system is at proper version + */ + + uid = crgetuid(cr); + if (zfsvfs->z_use_fuids == B_FALSE && + (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) + return (SET_ERROR(EINVAL)); + + if (dirname == NULL) + return (SET_ERROR(EINVAL)); + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(dzp); + zilog = zfsvfs->z_log; + + if (dzp->z_pflags & ZFS_XATTR) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EINVAL)); + } + + if (zfsvfs->z_utf8 && u8_validate(dirname, + strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EILSEQ)); + } + if (flags & FIGNORECASE) + zf |= ZCILOOK; + + if (vap->va_mask & ATTR_XVATTR) { + if ((error = secpolicy_xvattr((xvattr_t *)vap, + crgetuid(cr), cr, vap->va_mode)) != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + } + + if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, + vsecp, &acl_ids)) != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + /* + * First make sure the new directory doesn't exist. + * + * Existence is checked first to make sure we don't return + * EACCES instead of EEXIST which can cause some applications + * to fail. + */ +top: + *ipp = NULL; + + if ((error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf, + NULL, NULL))) { + zfs_acl_ids_free(&acl_ids); + ZFS_EXIT(zfsvfs); + return (error); + } + + if ((error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr))) { + zfs_acl_ids_free(&acl_ids); + zfs_dirent_unlock(dl); + ZFS_EXIT(zfsvfs); + return (error); + } + + if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, zfs_inherit_projid(dzp))) { + zfs_acl_ids_free(&acl_ids); + zfs_dirent_unlock(dl); + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EDQUOT)); + } + + /* + * Add a new entry to the directory. 
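As the comment earlier in zfs_mkdir() explains, the zfs_dirent_lock() existence check is deliberately performed before the ACE_ADD_SUBDIRECTORY permission check, so that a caller racing on a pre-existing name sees EEXIST rather than EACCES. The ordering can be sketched as below; entry_exists() and may_add_subdir() are illustrative stubs for those two checks, not real functions.

    #include <errno.h>

    /* Stubs standing in for zfs_dirent_lock() (existence) and
     * zfs_zaccess(..., ACE_ADD_SUBDIRECTORY, ...) (permission). */
    static int entry_exists(const char *name) { (void)name; return (0); }
    static int may_add_subdir(void) { return (0); }

    static int
    mkdir_checks(const char *name)
    {
        int error;

        /* Existence first, so a racing caller gets EEXIST, not EACCES. */
        if ((error = entry_exists(name)) != 0)
            return (error);
        return (may_add_subdir());
    }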
+ */ + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname); + dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); + fuid_dirtied = zfsvfs->z_fuid_dirty; + if (fuid_dirtied) + zfs_fuid_txhold(zfsvfs, tx); + if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, + acl_ids.z_aclp->z_acl_bytes); + } + + dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + + ZFS_SA_BASE_ATTR_SIZE); + + error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); + if (error) { + zfs_dirent_unlock(dl); + if (error == ERESTART) { + waited = B_TRUE; + dmu_tx_wait(tx); + dmu_tx_abort(tx); + goto top; + } + zfs_acl_ids_free(&acl_ids); + dmu_tx_abort(tx); + ZFS_EXIT(zfsvfs); + return (error); + } + + /* + * Create new node. + */ + zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); + + /* + * Now put new name in parent dir. + */ + error = zfs_link_create(dl, zp, tx, ZNEW); + if (error != 0) { + zfs_znode_delete(zp, tx); + remove_inode_hash(ZTOI(zp)); + goto out; + } + + if (fuid_dirtied) + zfs_fuid_sync(zfsvfs, tx); + + *ipp = ZTOI(zp); + + txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap); + if (flags & FIGNORECASE) + txtype |= TX_CI; + zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp, + acl_ids.z_fuidp, vap); + +out: + zfs_acl_ids_free(&acl_ids); + + dmu_tx_commit(tx); + + zfs_dirent_unlock(dl); + + if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, 0); + + if (error != 0) { + iput(ZTOI(zp)); + } else { + zfs_inode_update(dzp); + zfs_inode_update(zp); + } + ZFS_EXIT(zfsvfs); + return (error); +} + +/* + * Remove a directory subdir entry. If the current working + * directory is the same as the subdir to be removed, the + * remove will fail. + * + * IN: dip - inode of directory to remove from. + * name - name of directory to be removed. + * cwd - inode of current working directory. + * cr - credentials of caller. + * flags - case flags + * + * RETURN: 0 on success, error code on failure. + * + * Timestamps: + * dip - ctime|mtime updated + */ +/*ARGSUSED*/ +int +zfs_rmdir(struct inode *dip, char *name, struct inode *cwd, cred_t *cr, + int flags) +{ + znode_t *dzp = ITOZ(dip); + znode_t *zp; + struct inode *ip; + zfsvfs_t *zfsvfs = ITOZSB(dip); + zilog_t *zilog; + zfs_dirlock_t *dl; + dmu_tx_t *tx; + int error; + int zflg = ZEXISTS; + boolean_t waited = B_FALSE; + + if (name == NULL) + return (SET_ERROR(EINVAL)); + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(dzp); + zilog = zfsvfs->z_log; + + if (flags & FIGNORECASE) + zflg |= ZCILOOK; +top: + zp = NULL; + + /* + * Attempt to lock directory; fail if entry doesn't exist. + */ + if ((error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, + NULL, NULL))) { + ZFS_EXIT(zfsvfs); + return (error); + } + + ip = ZTOI(zp); + + if ((error = zfs_zaccess_delete(dzp, zp, cr))) { + goto out; + } + + if (!S_ISDIR(ip->i_mode)) { + error = SET_ERROR(ENOTDIR); + goto out; + } + + if (ip == cwd) { + error = SET_ERROR(EINVAL); + goto out; + } + + /* + * Grab a lock on the directory to make sure that no one is + * trying to add (or lookup) entries while we are removing it. + */ + rw_enter(&zp->z_name_lock, RW_WRITER); + + /* + * Grab a lock on the parent pointer to make sure we play well + * with the treewalk and directory rename code. 
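Before the entry is destroyed, zfs_rmdir() write-locks two per-znode locks on the victim directory: z_name_lock above, which blocks concurrent lookups and creates inside it, and z_parent_lock just below, which keeps the treewalk and rename code from re-parenting it; both are released in reverse order once the transaction commits. A user-space analogue of that pairing using POSIX rwlocks (a sketch only, the field names are hypothetical):

    #include <pthread.h>

    struct dir_locks {
        pthread_rwlock_t name_lock;     /* blocks lookups/creates in the dir */
        pthread_rwlock_t parent_lock;   /* blocks re-parenting of the dir    */
    };

    static void
    rmdir_lock(struct dir_locks *d)
    {
        pthread_rwlock_wrlock(&d->name_lock);
        pthread_rwlock_wrlock(&d->parent_lock);
    }

    static void
    rmdir_unlock(struct dir_locks *d)
    {
        /* release in the reverse order of acquisition */
        pthread_rwlock_unlock(&d->parent_lock);
        pthread_rwlock_unlock(&d->name_lock);
    }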
+ */ + rw_enter(&zp->z_parent_lock, RW_WRITER); + + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); + zfs_sa_upgrade_txholds(tx, zp); + zfs_sa_upgrade_txholds(tx, dzp); + dmu_tx_mark_netfree(tx); + error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); + if (error) { + rw_exit(&zp->z_parent_lock); + rw_exit(&zp->z_name_lock); + zfs_dirent_unlock(dl); + if (error == ERESTART) { + waited = B_TRUE; + dmu_tx_wait(tx); + dmu_tx_abort(tx); + iput(ip); + goto top; + } + dmu_tx_abort(tx); + iput(ip); + ZFS_EXIT(zfsvfs); + return (error); + } + + error = zfs_link_destroy(dl, zp, tx, zflg, NULL); + + if (error == 0) { + uint64_t txtype = TX_RMDIR; + if (flags & FIGNORECASE) + txtype |= TX_CI; + zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT, + B_FALSE); + } + + dmu_tx_commit(tx); + + rw_exit(&zp->z_parent_lock); + rw_exit(&zp->z_name_lock); +out: + zfs_dirent_unlock(dl); + + zfs_inode_update(dzp); + zfs_inode_update(zp); + iput(ip); + + if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, 0); + + ZFS_EXIT(zfsvfs); + return (error); +} + +/* + * Read directory entries from the given directory cursor position and emit + * name and position for each entry. + * + * IN: ip - inode of directory to read. + * ctx - directory entry context. + * cr - credentials of caller. + * + * RETURN: 0 if success + * error code if failure + * + * Timestamps: + * ip - atime updated + * + * Note that the low 4 bits of the cookie returned by zap is always zero. + * This allows us to use the low range for "special" directory entries: + * We use 0 for '.', and 1 for '..'. If this is the root of the filesystem, + * we use the offset 2 for the '.zfs' directory. + */ +/* ARGSUSED */ +int +zfs_readdir(struct inode *ip, zpl_dir_context_t *ctx, cred_t *cr) +{ + znode_t *zp = ITOZ(ip); + zfsvfs_t *zfsvfs = ITOZSB(ip); + objset_t *os; + zap_cursor_t zc; + zap_attribute_t zap; + int error; + uint8_t prefetch; + uint8_t type; + int done = 0; + uint64_t parent; + uint64_t offset; /* must be unsigned; checks for < 1 */ + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), + &parent, sizeof (parent))) != 0) + goto out; + + /* + * Quit if directory has been removed (posix) + */ + if (zp->z_unlinked) + goto out; + + error = 0; + os = zfsvfs->z_os; + offset = ctx->pos; + prefetch = zp->z_zn_prefetch; + + /* + * Initialize the iterator cursor. + */ + if (offset <= 3) { + /* + * Start iteration from the beginning of the directory. + */ + zap_cursor_init(&zc, os, zp->z_id); + } else { + /* + * The offset is a serialized cursor. + */ + zap_cursor_init_serialized(&zc, os, zp->z_id, offset); + } + + /* + * Transform to file-system independent format + */ + while (!done) { + uint64_t objnum; + /* + * Special case `.', `..', and `.zfs'. + */ + if (offset == 0) { + (void) strcpy(zap.za_name, "."); + zap.za_normalization_conflict = 0; + objnum = zp->z_id; + type = DT_DIR; + } else if (offset == 1) { + (void) strcpy(zap.za_name, ".."); + zap.za_normalization_conflict = 0; + objnum = parent; + type = DT_DIR; + } else if (offset == 2 && zfs_show_ctldir(zp)) { + (void) strcpy(zap.za_name, ZFS_CTLDIR_NAME); + zap.za_normalization_conflict = 0; + objnum = ZFSCTL_INO_ROOT; + type = DT_DIR; + } else { + /* + * Grab next entry. 
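The readdir cursor encoding above is worth spelling out: because the low bits of a serialized ZAP cursor are always zero, zfs_readdir() can reserve offsets 0, 1 and 2 for the synthetic '.', '..' and (on the filesystem root) '.zfs' entries, and treat anything larger as a position to hand to zap_cursor_init_serialized(). A small, purely illustrative sketch of that mapping:

    #include <stdbool.h>
    #include <stdint.h>

    /* Illustrative classification of zfs_readdir() offsets; not kernel code. */
    enum entry_kind { ENTRY_DOT, ENTRY_DOTDOT, ENTRY_CTLDIR, ENTRY_ZAP };

    static enum entry_kind
    classify_offset(uint64_t offset, bool show_ctldir)
    {
        if (offset == 0)
            return (ENTRY_DOT);
        if (offset == 1)
            return (ENTRY_DOTDOT);
        if (offset == 2 && show_ctldir)
            return (ENTRY_CTLDIR);      /* '.zfs' on the filesystem root   */
        return (ENTRY_ZAP);             /* serialized zap cursor position  */
    }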
+ */ + if ((error = zap_cursor_retrieve(&zc, &zap))) { + if (error == ENOENT) + break; + else + goto update; + } + + /* + * Allow multiple entries provided the first entry is + * the object id. Non-zpl consumers may safely make + * use of the additional space. + * + * XXX: This should be a feature flag for compatibility + */ + if (zap.za_integer_length != 8 || + zap.za_num_integers == 0) { + cmn_err(CE_WARN, "zap_readdir: bad directory " + "entry, obj = %lld, offset = %lld, " + "length = %d, num = %lld\n", + (u_longlong_t)zp->z_id, + (u_longlong_t)offset, + zap.za_integer_length, + (u_longlong_t)zap.za_num_integers); + error = SET_ERROR(ENXIO); + goto update; + } + + objnum = ZFS_DIRENT_OBJ(zap.za_first_integer); + type = ZFS_DIRENT_TYPE(zap.za_first_integer); + } + + done = !zpl_dir_emit(ctx, zap.za_name, strlen(zap.za_name), + objnum, type); + if (done) + break; + + /* Prefetch znode */ + if (prefetch) { + dmu_prefetch(os, objnum, 0, 0, 0, + ZIO_PRIORITY_SYNC_READ); + } + + /* + * Move to the next entry, fill in the previous offset. + */ + if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) { + zap_cursor_advance(&zc); + offset = zap_cursor_serialize(&zc); + } else { + offset += 1; + } + ctx->pos = offset; + } + zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */ + +update: + zap_cursor_fini(&zc); + if (error == ENOENT) + error = 0; +out: + ZFS_EXIT(zfsvfs); + + return (error); +} + +ulong_t zfs_fsync_sync_cnt = 4; + +int +zfs_fsync(struct inode *ip, int syncflag, cred_t *cr) +{ + znode_t *zp = ITOZ(ip); + zfsvfs_t *zfsvfs = ITOZSB(ip); + + (void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt); + + if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) { + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + zil_commit(zfsvfs->z_log, zp->z_id); + ZFS_EXIT(zfsvfs); + } + tsd_set(zfs_fsyncer_key, NULL); + + return (0); +} + + +/* + * Get the requested file attributes and place them in the provided + * vattr structure. + * + * IN: ip - inode of file. + * vap - va_mask identifies requested attributes. + * If ATTR_XVATTR set, then optional attrs are requested + * flags - ATTR_NOACLCHECK (CIFS server context) + * cr - credentials of caller. + * + * OUT: vap - attribute values. + * + * RETURN: 0 (always succeeds) + */ +/* ARGSUSED */ +int +zfs_getattr(struct inode *ip, vattr_t *vap, int flags, cred_t *cr) +{ + znode_t *zp = ITOZ(ip); + zfsvfs_t *zfsvfs = ITOZSB(ip); + int error = 0; + uint64_t links; + uint64_t atime[2], mtime[2], ctime[2]; + xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */ + xoptattr_t *xoap = NULL; + boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; + sa_bulk_attr_t bulk[3]; + int count = 0; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid); + + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); + + if ((error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + + /* + * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES. + * Also, if we are the owner don't bother, since owner should + * always be allowed to read basic attributes of file. 
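The check that follows is a cheap shortcut: zfs_getattr() only pays for an ACE_READ_ATTRIBUTES access check when the ACL is non-trivial (ZFS_ACL_TRIVIAL clear) and the caller is not the file's owner, since a trivial ACL cannot deny reading basic attributes and the owner is always allowed to read them. Expressed as a tiny predicate (a sketch, not the kernel code):

    #include <stdbool.h>
    #include <sys/types.h>

    /* Return true only when an explicit ACE_READ_ATTRIBUTES check is needed. */
    static bool
    needs_read_attr_check(bool acl_trivial, uid_t owner, uid_t caller)
    {
        return (!acl_trivial && owner != caller);
    }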
+ */ + if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) && + (vap->va_uid != crgetuid(cr))) { + if ((error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0, + skipaclchk, cr))) { + ZFS_EXIT(zfsvfs); + return (error); + } + } + + /* + * Return all attributes. It's cheaper to provide the answer + * than to determine whether we were asked the question. + */ + + mutex_enter(&zp->z_lock); + vap->va_type = vn_mode_to_vtype(zp->z_mode); + vap->va_mode = zp->z_mode; + vap->va_fsid = ZTOI(zp)->i_sb->s_dev; + vap->va_nodeid = zp->z_id; + if ((zp->z_id == zfsvfs->z_root) && zfs_show_ctldir(zp)) + links = ZTOI(zp)->i_nlink + 1; + else + links = ZTOI(zp)->i_nlink; + vap->va_nlink = MIN(links, ZFS_LINK_MAX); + vap->va_size = i_size_read(ip); + vap->va_rdev = ip->i_rdev; + vap->va_seq = ip->i_generation; + + /* + * Add in any requested optional attributes and the create time. + * Also set the corresponding bits in the returned attribute bitmap. + */ + if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) { + if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) { + xoap->xoa_archive = + ((zp->z_pflags & ZFS_ARCHIVE) != 0); + XVA_SET_RTN(xvap, XAT_ARCHIVE); + } + + if (XVA_ISSET_REQ(xvap, XAT_READONLY)) { + xoap->xoa_readonly = + ((zp->z_pflags & ZFS_READONLY) != 0); + XVA_SET_RTN(xvap, XAT_READONLY); + } + + if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) { + xoap->xoa_system = + ((zp->z_pflags & ZFS_SYSTEM) != 0); + XVA_SET_RTN(xvap, XAT_SYSTEM); + } + + if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) { + xoap->xoa_hidden = + ((zp->z_pflags & ZFS_HIDDEN) != 0); + XVA_SET_RTN(xvap, XAT_HIDDEN); + } + + if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { + xoap->xoa_nounlink = + ((zp->z_pflags & ZFS_NOUNLINK) != 0); + XVA_SET_RTN(xvap, XAT_NOUNLINK); + } + + if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { + xoap->xoa_immutable = + ((zp->z_pflags & ZFS_IMMUTABLE) != 0); + XVA_SET_RTN(xvap, XAT_IMMUTABLE); + } + + if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { + xoap->xoa_appendonly = + ((zp->z_pflags & ZFS_APPENDONLY) != 0); + XVA_SET_RTN(xvap, XAT_APPENDONLY); + } + + if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { + xoap->xoa_nodump = + ((zp->z_pflags & ZFS_NODUMP) != 0); + XVA_SET_RTN(xvap, XAT_NODUMP); + } + + if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) { + xoap->xoa_opaque = + ((zp->z_pflags & ZFS_OPAQUE) != 0); + XVA_SET_RTN(xvap, XAT_OPAQUE); + } + + if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { + xoap->xoa_av_quarantined = + ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0); + XVA_SET_RTN(xvap, XAT_AV_QUARANTINED); + } + + if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { + xoap->xoa_av_modified = + ((zp->z_pflags & ZFS_AV_MODIFIED) != 0); + XVA_SET_RTN(xvap, XAT_AV_MODIFIED); + } + + if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) && + S_ISREG(ip->i_mode)) { + zfs_sa_get_scanstamp(zp, xvap); + } + + if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) { + uint64_t times[2]; + + (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_CRTIME(zfsvfs), + times, sizeof (times)); + ZFS_TIME_DECODE(&xoap->xoa_createtime, times); + XVA_SET_RTN(xvap, XAT_CREATETIME); + } + + if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { + xoap->xoa_reparse = ((zp->z_pflags & ZFS_REPARSE) != 0); + XVA_SET_RTN(xvap, XAT_REPARSE); + } + if (XVA_ISSET_REQ(xvap, XAT_GEN)) { + xoap->xoa_generation = ip->i_generation; + XVA_SET_RTN(xvap, XAT_GEN); + } + + if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) { + xoap->xoa_offline = + ((zp->z_pflags & ZFS_OFFLINE) != 0); + XVA_SET_RTN(xvap, XAT_OFFLINE); + } + + if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) { + xoap->xoa_sparse = + ((zp->z_pflags & ZFS_SPARSE) != 0); + XVA_SET_RTN(xvap, XAT_SPARSE); + } + + if (XVA_ISSET_REQ(xvap, 
XAT_PROJINHERIT)) { + xoap->xoa_projinherit = + ((zp->z_pflags & ZFS_PROJINHERIT) != 0); + XVA_SET_RTN(xvap, XAT_PROJINHERIT); + } + + if (XVA_ISSET_REQ(xvap, XAT_PROJID)) { + xoap->xoa_projid = zp->z_projid; + XVA_SET_RTN(xvap, XAT_PROJID); + } + } + + ZFS_TIME_DECODE(&vap->va_atime, atime); + ZFS_TIME_DECODE(&vap->va_mtime, mtime); + ZFS_TIME_DECODE(&vap->va_ctime, ctime); + + mutex_exit(&zp->z_lock); + + sa_object_size(zp->z_sa_hdl, &vap->va_blksize, &vap->va_nblocks); + + if (zp->z_blksz == 0) { + /* + * Block size hasn't been set; suggest maximal I/O transfers. + */ + vap->va_blksize = zfsvfs->z_max_blksz; + } + + ZFS_EXIT(zfsvfs); + return (0); +} + +/* + * Get the basic file attributes and place them in the provided kstat + * structure. The inode is assumed to be the authoritative source + * for most of the attributes. However, the znode currently has the + * authoritative atime, blksize, and block count. + * + * IN: ip - inode of file. + * + * OUT: sp - kstat values. + * + * RETURN: 0 (always succeeds) + */ +/* ARGSUSED */ +int +zfs_getattr_fast(struct inode *ip, struct kstat *sp) +{ + znode_t *zp = ITOZ(ip); + zfsvfs_t *zfsvfs = ITOZSB(ip); + uint32_t blksize; + u_longlong_t nblocks; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + mutex_enter(&zp->z_lock); + + generic_fillattr(ip, sp); + /* + * +1 link count for root inode with visible '.zfs' directory. + */ + if ((zp->z_id == zfsvfs->z_root) && zfs_show_ctldir(zp)) + if (sp->nlink < ZFS_LINK_MAX) + sp->nlink++; + + sa_object_size(zp->z_sa_hdl, &blksize, &nblocks); + sp->blksize = blksize; + sp->blocks = nblocks; + + if (unlikely(zp->z_blksz == 0)) { + /* + * Block size hasn't been set; suggest maximal I/O transfers. + */ + sp->blksize = zfsvfs->z_max_blksz; + } + + mutex_exit(&zp->z_lock); + + /* + * Required to prevent NFS client from detecting different inode + * numbers of snapshot root dentry before and after snapshot mount. + */ + if (zfsvfs->z_issnap) { + if (ip->i_sb->s_root->d_inode == ip) + sp->ino = ZFSCTL_INO_SNAPDIRS - + dmu_objset_id(zfsvfs->z_os); + } + + ZFS_EXIT(zfsvfs); + + return (0); +} + +/* + * For the operation of changing file's user/group/project, we need to + * handle not only the main object that is assigned to the file directly, + * but also the ones that are used by the file via hidden xattr directory. + * + * Because the xattr directory may contains many EA entries, as to it may + * be impossible to change all of them via the transaction of changing the + * main object's user/group/project attributes. Then we have to change them + * via other multiple independent transactions one by one. It may be not good + * solution, but we have no better idea yet. 
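Because, as the comment above notes, the xattr directory may hold too many entries to cover in the transaction that changes the main object, the function below walks the directory with a zap cursor and issues one small TXG_WAIT transaction per entry, tolerating ENOENT for entries that disappear mid-walk. The loop shape can be modeled with one-line hypothetical stubs in place of the cursor and DMU calls:

    #include <errno.h>
    #include <stdbool.h>

    /* Hypothetical stand-ins: the point is one small transaction per
     * extended-attribute entry, not one transaction over the whole dir. */
    static int remaining = 3;
    static bool next_entry(void)   { return (remaining-- > 0); }
    static int  begin_tx(void)     { return (0); }   /* create + assign(TXG_WAIT) */
    static int  update_owner(void) { return (0); }   /* bulk update of uid/gid/projid */
    static void end_tx(void)       { }               /* commit */

    static int
    chown_xattr_dir(void)
    {
        int error = 0;

        while (next_entry()) {
            if ((error = begin_tx()) != 0)
                break;
            error = update_owner();
            end_tx();
            if (error != 0 && error != ENOENT)
                break;      /* ENOENT just means the entry went away */
        }
        return (error == ENOENT ? 0 : error);
    }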
+ */ +static int +zfs_setattr_dir(znode_t *dzp) +{ + struct inode *dxip = ZTOI(dzp); + struct inode *xip = NULL; + zfsvfs_t *zfsvfs = ITOZSB(dxip); + objset_t *os = zfsvfs->z_os; + zap_cursor_t zc; + zap_attribute_t zap; + zfs_dirlock_t *dl; + znode_t *zp; + dmu_tx_t *tx = NULL; + uint64_t uid, gid; + sa_bulk_attr_t bulk[4]; + int count; + int err; + + zap_cursor_init(&zc, os, dzp->z_id); + while ((err = zap_cursor_retrieve(&zc, &zap)) == 0) { + count = 0; + if (zap.za_integer_length != 8 || zap.za_num_integers != 1) { + err = ENXIO; + break; + } + + err = zfs_dirent_lock(&dl, dzp, (char *)zap.za_name, &zp, + ZEXISTS, NULL, NULL); + if (err == ENOENT) + goto next; + if (err) + break; + + xip = ZTOI(zp); + if (KUID_TO_SUID(xip->i_uid) == KUID_TO_SUID(dxip->i_uid) && + KGID_TO_SGID(xip->i_gid) == KGID_TO_SGID(dxip->i_gid) && + zp->z_projid == dzp->z_projid) + goto next; + + tx = dmu_tx_create(os); + if (!(zp->z_pflags & ZFS_PROJID)) + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); + else + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + + err = dmu_tx_assign(tx, TXG_WAIT); + if (err) + break; + + mutex_enter(&dzp->z_lock); + + if (KUID_TO_SUID(xip->i_uid) != KUID_TO_SUID(dxip->i_uid)) { + xip->i_uid = dxip->i_uid; + uid = zfs_uid_read(dxip); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, + &uid, sizeof (uid)); + } + + if (KGID_TO_SGID(xip->i_gid) != KGID_TO_SGID(dxip->i_gid)) { + xip->i_gid = dxip->i_gid; + gid = zfs_gid_read(dxip); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, + &gid, sizeof (gid)); + } + + if (zp->z_projid != dzp->z_projid) { + if (!(zp->z_pflags & ZFS_PROJID)) { + zp->z_pflags |= ZFS_PROJID; + SA_ADD_BULK_ATTR(bulk, count, + SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, + sizeof (zp->z_pflags)); + } + + zp->z_projid = dzp->z_projid; + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PROJID(zfsvfs), + NULL, &zp->z_projid, sizeof (zp->z_projid)); + } + + mutex_exit(&dzp->z_lock); + + if (likely(count > 0)) { + err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); + dmu_tx_commit(tx); + } else { + dmu_tx_abort(tx); + } + tx = NULL; + if (err != 0 && err != ENOENT) + break; + +next: + if (xip) { + iput(xip); + xip = NULL; + zfs_dirent_unlock(dl); + } + zap_cursor_advance(&zc); + } + + if (tx) + dmu_tx_abort(tx); + if (xip) { + iput(xip); + zfs_dirent_unlock(dl); + } + zap_cursor_fini(&zc); + + return (err == ENOENT ? 0 : err); +} + +/* + * Set the file attributes to the values contained in the + * vattr structure. + * + * IN: ip - inode of file to be modified. + * vap - new attribute values. + * If ATTR_XVATTR set, then optional attrs are being set + * flags - ATTR_UTIME set if non-default time values provided. + * - ATTR_NOACLCHECK (CIFS context only). + * cr - credentials of caller. + * + * RETURN: 0 if success + * error code if failure + * + * Timestamps: + * ip - ctime updated, mtime updated if size changed. 
+ */ +/* ARGSUSED */ +int +zfs_setattr(struct inode *ip, vattr_t *vap, int flags, cred_t *cr) +{ + znode_t *zp = ITOZ(ip); + zfsvfs_t *zfsvfs = ITOZSB(ip); + objset_t *os = zfsvfs->z_os; + zilog_t *zilog; + dmu_tx_t *tx; + vattr_t oldva; + xvattr_t *tmpxvattr; + uint_t mask = vap->va_mask; + uint_t saved_mask = 0; + int trim_mask = 0; + uint64_t new_mode; + uint64_t new_kuid = 0, new_kgid = 0, new_uid, new_gid; + uint64_t xattr_obj; + uint64_t mtime[2], ctime[2], atime[2]; + uint64_t projid = ZFS_INVALID_PROJID; + znode_t *attrzp; + int need_policy = FALSE; + int err, err2 = 0; + zfs_fuid_info_t *fuidp = NULL; + xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */ + xoptattr_t *xoap; + zfs_acl_t *aclp; + boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; + boolean_t fuid_dirtied = B_FALSE; + boolean_t handle_eadir = B_FALSE; + sa_bulk_attr_t *bulk, *xattr_bulk; + int count = 0, xattr_count = 0, bulks = 8; + + if (mask == 0) + return (0); + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + /* + * If this is a xvattr_t, then get a pointer to the structure of + * optional attributes. If this is NULL, then we have a vattr_t. + */ + xoap = xva_getxoptattr(xvap); + if (xoap != NULL && (mask & ATTR_XVATTR)) { + if (XVA_ISSET_REQ(xvap, XAT_PROJID)) { + if (!dmu_objset_projectquota_enabled(os) || + (!S_ISREG(ip->i_mode) && !S_ISDIR(ip->i_mode))) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(ENOTSUP)); + } + + projid = xoap->xoa_projid; + if (unlikely(projid == ZFS_INVALID_PROJID)) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EINVAL)); + } + + if (projid == zp->z_projid && zp->z_pflags & ZFS_PROJID) + projid = ZFS_INVALID_PROJID; + else + need_policy = TRUE; + } + + if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT) && + (xoap->xoa_projinherit != + ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) && + (!dmu_objset_projectquota_enabled(os) || + (!S_ISREG(ip->i_mode) && !S_ISDIR(ip->i_mode)))) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(ENOTSUP)); + } + } + + zilog = zfsvfs->z_log; + + /* + * Make sure that if we have ephemeral uid/gid or xvattr specified + * that file system is at proper version level + */ + + if (zfsvfs->z_use_fuids == B_FALSE && + (((mask & ATTR_UID) && IS_EPHEMERAL(vap->va_uid)) || + ((mask & ATTR_GID) && IS_EPHEMERAL(vap->va_gid)) || + (mask & ATTR_XVATTR))) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EINVAL)); + } + + if (mask & ATTR_SIZE && S_ISDIR(ip->i_mode)) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EISDIR)); + } + + if (mask & ATTR_SIZE && !S_ISREG(ip->i_mode) && !S_ISFIFO(ip->i_mode)) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EINVAL)); + } + + tmpxvattr = kmem_alloc(sizeof (xvattr_t), KM_SLEEP); + xva_init(tmpxvattr); + + bulk = kmem_alloc(sizeof (sa_bulk_attr_t) * bulks, KM_SLEEP); + xattr_bulk = kmem_alloc(sizeof (sa_bulk_attr_t) * bulks, KM_SLEEP); + + /* + * Immutable files can only alter immutable bit and atime + */ + if ((zp->z_pflags & ZFS_IMMUTABLE) && + ((mask & (ATTR_SIZE|ATTR_UID|ATTR_GID|ATTR_MTIME|ATTR_MODE)) || + ((mask & ATTR_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) { + err = SET_ERROR(EPERM); + goto out3; + } + + if ((mask & ATTR_SIZE) && (zp->z_pflags & ZFS_READONLY)) { + err = SET_ERROR(EPERM); + goto out3; + } + + /* + * Verify timestamps doesn't overflow 32 bits. + * ZFS can handle large timestamps, but 32bit syscalls can't + * handle times greater than 2039. This check should be removed + * once large timestamps are fully supported. 
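The guard that follows exists because 32-bit syscalls cannot represent post-2038 timestamps even though ZFS itself can store them; conceptually, TIMESPEC_OVERFLOW asks whether the seconds field still fits in a signed 32-bit value. A sketch of that kind of bound check (the real macro may differ in detail):

    #include <stdbool.h>
    #include <stdint.h>
    #include <time.h>

    /* Reject times whose seconds cannot be represented by 32-bit interfaces. */
    static bool
    timespec_overflows_32bit(const struct timespec *ts)
    {
        return (ts->tv_sec < INT32_MIN || ts->tv_sec > INT32_MAX);
    }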
+ */ + if (mask & (ATTR_ATIME | ATTR_MTIME)) { + if (((mask & ATTR_ATIME) && + TIMESPEC_OVERFLOW(&vap->va_atime)) || + ((mask & ATTR_MTIME) && + TIMESPEC_OVERFLOW(&vap->va_mtime))) { + err = SET_ERROR(EOVERFLOW); + goto out3; + } + } + +top: + attrzp = NULL; + aclp = NULL; + + /* Can this be moved to before the top label? */ + if (zfs_is_readonly(zfsvfs)) { + err = SET_ERROR(EROFS); + goto out3; + } + + /* + * First validate permissions + */ + + if (mask & ATTR_SIZE) { + err = zfs_zaccess(zp, ACE_WRITE_DATA, 0, skipaclchk, cr); + if (err) + goto out3; + + /* + * XXX - Note, we are not providing any open + * mode flags here (like FNDELAY), so we may + * block if there are locks present... this + * should be addressed in openat(). + */ + /* XXX - would it be OK to generate a log record here? */ + err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE); + if (err) + goto out3; + } + + if (mask & (ATTR_ATIME|ATTR_MTIME) || + ((mask & ATTR_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) || + XVA_ISSET_REQ(xvap, XAT_READONLY) || + XVA_ISSET_REQ(xvap, XAT_ARCHIVE) || + XVA_ISSET_REQ(xvap, XAT_OFFLINE) || + XVA_ISSET_REQ(xvap, XAT_SPARSE) || + XVA_ISSET_REQ(xvap, XAT_CREATETIME) || + XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) { + need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0, + skipaclchk, cr); + } + + if (mask & (ATTR_UID|ATTR_GID)) { + int idmask = (mask & (ATTR_UID|ATTR_GID)); + int take_owner; + int take_group; + + /* + * NOTE: even if a new mode is being set, + * we may clear S_ISUID/S_ISGID bits. + */ + + if (!(mask & ATTR_MODE)) + vap->va_mode = zp->z_mode; + + /* + * Take ownership or chgrp to group we are a member of + */ + + take_owner = (mask & ATTR_UID) && (vap->va_uid == crgetuid(cr)); + take_group = (mask & ATTR_GID) && + zfs_groupmember(zfsvfs, vap->va_gid, cr); + + /* + * If both ATTR_UID and ATTR_GID are set then take_owner and + * take_group must both be set in order to allow taking + * ownership. + * + * Otherwise, send the check through secpolicy_vnode_setattr() + * + */ + + if (((idmask == (ATTR_UID|ATTR_GID)) && + take_owner && take_group) || + ((idmask == ATTR_UID) && take_owner) || + ((idmask == ATTR_GID) && take_group)) { + if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0, + skipaclchk, cr) == 0) { + /* + * Remove setuid/setgid for non-privileged users + */ + (void) secpolicy_setid_clear(vap, cr); + trim_mask = (mask & (ATTR_UID|ATTR_GID)); + } else { + need_policy = TRUE; + } + } else { + need_policy = TRUE; + } + } + + mutex_enter(&zp->z_lock); + oldva.va_mode = zp->z_mode; + zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid); + if (mask & ATTR_XVATTR) { + /* + * Update xvattr mask to include only those attributes + * that are actually changing. + * + * the bits will be restored prior to actually setting + * the attributes so the caller thinks they were set. 
+ */ + if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { + if (xoap->xoa_appendonly != + ((zp->z_pflags & ZFS_APPENDONLY) != 0)) { + need_policy = TRUE; + } else { + XVA_CLR_REQ(xvap, XAT_APPENDONLY); + XVA_SET_REQ(tmpxvattr, XAT_APPENDONLY); + } + } + + if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) { + if (xoap->xoa_projinherit != + ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) { + need_policy = TRUE; + } else { + XVA_CLR_REQ(xvap, XAT_PROJINHERIT); + XVA_SET_REQ(tmpxvattr, XAT_PROJINHERIT); + } + } + + if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { + if (xoap->xoa_nounlink != + ((zp->z_pflags & ZFS_NOUNLINK) != 0)) { + need_policy = TRUE; + } else { + XVA_CLR_REQ(xvap, XAT_NOUNLINK); + XVA_SET_REQ(tmpxvattr, XAT_NOUNLINK); + } + } + + if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { + if (xoap->xoa_immutable != + ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) { + need_policy = TRUE; + } else { + XVA_CLR_REQ(xvap, XAT_IMMUTABLE); + XVA_SET_REQ(tmpxvattr, XAT_IMMUTABLE); + } + } + + if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { + if (xoap->xoa_nodump != + ((zp->z_pflags & ZFS_NODUMP) != 0)) { + need_policy = TRUE; + } else { + XVA_CLR_REQ(xvap, XAT_NODUMP); + XVA_SET_REQ(tmpxvattr, XAT_NODUMP); + } + } + + if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { + if (xoap->xoa_av_modified != + ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) { + need_policy = TRUE; + } else { + XVA_CLR_REQ(xvap, XAT_AV_MODIFIED); + XVA_SET_REQ(tmpxvattr, XAT_AV_MODIFIED); + } + } + + if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { + if ((!S_ISREG(ip->i_mode) && + xoap->xoa_av_quarantined) || + xoap->xoa_av_quarantined != + ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) { + need_policy = TRUE; + } else { + XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED); + XVA_SET_REQ(tmpxvattr, XAT_AV_QUARANTINED); + } + } + + if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { + mutex_exit(&zp->z_lock); + err = SET_ERROR(EPERM); + goto out3; + } + + if (need_policy == FALSE && + (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) || + XVA_ISSET_REQ(xvap, XAT_OPAQUE))) { + need_policy = TRUE; + } + } + + mutex_exit(&zp->z_lock); + + if (mask & ATTR_MODE) { + if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) { + err = secpolicy_setid_setsticky_clear(ip, vap, + &oldva, cr); + if (err) + goto out3; + + trim_mask |= ATTR_MODE; + } else { + need_policy = TRUE; + } + } + + if (need_policy) { + /* + * If trim_mask is set then take ownership + * has been granted or write_acl is present and user + * has the ability to modify mode. In that case remove + * UID|GID and or MODE from mask so that + * secpolicy_vnode_setattr() doesn't revoke it. 
+ */ + + if (trim_mask) { + saved_mask = vap->va_mask; + vap->va_mask &= ~trim_mask; + } + err = secpolicy_vnode_setattr(cr, ip, vap, &oldva, flags, + (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp); + if (err) + goto out3; + + if (trim_mask) + vap->va_mask |= saved_mask; + } + + /* + * secpolicy_vnode_setattr, or take ownership may have + * changed va_mask + */ + mask = vap->va_mask; + + if ((mask & (ATTR_UID | ATTR_GID)) || projid != ZFS_INVALID_PROJID) { + handle_eadir = B_TRUE; + err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), + &xattr_obj, sizeof (xattr_obj)); + + if (err == 0 && xattr_obj) { + err = zfs_zget(ZTOZSB(zp), xattr_obj, &attrzp); + if (err) + goto out2; + } + if (mask & ATTR_UID) { + new_kuid = zfs_fuid_create(zfsvfs, + (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp); + if (new_kuid != KUID_TO_SUID(ZTOI(zp)->i_uid) && + zfs_id_overquota(zfsvfs, DMU_USERUSED_OBJECT, + new_kuid)) { + if (attrzp) + iput(ZTOI(attrzp)); + err = SET_ERROR(EDQUOT); + goto out2; + } + } + + if (mask & ATTR_GID) { + new_kgid = zfs_fuid_create(zfsvfs, + (uint64_t)vap->va_gid, cr, ZFS_GROUP, &fuidp); + if (new_kgid != KGID_TO_SGID(ZTOI(zp)->i_gid) && + zfs_id_overquota(zfsvfs, DMU_GROUPUSED_OBJECT, + new_kgid)) { + if (attrzp) + iput(ZTOI(attrzp)); + err = SET_ERROR(EDQUOT); + goto out2; + } + } + + if (projid != ZFS_INVALID_PROJID && + zfs_id_overquota(zfsvfs, DMU_PROJECTUSED_OBJECT, projid)) { + if (attrzp) + iput(ZTOI(attrzp)); + err = EDQUOT; + goto out2; + } + } + tx = dmu_tx_create(os); + + if (mask & ATTR_MODE) { + uint64_t pmode = zp->z_mode; + uint64_t acl_obj; + new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT); + + zfs_acl_chmod_setattr(zp, &aclp, new_mode); + + mutex_enter(&zp->z_lock); + if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) { + /* + * Are we upgrading ACL from old V0 format + * to V1 format? + */ + if (zfsvfs->z_version >= ZPL_VERSION_FUID && + zfs_znode_acl_version(zp) == + ZFS_ACL_VERSION_INITIAL) { + dmu_tx_hold_free(tx, acl_obj, 0, + DMU_OBJECT_END); + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, + 0, aclp->z_acl_bytes); + } else { + dmu_tx_hold_write(tx, acl_obj, 0, + aclp->z_acl_bytes); + } + } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) { + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, + 0, aclp->z_acl_bytes); + } + mutex_exit(&zp->z_lock); + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); + } else { + if (((mask & ATTR_XVATTR) && + XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) || + (projid != ZFS_INVALID_PROJID && + !(zp->z_pflags & ZFS_PROJID))) + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); + else + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + } + + if (attrzp) { + dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE); + } + + fuid_dirtied = zfsvfs->z_fuid_dirty; + if (fuid_dirtied) + zfs_fuid_txhold(zfsvfs, tx); + + zfs_sa_upgrade_txholds(tx, zp); + + err = dmu_tx_assign(tx, TXG_WAIT); + if (err) + goto out; + + count = 0; + /* + * Set each attribute requested. + * We group settings according to the locks they need to acquire. + * + * Note: you cannot set ctime directly, although it will be + * updated as a side-effect of calling this function. + */ + + if (projid != ZFS_INVALID_PROJID && !(zp->z_pflags & ZFS_PROJID)) { + /* + * For the existed object that is upgraded from old system, + * its on-disk layout has no slot for the project ID attribute. + * But quota accounting logic needs to access related slots by + * offset directly. So we need to adjust old objects' layout + * to make the project ID to some unified and fixed offset. 
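The attribute writes that follow are not issued one at a time: each change is queued with SA_ADD_BULK_ATTR() into the bulk[] (or xattr_bulk[]) array allocated earlier, and a single sa_bulk_update() call applies them all before the transaction commits. A stripped-down model of that accumulate-then-flush pattern, using a hypothetical bulk_attr structure in place of sa_bulk_attr_t:

    #include <stddef.h>
    #include <stdint.h>

    /* Hypothetical miniature of the bulk-attribute array: collect
     * (attribute, buffer, length) tuples, then apply them in one call. */
    struct bulk_attr {
        int         attr;
        const void *buf;
        size_t      len;
    };

    #define ADD_BULK_ATTR(b, n, a, p, l) \
        do { (b)[n].attr = (a); (b)[n].buf = (p); (b)[n].len = (l); (n)++; } while (0)

    static size_t
    collect(struct bulk_attr *bulk, const uint64_t *mt, const uint64_t *ct)
    {
        size_t count = 0;

        ADD_BULK_ATTR(bulk, count, 1 /* MTIME */, mt, 16);
        ADD_BULK_ATTR(bulk, count, 2 /* CTIME */, ct, 16);
        return (count);     /* caller hands (bulk, count) to one update call */
    }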
+ */ + if (attrzp) + err = sa_add_projid(attrzp->z_sa_hdl, tx, projid); + if (err == 0) + err = sa_add_projid(zp->z_sa_hdl, tx, projid); + + if (unlikely(err == EEXIST)) + err = 0; + else if (err != 0) + goto out; + else + projid = ZFS_INVALID_PROJID; + } + + if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE)) + mutex_enter(&zp->z_acl_lock); + mutex_enter(&zp->z_lock); + + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, + &zp->z_pflags, sizeof (zp->z_pflags)); + + if (attrzp) { + if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE)) + mutex_enter(&attrzp->z_acl_lock); + mutex_enter(&attrzp->z_lock); + SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, + SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags, + sizeof (attrzp->z_pflags)); + if (projid != ZFS_INVALID_PROJID) { + attrzp->z_projid = projid; + SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, + SA_ZPL_PROJID(zfsvfs), NULL, &attrzp->z_projid, + sizeof (attrzp->z_projid)); + } + } + + if (mask & (ATTR_UID|ATTR_GID)) { + + if (mask & ATTR_UID) { + ZTOI(zp)->i_uid = SUID_TO_KUID(new_kuid); + new_uid = zfs_uid_read(ZTOI(zp)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, + &new_uid, sizeof (new_uid)); + if (attrzp) { + SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, + SA_ZPL_UID(zfsvfs), NULL, &new_uid, + sizeof (new_uid)); + ZTOI(attrzp)->i_uid = SUID_TO_KUID(new_uid); + } + } + + if (mask & ATTR_GID) { + ZTOI(zp)->i_gid = SGID_TO_KGID(new_kgid); + new_gid = zfs_gid_read(ZTOI(zp)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), + NULL, &new_gid, sizeof (new_gid)); + if (attrzp) { + SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, + SA_ZPL_GID(zfsvfs), NULL, &new_gid, + sizeof (new_gid)); + ZTOI(attrzp)->i_gid = SGID_TO_KGID(new_kgid); + } + } + if (!(mask & ATTR_MODE)) { + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), + NULL, &new_mode, sizeof (new_mode)); + new_mode = zp->z_mode; + } + err = zfs_acl_chown_setattr(zp); + ASSERT(err == 0); + if (attrzp) { + err = zfs_acl_chown_setattr(attrzp); + ASSERT(err == 0); + } + } + + if (mask & ATTR_MODE) { + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, + &new_mode, sizeof (new_mode)); + zp->z_mode = ZTOI(zp)->i_mode = new_mode; + ASSERT3P(aclp, !=, NULL); + err = zfs_aclset_common(zp, aclp, cr, tx); + ASSERT0(err); + if (zp->z_acl_cached) + zfs_acl_free(zp->z_acl_cached); + zp->z_acl_cached = aclp; + aclp = NULL; + } + + if ((mask & ATTR_ATIME) || zp->z_atime_dirty) { + zp->z_atime_dirty = B_FALSE; + ZFS_TIME_ENCODE(&ip->i_atime, atime); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, + &atime, sizeof (atime)); + } + + if (mask & (ATTR_MTIME | ATTR_SIZE)) { + ZFS_TIME_ENCODE(&vap->va_mtime, mtime); + ZTOI(zp)->i_mtime = zpl_inode_timespec_trunc(vap->va_mtime, + ZTOI(zp)->i_sb->s_time_gran); + + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, + mtime, sizeof (mtime)); + } + + if (mask & (ATTR_CTIME | ATTR_SIZE)) { + ZFS_TIME_ENCODE(&vap->va_ctime, ctime); + ZTOI(zp)->i_ctime = zpl_inode_timespec_trunc(vap->va_ctime, + ZTOI(zp)->i_sb->s_time_gran); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, + ctime, sizeof (ctime)); + } + + if (projid != ZFS_INVALID_PROJID) { + zp->z_projid = projid; + SA_ADD_BULK_ATTR(bulk, count, + SA_ZPL_PROJID(zfsvfs), NULL, &zp->z_projid, + sizeof (zp->z_projid)); + } + + if (attrzp && mask) { + SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, + SA_ZPL_CTIME(zfsvfs), NULL, &ctime, + sizeof (ctime)); + } + + /* + * Do this after setting timestamps to prevent timestamp + * update from toggling bit + */ + + if (xoap && (mask & ATTR_XVATTR)) { + + /* + * 
restore trimmed off masks + * so that return masks can be set for caller. + */ + + if (XVA_ISSET_REQ(tmpxvattr, XAT_APPENDONLY)) { + XVA_SET_REQ(xvap, XAT_APPENDONLY); + } + if (XVA_ISSET_REQ(tmpxvattr, XAT_NOUNLINK)) { + XVA_SET_REQ(xvap, XAT_NOUNLINK); + } + if (XVA_ISSET_REQ(tmpxvattr, XAT_IMMUTABLE)) { + XVA_SET_REQ(xvap, XAT_IMMUTABLE); + } + if (XVA_ISSET_REQ(tmpxvattr, XAT_NODUMP)) { + XVA_SET_REQ(xvap, XAT_NODUMP); + } + if (XVA_ISSET_REQ(tmpxvattr, XAT_AV_MODIFIED)) { + XVA_SET_REQ(xvap, XAT_AV_MODIFIED); + } + if (XVA_ISSET_REQ(tmpxvattr, XAT_AV_QUARANTINED)) { + XVA_SET_REQ(xvap, XAT_AV_QUARANTINED); + } + if (XVA_ISSET_REQ(tmpxvattr, XAT_PROJINHERIT)) { + XVA_SET_REQ(xvap, XAT_PROJINHERIT); + } + + if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) + ASSERT(S_ISREG(ip->i_mode)); + + zfs_xvattr_set(zp, xvap, tx); + } + + if (fuid_dirtied) + zfs_fuid_sync(zfsvfs, tx); + + if (mask != 0) + zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp); + + mutex_exit(&zp->z_lock); + if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE)) + mutex_exit(&zp->z_acl_lock); + + if (attrzp) { + if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE)) + mutex_exit(&attrzp->z_acl_lock); + mutex_exit(&attrzp->z_lock); + } +out: + if (err == 0 && xattr_count > 0) { + err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk, + xattr_count, tx); + ASSERT(err2 == 0); + } + + if (aclp) + zfs_acl_free(aclp); + + if (fuidp) { + zfs_fuid_info_free(fuidp); + fuidp = NULL; + } + + if (err) { + dmu_tx_abort(tx); + if (attrzp) + iput(ZTOI(attrzp)); + if (err == ERESTART) + goto top; + } else { + if (count > 0) + err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); + dmu_tx_commit(tx); + if (attrzp) { + if (err2 == 0 && handle_eadir) + err2 = zfs_setattr_dir(attrzp); + iput(ZTOI(attrzp)); + } + zfs_inode_update(zp); + } + +out2: + if (os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, 0); + +out3: + kmem_free(xattr_bulk, sizeof (sa_bulk_attr_t) * bulks); + kmem_free(bulk, sizeof (sa_bulk_attr_t) * bulks); + kmem_free(tmpxvattr, sizeof (xvattr_t)); + ZFS_EXIT(zfsvfs); + return (err); +} + +typedef struct zfs_zlock { + krwlock_t *zl_rwlock; /* lock we acquired */ + znode_t *zl_znode; /* znode we held */ + struct zfs_zlock *zl_next; /* next in list */ +} zfs_zlock_t; + +/* + * Drop locks and release vnodes that were held by zfs_rename_lock(). + */ +static void +zfs_rename_unlock(zfs_zlock_t **zlpp) +{ + zfs_zlock_t *zl; + + while ((zl = *zlpp) != NULL) { + if (zl->zl_znode != NULL) + zfs_iput_async(ZTOI(zl->zl_znode)); + rw_exit(zl->zl_rwlock); + *zlpp = zl->zl_next; + kmem_free(zl, sizeof (*zl)); + } +} + +/* + * Search back through the directory tree, using the ".." entries. + * Lock each directory in the chain to prevent concurrent renames. + * Fail any attempt to move a directory into one of its own descendants. + * XXX - z_parent_lock can overlap with map or grow locks + */ +static int +zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp) +{ + zfs_zlock_t *zl; + znode_t *zp = tdzp; + uint64_t rootid = ZTOZSB(zp)->z_root; + uint64_t oidp = zp->z_id; + krwlock_t *rwlp = &szp->z_parent_lock; + krw_t rw = RW_WRITER; + + /* + * First pass write-locks szp and compares to zp->z_id. + * Later passes read-lock zp and compare to zp->z_parent. + */ + do { + if (!rw_tryenter(rwlp, rw)) { + /* + * Another thread is renaming in this path. + * Note that if we are a WRITER, we don't have any + * parent_locks held yet. 
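zfs_rename_lock() is essentially an ancestry walk: starting at the target directory it repeatedly follows the ".." (SA_ZPL_PARENT) pointer toward the root, locking each directory as it goes, and fails with EINVAL if it ever meets the directory being renamed, since that would mean the rename is moving a directory underneath itself. The core of the walk that appears below can be sketched as follows; parent_of() is a toy stand-in for the parent lookup, the per-directory locking is omitted, and the sketch assumes the ids form a tree that reaches the given root.

    #include <stdbool.h>
    #include <stdint.h>

    /* Toy parent lookup so the sketch terminates; not the real SA lookup. */
    static uint64_t parent_of(uint64_t id) { return (id >> 1); }

    static bool
    is_descendant(uint64_t target_dir, uint64_t renamed_dir, uint64_t root)
    {
        uint64_t id = target_dir;

        for (;;) {
            if (id == renamed_dir)
                return (true);      /* target lies under the renamed dir */
            if (id == root)
                return (false);     /* reached the top without meeting it */
            id = parent_of(id);
        }
    }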
+ */ + if (rw == RW_READER && zp->z_id > szp->z_id) { + /* + * Drop our locks and restart + */ + zfs_rename_unlock(&zl); + *zlpp = NULL; + zp = tdzp; + oidp = zp->z_id; + rwlp = &szp->z_parent_lock; + rw = RW_WRITER; + continue; + } else { + /* + * Wait for other thread to drop its locks + */ + rw_enter(rwlp, rw); + } + } + + zl = kmem_alloc(sizeof (*zl), KM_SLEEP); + zl->zl_rwlock = rwlp; + zl->zl_znode = NULL; + zl->zl_next = *zlpp; + *zlpp = zl; + + if (oidp == szp->z_id) /* We're a descendant of szp */ + return (SET_ERROR(EINVAL)); + + if (oidp == rootid) /* We've hit the top */ + return (0); + + if (rw == RW_READER) { /* i.e. not the first pass */ + int error = zfs_zget(ZTOZSB(zp), oidp, &zp); + if (error) + return (error); + zl->zl_znode = zp; + } + (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(ZTOZSB(zp)), + &oidp, sizeof (oidp)); + rwlp = &zp->z_parent_lock; + rw = RW_READER; + + } while (zp->z_id != sdzp->z_id); + + return (0); +} + +/* + * Move an entry from the provided source directory to the target + * directory. Change the entry name as indicated. + * + * IN: sdip - Source directory containing the "old entry". + * snm - Old entry name. + * tdip - Target directory to contain the "new entry". + * tnm - New entry name. + * cr - credentials of caller. + * flags - case flags + * + * RETURN: 0 on success, error code on failure. + * + * Timestamps: + * sdip,tdip - ctime|mtime updated + */ +/*ARGSUSED*/ +int +zfs_rename(struct inode *sdip, char *snm, struct inode *tdip, char *tnm, + cred_t *cr, int flags) +{ + znode_t *tdzp, *szp, *tzp; + znode_t *sdzp = ITOZ(sdip); + zfsvfs_t *zfsvfs = ITOZSB(sdip); + zilog_t *zilog; + zfs_dirlock_t *sdl, *tdl; + dmu_tx_t *tx; + zfs_zlock_t *zl; + int cmp, serr, terr; + int error = 0; + int zflg = 0; + boolean_t waited = B_FALSE; + + if (snm == NULL || tnm == NULL) + return (SET_ERROR(EINVAL)); + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(sdzp); + zilog = zfsvfs->z_log; + + tdzp = ITOZ(tdip); + ZFS_VERIFY_ZP(tdzp); + + /* + * We check i_sb because snapshots and the ctldir must have different + * super blocks. + */ + if (tdip->i_sb != sdip->i_sb || zfsctl_is_node(tdip)) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EXDEV)); + } + + if (zfsvfs->z_utf8 && u8_validate(tnm, + strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EILSEQ)); + } + + if (flags & FIGNORECASE) + zflg |= ZCILOOK; + +top: + szp = NULL; + tzp = NULL; + zl = NULL; + + /* + * This is to prevent the creation of links into attribute space + * by renaming a linked file into/outof an attribute directory. + * See the comment in zfs_link() for why this is considered bad. + */ + if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EINVAL)); + } + + /* + * Lock source and target directory entries. To prevent deadlock, + * a lock ordering must be defined. We lock the directory with + * the smallest object id first, or if it's a tie, the one with + * the lexically first name. + */ + if (sdzp->z_id < tdzp->z_id) { + cmp = -1; + } else if (sdzp->z_id > tdzp->z_id) { + cmp = 1; + } else { + /* + * First compare the two name arguments without + * considering any case folding. 
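The lock-ordering rule described above can be read as a comparator: when the source and target directories differ, the one with the smaller object id is locked first; when they are the same object, the tie is broken by comparing the two names (the code just below uses u8_strcmp() with the dataset's normalization settings; plain strcmp() is used here only for illustration).

    #include <stdint.h>
    #include <string.h>

    /* < 0: lock the source side first, > 0: lock the target side first,
     * 0: same directory and same name (nothing to do). */
    static int
    rename_lock_order(uint64_t src_dir, const char *src_name,
        uint64_t tgt_dir, const char *tgt_name)
    {
        if (src_dir < tgt_dir)
            return (-1);
        if (src_dir > tgt_dir)
            return (1);
        return (strcmp(src_name, tgt_name));
    }

Ordering both lock acquisitions by a single global rule is what prevents two concurrent renames between the same pair of directories from deadlocking against each other.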
+ */ + int nofold = (zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER); + + cmp = u8_strcmp(snm, tnm, 0, nofold, U8_UNICODE_LATEST, &error); + ASSERT(error == 0 || !zfsvfs->z_utf8); + if (cmp == 0) { + /* + * POSIX: "If the old argument and the new argument + * both refer to links to the same existing file, + * the rename() function shall return successfully + * and perform no other action." + */ + ZFS_EXIT(zfsvfs); + return (0); + } + /* + * If the file system is case-folding, then we may + * have some more checking to do. A case-folding file + * system is either supporting mixed case sensitivity + * access or is completely case-insensitive. Note + * that the file system is always case preserving. + * + * In mixed sensitivity mode case sensitive behavior + * is the default. FIGNORECASE must be used to + * explicitly request case insensitive behavior. + * + * If the source and target names provided differ only + * by case (e.g., a request to rename 'tim' to 'Tim'), + * we will treat this as a special case in the + * case-insensitive mode: as long as the source name + * is an exact match, we will allow this to proceed as + * a name-change request. + */ + if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE || + (zfsvfs->z_case == ZFS_CASE_MIXED && + flags & FIGNORECASE)) && + u8_strcmp(snm, tnm, 0, zfsvfs->z_norm, U8_UNICODE_LATEST, + &error) == 0) { + /* + * case preserving rename request, require exact + * name matches + */ + zflg |= ZCIEXACT; + zflg &= ~ZCILOOK; + } + } + + /* + * If the source and destination directories are the same, we should + * grab the z_name_lock of that directory only once. + */ + if (sdzp == tdzp) { + zflg |= ZHAVELOCK; + rw_enter(&sdzp->z_name_lock, RW_READER); + } + + if (cmp < 0) { + serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp, + ZEXISTS | zflg, NULL, NULL); + terr = zfs_dirent_lock(&tdl, + tdzp, tnm, &tzp, ZRENAMING | zflg, NULL, NULL); + } else { + terr = zfs_dirent_lock(&tdl, + tdzp, tnm, &tzp, zflg, NULL, NULL); + serr = zfs_dirent_lock(&sdl, + sdzp, snm, &szp, ZEXISTS | ZRENAMING | zflg, + NULL, NULL); + } + + if (serr) { + /* + * Source entry invalid or not there. + */ + if (!terr) { + zfs_dirent_unlock(tdl); + if (tzp) + iput(ZTOI(tzp)); + } + + if (sdzp == tdzp) + rw_exit(&sdzp->z_name_lock); + + if (strcmp(snm, "..") == 0) + serr = EINVAL; + ZFS_EXIT(zfsvfs); + return (serr); + } + if (terr) { + zfs_dirent_unlock(sdl); + iput(ZTOI(szp)); + + if (sdzp == tdzp) + rw_exit(&sdzp->z_name_lock); + + if (strcmp(tnm, "..") == 0) + terr = EINVAL; + ZFS_EXIT(zfsvfs); + return (terr); + } + + /* + * If we are using project inheritance, means if the directory has + * ZFS_PROJINHERIT set, then its descendant directories will inherit + * not only the project ID, but also the ZFS_PROJINHERIT flag. Under + * such case, we only allow renames into our tree when the project + * IDs are the same. + */ + if (tdzp->z_pflags & ZFS_PROJINHERIT && + tdzp->z_projid != szp->z_projid) { + error = SET_ERROR(EXDEV); + goto out; + } + + /* + * Must have write access at the source to remove the old entry + * and write access at the target to create the new entry. + * Note that if target and source are the same, this can be + * done in a single check. + */ + + if ((error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr))) + goto out; + + if (S_ISDIR(ZTOI(szp)->i_mode)) { + /* + * Check to make sure rename is valid. + * Can't do a move like this: /usr/a/b to /usr/a/b/c/d + */ + if ((error = zfs_rename_lock(szp, tdzp, sdzp, &zl))) + goto out; + } + + /* + * Does target exist? 
+ */ + if (tzp) { + /* + * Source and target must be the same type. + */ + if (S_ISDIR(ZTOI(szp)->i_mode)) { + if (!S_ISDIR(ZTOI(tzp)->i_mode)) { + error = SET_ERROR(ENOTDIR); + goto out; + } + } else { + if (S_ISDIR(ZTOI(tzp)->i_mode)) { + error = SET_ERROR(EISDIR); + goto out; + } + } + /* + * POSIX dictates that when the source and target + * entries refer to the same file object, rename + * must do nothing and exit without error. + */ + if (szp->z_id == tzp->z_id) { + error = 0; + goto out; + } + } + + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); + dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE); + dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm); + dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm); + if (sdzp != tdzp) { + dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE); + zfs_sa_upgrade_txholds(tx, tdzp); + } + if (tzp) { + dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE); + zfs_sa_upgrade_txholds(tx, tzp); + } + + zfs_sa_upgrade_txholds(tx, szp); + dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); + error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); + if (error) { + if (zl != NULL) + zfs_rename_unlock(&zl); + zfs_dirent_unlock(sdl); + zfs_dirent_unlock(tdl); + + if (sdzp == tdzp) + rw_exit(&sdzp->z_name_lock); + + if (error == ERESTART) { + waited = B_TRUE; + dmu_tx_wait(tx); + dmu_tx_abort(tx); + iput(ZTOI(szp)); + if (tzp) + iput(ZTOI(tzp)); + goto top; + } + dmu_tx_abort(tx); + iput(ZTOI(szp)); + if (tzp) + iput(ZTOI(tzp)); + ZFS_EXIT(zfsvfs); + return (error); + } + + if (tzp) /* Attempt to remove the existing target */ + error = zfs_link_destroy(tdl, tzp, tx, zflg, NULL); + + if (error == 0) { + error = zfs_link_create(tdl, szp, tx, ZRENAMING); + if (error == 0) { + szp->z_pflags |= ZFS_AV_MODIFIED; + if (tdzp->z_pflags & ZFS_PROJINHERIT) + szp->z_pflags |= ZFS_PROJINHERIT; + + error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs), + (void *)&szp->z_pflags, sizeof (uint64_t), tx); + ASSERT0(error); + + error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL); + if (error == 0) { + zfs_log_rename(zilog, tx, TX_RENAME | + (flags & FIGNORECASE ? TX_CI : 0), sdzp, + sdl->dl_name, tdzp, tdl->dl_name, szp); + } else { + /* + * At this point, we have successfully created + * the target name, but have failed to remove + * the source name. Since the create was done + * with the ZRENAMING flag, there are + * complications; for one, the link count is + * wrong. The easiest way to deal with this + * is to remove the newly created target, and + * return the original error. This must + * succeed; fortunately, it is very unlikely to + * fail, since we just created it. + */ + VERIFY3U(zfs_link_destroy(tdl, szp, tx, + ZRENAMING, NULL), ==, 0); + } + } else { + /* + * If we had removed the existing target, subsequent + * call to zfs_link_create() to add back the same entry + * but, the new dnode (szp) should not fail. + */ + ASSERT(tzp == NULL); + } + } + + dmu_tx_commit(tx); +out: + if (zl != NULL) + zfs_rename_unlock(&zl); + + zfs_dirent_unlock(sdl); + zfs_dirent_unlock(tdl); + + zfs_inode_update(sdzp); + if (sdzp == tdzp) + rw_exit(&sdzp->z_name_lock); + + if (sdzp != tdzp) + zfs_inode_update(tdzp); + + zfs_inode_update(szp); + iput(ZTOI(szp)); + if (tzp) { + zfs_inode_update(tzp); + iput(ZTOI(tzp)); + } + + if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, 0); + + ZFS_EXIT(zfsvfs); + return (error); +} + +/* + * Insert the indicated symbolic reference entry into the directory. + * + * IN: dip - Directory to contain new symbolic link. 
+ * name - Name of directory entry in dip. + * vap - Attributes of new entry. + * link - Name for new symlink entry. + * cr - credentials of caller. + * flags - case flags + * + * OUT: ipp - Inode for new symbolic link. + * + * RETURN: 0 on success, error code on failure. + * + * Timestamps: + * dip - ctime|mtime updated + */ +/*ARGSUSED*/ +int +zfs_symlink(struct inode *dip, char *name, vattr_t *vap, char *link, + struct inode **ipp, cred_t *cr, int flags) +{ + znode_t *zp, *dzp = ITOZ(dip); + zfs_dirlock_t *dl; + dmu_tx_t *tx; + zfsvfs_t *zfsvfs = ITOZSB(dip); + zilog_t *zilog; + uint64_t len = strlen(link); + int error; + int zflg = ZNEW; + zfs_acl_ids_t acl_ids; + boolean_t fuid_dirtied; + uint64_t txtype = TX_SYMLINK; + boolean_t waited = B_FALSE; + + ASSERT(S_ISLNK(vap->va_mode)); + + if (name == NULL) + return (SET_ERROR(EINVAL)); + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(dzp); + zilog = zfsvfs->z_log; + + if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), + NULL, U8_VALIDATE_ENTIRE, &error) < 0) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EILSEQ)); + } + if (flags & FIGNORECASE) + zflg |= ZCILOOK; + + if (len > MAXPATHLEN) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(ENAMETOOLONG)); + } + + if ((error = zfs_acl_ids_create(dzp, 0, + vap, cr, NULL, &acl_ids)) != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } +top: + *ipp = NULL; + + /* + * Attempt to lock directory; fail if entry already exists. + */ + error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL); + if (error) { + zfs_acl_ids_free(&acl_ids); + ZFS_EXIT(zfsvfs); + return (error); + } + + if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) { + zfs_acl_ids_free(&acl_ids); + zfs_dirent_unlock(dl); + ZFS_EXIT(zfsvfs); + return (error); + } + + if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, ZFS_DEFAULT_PROJID)) { + zfs_acl_ids_free(&acl_ids); + zfs_dirent_unlock(dl); + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EDQUOT)); + } + tx = dmu_tx_create(zfsvfs->z_os); + fuid_dirtied = zfsvfs->z_fuid_dirty; + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len)); + dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); + dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + + ZFS_SA_BASE_ATTR_SIZE + len); + dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); + if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, + acl_ids.z_aclp->z_acl_bytes); + } + if (fuid_dirtied) + zfs_fuid_txhold(zfsvfs, tx); + error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); + if (error) { + zfs_dirent_unlock(dl); + if (error == ERESTART) { + waited = B_TRUE; + dmu_tx_wait(tx); + dmu_tx_abort(tx); + goto top; + } + zfs_acl_ids_free(&acl_ids); + dmu_tx_abort(tx); + ZFS_EXIT(zfsvfs); + return (error); + } + + /* + * Create a new object for the symlink. + * for version 4 ZPL datsets the symlink will be an SA attribute + */ + zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); + + if (fuid_dirtied) + zfs_fuid_sync(zfsvfs, tx); + + mutex_enter(&zp->z_lock); + if (zp->z_is_sa) + error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs), + link, len, tx); + else + zfs_sa_symlink(zp, link, len, tx); + mutex_exit(&zp->z_lock); + + zp->z_size = len; + (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs), + &zp->z_size, sizeof (zp->z_size), tx); + /* + * Insert the new object into the directory. 
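+	 * If the entry cannot be created, the half-constructed znode is
+	 * deleted in the same transaction, removed from the inode hash,
+	 * and released with iput() once the transaction commits.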
+ */ + error = zfs_link_create(dl, zp, tx, ZNEW); + if (error != 0) { + zfs_znode_delete(zp, tx); + remove_inode_hash(ZTOI(zp)); + } else { + if (flags & FIGNORECASE) + txtype |= TX_CI; + zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link); + + zfs_inode_update(dzp); + zfs_inode_update(zp); + } + + zfs_acl_ids_free(&acl_ids); + + dmu_tx_commit(tx); + + zfs_dirent_unlock(dl); + + if (error == 0) { + *ipp = ZTOI(zp); + + if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, 0); + } else { + iput(ZTOI(zp)); + } + + ZFS_EXIT(zfsvfs); + return (error); +} + +/* + * Return, in the buffer contained in the provided uio structure, + * the symbolic path referred to by ip. + * + * IN: ip - inode of symbolic link + * uio - structure to contain the link path. + * cr - credentials of caller. + * + * RETURN: 0 if success + * error code if failure + * + * Timestamps: + * ip - atime updated + */ +/* ARGSUSED */ +int +zfs_readlink(struct inode *ip, uio_t *uio, cred_t *cr) +{ + znode_t *zp = ITOZ(ip); + zfsvfs_t *zfsvfs = ITOZSB(ip); + int error; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + mutex_enter(&zp->z_lock); + if (zp->z_is_sa) + error = sa_lookup_uio(zp->z_sa_hdl, + SA_ZPL_SYMLINK(zfsvfs), uio); + else + error = zfs_sa_readlink(zp, uio); + mutex_exit(&zp->z_lock); + + ZFS_EXIT(zfsvfs); + return (error); +} + +/* + * Insert a new entry into directory tdip referencing sip. + * + * IN: tdip - Directory to contain new entry. + * sip - inode of new entry. + * name - name of new entry. + * cr - credentials of caller. + * flags - case flags. + * + * RETURN: 0 if success + * error code if failure + * + * Timestamps: + * tdip - ctime|mtime updated + * sip - ctime updated + */ +/* ARGSUSED */ +int +zfs_link(struct inode *tdip, struct inode *sip, char *name, cred_t *cr, + int flags) +{ + znode_t *dzp = ITOZ(tdip); + znode_t *tzp, *szp; + zfsvfs_t *zfsvfs = ITOZSB(tdip); + zilog_t *zilog; + zfs_dirlock_t *dl; + dmu_tx_t *tx; + int error; + int zf = ZNEW; + uint64_t parent; + uid_t owner; + boolean_t waited = B_FALSE; + boolean_t is_tmpfile = 0; + uint64_t txg; +#ifdef HAVE_TMPFILE + is_tmpfile = (sip->i_nlink == 0 && (sip->i_state & I_LINKABLE)); +#endif + ASSERT(S_ISDIR(tdip->i_mode)); + + if (name == NULL) + return (SET_ERROR(EINVAL)); + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(dzp); + zilog = zfsvfs->z_log; + + /* + * POSIX dictates that we return EPERM here. + * Better choices include ENOTSUP or EISDIR. + */ + if (S_ISDIR(sip->i_mode)) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EPERM)); + } + + szp = ITOZ(sip); + ZFS_VERIFY_ZP(szp); + + /* + * If we are using project inheritance, means if the directory has + * ZFS_PROJINHERIT set, then its descendant directories will inherit + * not only the project ID, but also the ZFS_PROJINHERIT flag. Under + * such case, we only allow hard link creation in our tree when the + * project IDs are the same. + */ + if (dzp->z_pflags & ZFS_PROJINHERIT && dzp->z_projid != szp->z_projid) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EXDEV)); + } + + /* + * We check i_sb because snapshots and the ctldir must have different + * super blocks. 
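+	 * A hard link across that boundary would span file systems, so it
+	 * is rejected with EXDEV just like any other cross-device link.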
+ */ + if (sip->i_sb != tdip->i_sb || zfsctl_is_node(sip)) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EXDEV)); + } + + /* Prevent links to .zfs/shares files */ + + if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), + &parent, sizeof (uint64_t))) != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + if (parent == zfsvfs->z_shares_dir) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EPERM)); + } + + if (zfsvfs->z_utf8 && u8_validate(name, + strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EILSEQ)); + } + if (flags & FIGNORECASE) + zf |= ZCILOOK; + + /* + * We do not support links between attributes and non-attributes + * because of the potential security risk of creating links + * into "normal" file space in order to circumvent restrictions + * imposed in attribute space. + */ + if ((szp->z_pflags & ZFS_XATTR) != (dzp->z_pflags & ZFS_XATTR)) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EINVAL)); + } + + owner = zfs_fuid_map_id(zfsvfs, KUID_TO_SUID(sip->i_uid), + cr, ZFS_OWNER); + if (owner != crgetuid(cr) && secpolicy_basic_link(cr) != 0) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EPERM)); + } + + if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) { + ZFS_EXIT(zfsvfs); + return (error); + } + +top: + /* + * Attempt to lock directory; fail if entry already exists. + */ + error = zfs_dirent_lock(&dl, dzp, name, &tzp, zf, NULL, NULL); + if (error) { + ZFS_EXIT(zfsvfs); + return (error); + } + + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); + dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); + if (is_tmpfile) + dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); + + zfs_sa_upgrade_txholds(tx, szp); + zfs_sa_upgrade_txholds(tx, dzp); + error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); + if (error) { + zfs_dirent_unlock(dl); + if (error == ERESTART) { + waited = B_TRUE; + dmu_tx_wait(tx); + dmu_tx_abort(tx); + goto top; + } + dmu_tx_abort(tx); + ZFS_EXIT(zfsvfs); + return (error); + } + /* unmark z_unlinked so zfs_link_create will not reject */ + if (is_tmpfile) + szp->z_unlinked = B_FALSE; + error = zfs_link_create(dl, szp, tx, 0); + + if (error == 0) { + uint64_t txtype = TX_LINK; + /* + * tmpfile is created to be in z_unlinkedobj, so remove it. + * Also, we don't log in ZIL, because all previous file + * operation on the tmpfile are ignored by ZIL. Instead we + * always wait for txg to sync to make sure all previous + * operation are sync safe. + */ + if (is_tmpfile) { + VERIFY(zap_remove_int(zfsvfs->z_os, + zfsvfs->z_unlinkedobj, szp->z_id, tx) == 0); + } else { + if (flags & FIGNORECASE) + txtype |= TX_CI; + zfs_log_link(zilog, tx, txtype, dzp, szp, name); + } + } else if (is_tmpfile) { + /* restore z_unlinked since when linking failed */ + szp->z_unlinked = B_TRUE; + } + txg = dmu_tx_get_txg(tx); + dmu_tx_commit(tx); + + zfs_dirent_unlock(dl); + + if (!is_tmpfile && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, 0); + + if (is_tmpfile) + txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), txg); + + zfs_inode_update(dzp); + zfs_inode_update(szp); + ZFS_EXIT(zfsvfs); + return (error); +} + +static void +zfs_putpage_commit_cb(void *arg) +{ + struct page *pp = arg; + + ClearPageError(pp); + end_page_writeback(pp); +} + +/* + * Push a page out to disk, once the page is on stable storage the + * registered commit callback will be run as notification of completion. + * + * IN: ip - page mapped for inode. 
+ * pp - page to push (page is locked) + * wbc - writeback control data + * + * RETURN: 0 if success + * error code if failure + * + * Timestamps: + * ip - ctime|mtime updated + */ +/* ARGSUSED */ +int +zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc) +{ + znode_t *zp = ITOZ(ip); + zfsvfs_t *zfsvfs = ITOZSB(ip); + loff_t offset; + loff_t pgoff; + unsigned int pglen; + dmu_tx_t *tx; + caddr_t va; + int err = 0; + uint64_t mtime[2], ctime[2]; + sa_bulk_attr_t bulk[3]; + int cnt = 0; + struct address_space *mapping; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + ASSERT(PageLocked(pp)); + + pgoff = page_offset(pp); /* Page byte-offset in file */ + offset = i_size_read(ip); /* File length in bytes */ + pglen = MIN(PAGE_SIZE, /* Page length in bytes */ + P2ROUNDUP(offset, PAGE_SIZE)-pgoff); + + /* Page is beyond end of file */ + if (pgoff >= offset) { + unlock_page(pp); + ZFS_EXIT(zfsvfs); + return (0); + } + + /* Truncate page length to end of file */ + if (pgoff + pglen > offset) + pglen = offset - pgoff; + +#if 0 + /* + * FIXME: Allow mmap writes past its quota. The correct fix + * is to register a page_mkwrite() handler to count the page + * against its quota when it is about to be dirtied. + */ + if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT, + KUID_TO_SUID(ip->i_uid)) || + zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT, + KGID_TO_SGID(ip->i_gid)) || + (zp->z_projid != ZFS_DEFAULT_PROJID && + zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT, + zp->z_projid))) { + err = EDQUOT; + } +#endif + + /* + * The ordering here is critical and must adhere to the following + * rules in order to avoid deadlocking in either zfs_read() or + * zfs_free_range() due to a lock inversion. + * + * 1) The page must be unlocked prior to acquiring the range lock. + * This is critical because zfs_read() calls find_lock_page() + * which may block on the page lock while holding the range lock. + * + * 2) Before setting or clearing write back on a page the range lock + * must be held in order to prevent a lock inversion with the + * zfs_free_range() function. + * + * This presents a problem because upon entering this function the + * page lock is already held. To safely acquire the range lock the + * page lock must be dropped. This creates a window where another + * process could truncate, invalidate, dirty, or write out the page. + * + * Therefore, after successfully reacquiring the range and page locks + * the current page state is checked. In the common case everything + * will be as is expected and it can be written out. However, if + * the page state has changed it must be handled accordingly. 
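+	 *
+	 * Concretely, the code below gives up if the page mapping changed
+	 * or the page is no longer dirty, optionally waits if another
+	 * thread already started writeback, and only proceeds once it has
+	 * cleared the dirty bit itself while holding both locks.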
+ */ + mapping = pp->mapping; + redirty_page_for_writepage(wbc, pp); + unlock_page(pp); + + locked_range_t *lr = rangelock_enter(&zp->z_rangelock, + pgoff, pglen, RL_WRITER); + lock_page(pp); + + /* Page mapping changed or it was no longer dirty, we're done */ + if (unlikely((mapping != pp->mapping) || !PageDirty(pp))) { + unlock_page(pp); + rangelock_exit(lr); + ZFS_EXIT(zfsvfs); + return (0); + } + + /* Another process started write block if required */ + if (PageWriteback(pp)) { + unlock_page(pp); + rangelock_exit(lr); + + if (wbc->sync_mode != WB_SYNC_NONE) { + if (PageWriteback(pp)) + wait_on_page_bit(pp, PG_writeback); + } + + ZFS_EXIT(zfsvfs); + return (0); + } + + /* Clear the dirty flag the required locks are held */ + if (!clear_page_dirty_for_io(pp)) { + unlock_page(pp); + rangelock_exit(lr); + ZFS_EXIT(zfsvfs); + return (0); + } + + /* + * Counterpart for redirty_page_for_writepage() above. This page + * was in fact not skipped and should not be counted as if it were. + */ + wbc->pages_skipped--; + set_page_writeback(pp); + unlock_page(pp); + + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_write(tx, zp->z_id, pgoff, pglen); + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + zfs_sa_upgrade_txholds(tx, zp); + + err = dmu_tx_assign(tx, TXG_NOWAIT); + if (err != 0) { + if (err == ERESTART) + dmu_tx_wait(tx); + + dmu_tx_abort(tx); + __set_page_dirty_nobuffers(pp); + ClearPageError(pp); + end_page_writeback(pp); + rangelock_exit(lr); + ZFS_EXIT(zfsvfs); + return (err); + } + + va = kmap(pp); + ASSERT3U(pglen, <=, PAGE_SIZE); + dmu_write(zfsvfs->z_os, zp->z_id, pgoff, pglen, va, tx); + kunmap(pp); + + SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); + SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); + SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_FLAGS(zfsvfs), NULL, + &zp->z_pflags, 8); + + /* Preserve the mtime and ctime provided by the inode */ + ZFS_TIME_ENCODE(&ip->i_mtime, mtime); + ZFS_TIME_ENCODE(&ip->i_ctime, ctime); + zp->z_atime_dirty = B_FALSE; + zp->z_seq++; + + err = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx); + + zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, pgoff, pglen, 0, + zfs_putpage_commit_cb, pp); + dmu_tx_commit(tx); + + rangelock_exit(lr); + + if (wbc->sync_mode != WB_SYNC_NONE) { + /* + * Note that this is rarely called under writepages(), because + * writepages() normally handles the entire commit for + * performance reasons. + */ + zil_commit(zfsvfs->z_log, zp->z_id); + } + + ZFS_EXIT(zfsvfs); + return (err); +} + +/* + * Update the system attributes when the inode has been dirtied. For the + * moment we only update the mode, atime, mtime, and ctime. + */ +int +zfs_dirty_inode(struct inode *ip, int flags) +{ + znode_t *zp = ITOZ(ip); + zfsvfs_t *zfsvfs = ITOZSB(ip); + dmu_tx_t *tx; + uint64_t mode, atime[2], mtime[2], ctime[2]; + sa_bulk_attr_t bulk[4]; + int error = 0; + int cnt = 0; + + if (zfs_is_readonly(zfsvfs) || dmu_objset_is_snapshot(zfsvfs->z_os)) + return (0); + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + +#ifdef I_DIRTY_TIME + /* + * This is the lazytime semantic introduced in Linux 4.0 + * This flag will only be called from update_time when lazytime is set. + * (Note, I_DIRTY_SYNC will also set if not lazytime) + * Fortunately mtime and ctime are managed within ZFS itself, so we + * only need to dirty atime. 
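+	 * The dirtied atime is not written out here; it is flushed later,
+	 * for example by zfs_inactive() when the inode is finally evicted.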
+ */ + if (flags == I_DIRTY_TIME) { + zp->z_atime_dirty = B_TRUE; + goto out; + } +#endif + + tx = dmu_tx_create(zfsvfs->z_os); + + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + zfs_sa_upgrade_txholds(tx, zp); + + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + goto out; + } + + mutex_enter(&zp->z_lock); + zp->z_atime_dirty = B_FALSE; + + SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8); + SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16); + SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); + SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); + + /* Preserve the mode, mtime and ctime provided by the inode */ + ZFS_TIME_ENCODE(&ip->i_atime, atime); + ZFS_TIME_ENCODE(&ip->i_mtime, mtime); + ZFS_TIME_ENCODE(&ip->i_ctime, ctime); + mode = ip->i_mode; + + zp->z_mode = mode; + + error = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx); + mutex_exit(&zp->z_lock); + + dmu_tx_commit(tx); +out: + ZFS_EXIT(zfsvfs); + return (error); +} + +/*ARGSUSED*/ +void +zfs_inactive(struct inode *ip) +{ + znode_t *zp = ITOZ(ip); + zfsvfs_t *zfsvfs = ITOZSB(ip); + uint64_t atime[2]; + int error; + int need_unlock = 0; + + /* Only read lock if we haven't already write locked, e.g. rollback */ + if (!RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock)) { + need_unlock = 1; + rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER); + } + if (zp->z_sa_hdl == NULL) { + if (need_unlock) + rw_exit(&zfsvfs->z_teardown_inactive_lock); + return; + } + + if (zp->z_atime_dirty && zp->z_unlinked == B_FALSE) { + dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os); + + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + zfs_sa_upgrade_txholds(tx, zp); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + } else { + ZFS_TIME_ENCODE(&ip->i_atime, atime); + mutex_enter(&zp->z_lock); + (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs), + (void *)&atime, sizeof (atime), tx); + zp->z_atime_dirty = B_FALSE; + mutex_exit(&zp->z_lock); + dmu_tx_commit(tx); + } + } + + zfs_zinactive(zp); + if (need_unlock) + rw_exit(&zfsvfs->z_teardown_inactive_lock); +} + +/* + * Bounds-check the seek operation. + * + * IN: ip - inode seeking within + * ooff - old file offset + * noffp - pointer to new file offset + * + * RETURN: 0 if success + * EINVAL if new offset invalid + */ +/* ARGSUSED */ +int +zfs_seek(struct inode *ip, offset_t ooff, offset_t *noffp) +{ + if (S_ISDIR(ip->i_mode)) + return (0); + return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0); +} + +/* + * Fill pages with data from the disk. + */ +static int +zfs_fillpage(struct inode *ip, struct page *pl[], int nr_pages) +{ + znode_t *zp = ITOZ(ip); + zfsvfs_t *zfsvfs = ITOZSB(ip); + objset_t *os; + struct page *cur_pp; + u_offset_t io_off, total; + size_t io_len; + loff_t i_size; + unsigned page_idx; + int err; + + os = zfsvfs->z_os; + io_len = nr_pages << PAGE_SHIFT; + i_size = i_size_read(ip); + io_off = page_offset(pl[0]); + + if (io_off + io_len > i_size) + io_len = i_size - io_off; + + /* + * Iterate over list of pages and read each page individually. 
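+	 * The pages in pl[] are treated as contiguous in file offset,
+	 * starting at page_offset(pl[0]); checksum errors from the DMU
+	 * are reported to the caller as EIO.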
+ */ + page_idx = 0; + for (total = io_off + io_len; io_off < total; io_off += PAGESIZE) { + caddr_t va; + + cur_pp = pl[page_idx++]; + va = kmap(cur_pp); + err = dmu_read(os, zp->z_id, io_off, PAGESIZE, va, + DMU_READ_PREFETCH); + kunmap(cur_pp); + if (err) { + /* convert checksum errors into IO errors */ + if (err == ECKSUM) + err = SET_ERROR(EIO); + return (err); + } + } + + return (0); +} + +/* + * Uses zfs_fillpage to read data from the file and fill the pages. + * + * IN: ip - inode of file to get data from. + * pl - list of pages to read + * nr_pages - number of pages to read + * + * RETURN: 0 on success, error code on failure. + * + * Timestamps: + * vp - atime updated + */ +/* ARGSUSED */ +int +zfs_getpage(struct inode *ip, struct page *pl[], int nr_pages) +{ + znode_t *zp = ITOZ(ip); + zfsvfs_t *zfsvfs = ITOZSB(ip); + int err; + + if (pl == NULL) + return (0); + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + err = zfs_fillpage(ip, pl, nr_pages); + + ZFS_EXIT(zfsvfs); + return (err); +} + +/* + * Check ZFS specific permissions to memory map a section of a file. + * + * IN: ip - inode of the file to mmap + * off - file offset + * addrp - start address in memory region + * len - length of memory region + * vm_flags- address flags + * + * RETURN: 0 if success + * error code if failure + */ +/*ARGSUSED*/ +int +zfs_map(struct inode *ip, offset_t off, caddr_t *addrp, size_t len, + unsigned long vm_flags) +{ + znode_t *zp = ITOZ(ip); + zfsvfs_t *zfsvfs = ITOZSB(ip); + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + if ((vm_flags & VM_WRITE) && (zp->z_pflags & + (ZFS_IMMUTABLE | ZFS_READONLY | ZFS_APPENDONLY))) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EPERM)); + } + + if ((vm_flags & (VM_READ | VM_EXEC)) && + (zp->z_pflags & ZFS_AV_QUARANTINED)) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EACCES)); + } + + if (off < 0 || len > MAXOFFSET_T - off) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(ENXIO)); + } + + ZFS_EXIT(zfsvfs); + return (0); +} + +/* + * convoff - converts the given data (start, whence) to the + * given whence. + */ +int +convoff(struct inode *ip, flock64_t *lckdat, int whence, offset_t offset) +{ + vattr_t vap; + int error; + + if ((lckdat->l_whence == SEEK_END) || (whence == SEEK_END)) { + if ((error = zfs_getattr(ip, &vap, 0, CRED()))) + return (error); + } + + switch (lckdat->l_whence) { + case SEEK_CUR: + lckdat->l_start += offset; + break; + case SEEK_END: + lckdat->l_start += vap.va_size; + /* FALLTHRU */ + case SEEK_SET: + break; + default: + return (SET_ERROR(EINVAL)); + } + + if (lckdat->l_start < 0) + return (SET_ERROR(EINVAL)); + + switch (whence) { + case SEEK_CUR: + lckdat->l_start -= offset; + break; + case SEEK_END: + lckdat->l_start -= vap.va_size; + /* FALLTHRU */ + case SEEK_SET: + break; + default: + return (SET_ERROR(EINVAL)); + } + + lckdat->l_whence = (short)whence; + return (0); +} + +/* + * Free or allocate space in a file. Currently, this function only + * supports the `F_FREESP' command. However, this command is somewhat + * misnamed, as its functionality includes the ability to allocate as + * well as free space. + * + * IN: ip - inode of file to free data in. + * cmd - action to take (only F_FREESP supported). + * bfp - section of file to free/alloc. + * flag - current file open mode flags. + * offset - current file offset. + * cr - credentials of caller. + * + * RETURN: 0 on success, error code on failure. 
+ * + * Timestamps: + * ip - ctime|mtime updated + */ +/* ARGSUSED */ +int +zfs_space(struct inode *ip, int cmd, flock64_t *bfp, int flag, + offset_t offset, cred_t *cr) +{ + znode_t *zp = ITOZ(ip); + zfsvfs_t *zfsvfs = ITOZSB(ip); + uint64_t off, len; + int error; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + if (cmd != F_FREESP) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EINVAL)); + } + + /* + * Callers might not be able to detect properly that we are read-only, + * so check it explicitly here. + */ + if (zfs_is_readonly(zfsvfs)) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EROFS)); + } + + if ((error = convoff(ip, bfp, SEEK_SET, offset))) { + ZFS_EXIT(zfsvfs); + return (error); + } + + if (bfp->l_len < 0) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EINVAL)); + } + + /* + * Permissions aren't checked on Solaris because on this OS + * zfs_space() can only be called with an opened file handle. + * On Linux we can get here through truncate_range() which + * operates directly on inodes, so we need to check access rights. + */ + if ((error = zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr))) { + ZFS_EXIT(zfsvfs); + return (error); + } + + off = bfp->l_start; + len = bfp->l_len; /* 0 means from off to end of file */ + + error = zfs_freesp(zp, off, len, flag, TRUE); + + ZFS_EXIT(zfsvfs); + return (error); +} + +/*ARGSUSED*/ +int +zfs_fid(struct inode *ip, fid_t *fidp) +{ + znode_t *zp = ITOZ(ip); + zfsvfs_t *zfsvfs = ITOZSB(ip); + uint32_t gen; + uint64_t gen64; + uint64_t object = zp->z_id; + zfid_short_t *zfid; + int size, i, error; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), + &gen64, sizeof (uint64_t))) != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + + gen = (uint32_t)gen64; + + size = SHORT_FID_LEN; + + zfid = (zfid_short_t *)fidp; + + zfid->zf_len = size; + + for (i = 0; i < sizeof (zfid->zf_object); i++) + zfid->zf_object[i] = (uint8_t)(object >> (8 * i)); + + /* Must have a non-zero generation number to distinguish from .zfs */ + if (gen == 0) + gen = 1; + for (i = 0; i < sizeof (zfid->zf_gen); i++) + zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i)); + + ZFS_EXIT(zfsvfs); + return (0); +} + +/*ARGSUSED*/ +int +zfs_getsecattr(struct inode *ip, vsecattr_t *vsecp, int flag, cred_t *cr) +{ + znode_t *zp = ITOZ(ip); + zfsvfs_t *zfsvfs = ITOZSB(ip); + int error; + boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + error = zfs_getacl(zp, vsecp, skipaclchk, cr); + ZFS_EXIT(zfsvfs); + + return (error); +} + +/*ARGSUSED*/ +int +zfs_setsecattr(struct inode *ip, vsecattr_t *vsecp, int flag, cred_t *cr) +{ + znode_t *zp = ITOZ(ip); + zfsvfs_t *zfsvfs = ITOZSB(ip); + int error; + boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; + zilog_t *zilog = zfsvfs->z_log; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + error = zfs_setacl(zp, vsecp, skipaclchk, cr); + + if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, 0); + + ZFS_EXIT(zfsvfs); + return (error); +} + +#ifdef HAVE_UIO_ZEROCOPY +/* + * The smallest read we may consider to loan out an arcbuf. + * This must be a power of 2. + */ +int zcr_blksz_min = (1 << 10); /* 1K */ +/* + * If set to less than the file block size, allow loaning out of an + * arcbuf for a partial block read. This must be a power of 2. 
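+ *
+ * Together with zcr_blksz_min above, this clamps the block size used
+ * for loaned read buffers to the 1K..128K range by default.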
+ */ +int zcr_blksz_max = (1 << 17); /* 128K */ + +/*ARGSUSED*/ +static int +zfs_reqzcbuf(struct inode *ip, enum uio_rw ioflag, xuio_t *xuio, cred_t *cr) +{ + znode_t *zp = ITOZ(ip); + zfsvfs_t *zfsvfs = ITOZSB(ip); + int max_blksz = zfsvfs->z_max_blksz; + uio_t *uio = &xuio->xu_uio; + ssize_t size = uio->uio_resid; + offset_t offset = uio->uio_loffset; + int blksz; + int fullblk, i; + arc_buf_t *abuf; + ssize_t maxsize; + int preamble, postamble; + + if (xuio->xu_type != UIOTYPE_ZEROCOPY) + return (SET_ERROR(EINVAL)); + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + switch (ioflag) { + case UIO_WRITE: + /* + * Loan out an arc_buf for write if write size is bigger than + * max_blksz, and the file's block size is also max_blksz. + */ + blksz = max_blksz; + if (size < blksz || zp->z_blksz != blksz) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EINVAL)); + } + /* + * Caller requests buffers for write before knowing where the + * write offset might be (e.g. NFS TCP write). + */ + if (offset == -1) { + preamble = 0; + } else { + preamble = P2PHASE(offset, blksz); + if (preamble) { + preamble = blksz - preamble; + size -= preamble; + } + } + + postamble = P2PHASE(size, blksz); + size -= postamble; + + fullblk = size / blksz; + (void) dmu_xuio_init(xuio, + (preamble != 0) + fullblk + (postamble != 0)); + + /* + * Have to fix iov base/len for partial buffers. They + * currently represent full arc_buf's. + */ + if (preamble) { + /* data begins in the middle of the arc_buf */ + abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), + blksz); + ASSERT(abuf); + (void) dmu_xuio_add(xuio, abuf, + blksz - preamble, preamble); + } + + for (i = 0; i < fullblk; i++) { + abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), + blksz); + ASSERT(abuf); + (void) dmu_xuio_add(xuio, abuf, 0, blksz); + } + + if (postamble) { + /* data ends in the middle of the arc_buf */ + abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), + blksz); + ASSERT(abuf); + (void) dmu_xuio_add(xuio, abuf, 0, postamble); + } + break; + case UIO_READ: + /* + * Loan out an arc_buf for read if the read size is larger than + * the current file block size. Block alignment is not + * considered. Partial arc_buf will be loaned out for read. + */ + blksz = zp->z_blksz; + if (blksz < zcr_blksz_min) + blksz = zcr_blksz_min; + if (blksz > zcr_blksz_max) + blksz = zcr_blksz_max; + /* avoid potential complexity of dealing with it */ + if (blksz > max_blksz) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EINVAL)); + } + + maxsize = zp->z_size - uio->uio_loffset; + if (size > maxsize) + size = maxsize; + + if (size < blksz) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EINVAL)); + } + break; + default: + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EINVAL)); + } + + uio->uio_extflg = UIO_XUIO; + XUIO_XUZC_RW(xuio) = ioflag; + ZFS_EXIT(zfsvfs); + return (0); +} + +/*ARGSUSED*/ +static int +zfs_retzcbuf(struct inode *ip, xuio_t *xuio, cred_t *cr) +{ + int i; + arc_buf_t *abuf; + int ioflag = XUIO_XUZC_RW(xuio); + + ASSERT(xuio->xu_type == UIOTYPE_ZEROCOPY); + + i = dmu_xuio_cnt(xuio); + while (i-- > 0) { + abuf = dmu_xuio_arcbuf(xuio, i); + /* + * if abuf == NULL, it must be a write buffer + * that has been returned in zfs_write(). 
+ */ + if (abuf) + dmu_return_arcbuf(abuf); + ASSERT(abuf || ioflag == UIO_WRITE); + } + + dmu_xuio_fini(xuio); + return (0); +} +#endif /* HAVE_UIO_ZEROCOPY */ + +#if defined(_KERNEL) +EXPORT_SYMBOL(zfs_open); +EXPORT_SYMBOL(zfs_close); +EXPORT_SYMBOL(zfs_read); +EXPORT_SYMBOL(zfs_write); +EXPORT_SYMBOL(zfs_access); +EXPORT_SYMBOL(zfs_lookup); +EXPORT_SYMBOL(zfs_create); +EXPORT_SYMBOL(zfs_tmpfile); +EXPORT_SYMBOL(zfs_remove); +EXPORT_SYMBOL(zfs_mkdir); +EXPORT_SYMBOL(zfs_rmdir); +EXPORT_SYMBOL(zfs_readdir); +EXPORT_SYMBOL(zfs_fsync); +EXPORT_SYMBOL(zfs_getattr); +EXPORT_SYMBOL(zfs_getattr_fast); +EXPORT_SYMBOL(zfs_setattr); +EXPORT_SYMBOL(zfs_rename); +EXPORT_SYMBOL(zfs_symlink); +EXPORT_SYMBOL(zfs_readlink); +EXPORT_SYMBOL(zfs_link); +EXPORT_SYMBOL(zfs_inactive); +EXPORT_SYMBOL(zfs_space); +EXPORT_SYMBOL(zfs_fid); +EXPORT_SYMBOL(zfs_getsecattr); +EXPORT_SYMBOL(zfs_setsecattr); +EXPORT_SYMBOL(zfs_getpage); +EXPORT_SYMBOL(zfs_putpage); +EXPORT_SYMBOL(zfs_dirty_inode); +EXPORT_SYMBOL(zfs_map); + +/* BEGIN CSTYLED */ +module_param(zfs_delete_blocks, ulong, 0644); +MODULE_PARM_DESC(zfs_delete_blocks, "Delete files larger than N blocks async"); +module_param(zfs_read_chunk_size, ulong, 0644); +MODULE_PARM_DESC(zfs_read_chunk_size, "Bytes to read per chunk"); +/* END CSTYLED */ + +#endif diff --git a/module/os/linux/zfs/zfs_znode.c b/module/os/linux/zfs/zfs_znode.c new file mode 100644 index 000000000..549c701a0 --- /dev/null +++ b/module/os/linux/zfs/zfs_znode.c @@ -0,0 +1,2234 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. 
+ */ + +/* Portions Copyright 2007 Jeremy Teo */ + +#ifdef _KERNEL +#include <sys/types.h> +#include <sys/param.h> +#include <sys/time.h> +#include <sys/sysmacros.h> +#include <sys/mntent.h> +#include <sys/u8_textprep.h> +#include <sys/dsl_dataset.h> +#include <sys/vfs.h> +#include <sys/vnode.h> +#include <sys/file.h> +#include <sys/kmem.h> +#include <sys/errno.h> +#include <sys/mode.h> +#include <sys/atomic.h> +#include <sys/zfs_dir.h> +#include <sys/zfs_acl.h> +#include <sys/zfs_ioctl.h> +#include <sys/zfs_rlock.h> +#include <sys/zfs_fuid.h> +#include <sys/zfs_vnops.h> +#include <sys/zfs_ctldir.h> +#include <sys/dnode.h> +#include <sys/fs/zfs.h> +#include <sys/zpl.h> +#endif /* _KERNEL */ + +#include <sys/dmu.h> +#include <sys/dmu_objset.h> +#include <sys/dmu_tx.h> +#include <sys/refcount.h> +#include <sys/stat.h> +#include <sys/zap.h> +#include <sys/zfs_znode.h> +#include <sys/sa.h> +#include <sys/zfs_sa.h> +#include <sys/zfs_stat.h> + +#include "zfs_prop.h" +#include "zfs_comutil.h" + +/* + * Functions needed for userland (ie: libzpool) are not put under + * #ifdef_KERNEL; the rest of the functions have dependencies + * (such as VFS logic) that will not compile easily in userland. + */ +#ifdef _KERNEL + +static kmem_cache_t *znode_cache = NULL; +static kmem_cache_t *znode_hold_cache = NULL; +unsigned int zfs_object_mutex_size = ZFS_OBJ_MTX_SZ; + +/* + * This is used by the test suite so that it can delay znodes from being + * freed in order to inspect the unlinked set. + */ +int zfs_unlink_suspend_progress = 0; + +/* + * This callback is invoked when acquiring a RL_WRITER or RL_APPEND lock on + * z_rangelock. It will modify the offset and length of the lock to reflect + * znode-specific information, and convert RL_APPEND to RL_WRITER. This is + * called with the rangelock_t's rl_lock held, which avoids races. + */ +static void +zfs_rangelock_cb(locked_range_t *new, void *arg) +{ + znode_t *zp = arg; + + /* + * If in append mode, convert to writer and lock starting at the + * current end of file. + */ + if (new->lr_type == RL_APPEND) { + new->lr_offset = zp->z_size; + new->lr_type = RL_WRITER; + } + + /* + * If we need to grow the block size then lock the whole file range. 
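+	 * In that case the lock is widened to cover the entire file
+	 * (offset 0 through UINT64_MAX), so no other range lock can be
+	 * held while the block size changes.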
+ */ + uint64_t end_size = MAX(zp->z_size, new->lr_offset + new->lr_length); + if (end_size > zp->z_blksz && (!ISP2(zp->z_blksz) || + zp->z_blksz < ZTOZSB(zp)->z_max_blksz)) { + new->lr_offset = 0; + new->lr_length = UINT64_MAX; + } +} + +/*ARGSUSED*/ +static int +zfs_znode_cache_constructor(void *buf, void *arg, int kmflags) +{ + znode_t *zp = buf; + + inode_init_once(ZTOI(zp)); + list_link_init(&zp->z_link_node); + + mutex_init(&zp->z_lock, NULL, MUTEX_DEFAULT, NULL); + rw_init(&zp->z_parent_lock, NULL, RW_DEFAULT, NULL); + rw_init(&zp->z_name_lock, NULL, RW_NOLOCKDEP, NULL); + mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL); + rw_init(&zp->z_xattr_lock, NULL, RW_DEFAULT, NULL); + + rangelock_init(&zp->z_rangelock, zfs_rangelock_cb, zp); + + zp->z_dirlocks = NULL; + zp->z_acl_cached = NULL; + zp->z_xattr_cached = NULL; + zp->z_xattr_parent = 0; + zp->z_moved = B_FALSE; + return (0); +} + +/*ARGSUSED*/ +static void +zfs_znode_cache_destructor(void *buf, void *arg) +{ + znode_t *zp = buf; + + ASSERT(!list_link_active(&zp->z_link_node)); + mutex_destroy(&zp->z_lock); + rw_destroy(&zp->z_parent_lock); + rw_destroy(&zp->z_name_lock); + mutex_destroy(&zp->z_acl_lock); + rw_destroy(&zp->z_xattr_lock); + rangelock_fini(&zp->z_rangelock); + + ASSERT(zp->z_dirlocks == NULL); + ASSERT(zp->z_acl_cached == NULL); + ASSERT(zp->z_xattr_cached == NULL); +} + +static int +zfs_znode_hold_cache_constructor(void *buf, void *arg, int kmflags) +{ + znode_hold_t *zh = buf; + + mutex_init(&zh->zh_lock, NULL, MUTEX_DEFAULT, NULL); + zfs_refcount_create(&zh->zh_refcount); + zh->zh_obj = ZFS_NO_OBJECT; + + return (0); +} + +static void +zfs_znode_hold_cache_destructor(void *buf, void *arg) +{ + znode_hold_t *zh = buf; + + mutex_destroy(&zh->zh_lock); + zfs_refcount_destroy(&zh->zh_refcount); +} + +void +zfs_znode_init(void) +{ + /* + * Initialize zcache. The KMC_SLAB hint is used in order that it be + * backed by kmalloc() when on the Linux slab in order that any + * wait_on_bit() operations on the related inode operate properly. + */ + ASSERT(znode_cache == NULL); + znode_cache = kmem_cache_create("zfs_znode_cache", + sizeof (znode_t), 0, zfs_znode_cache_constructor, + zfs_znode_cache_destructor, NULL, NULL, NULL, KMC_SLAB); + + ASSERT(znode_hold_cache == NULL); + znode_hold_cache = kmem_cache_create("zfs_znode_hold_cache", + sizeof (znode_hold_t), 0, zfs_znode_hold_cache_constructor, + zfs_znode_hold_cache_destructor, NULL, NULL, NULL, 0); +} + +void +zfs_znode_fini(void) +{ + /* + * Cleanup zcache + */ + if (znode_cache) + kmem_cache_destroy(znode_cache); + znode_cache = NULL; + + if (znode_hold_cache) + kmem_cache_destroy(znode_hold_cache); + znode_hold_cache = NULL; +} + +/* + * The zfs_znode_hold_enter() / zfs_znode_hold_exit() functions are used to + * serialize access to a znode and its SA buffer while the object is being + * created or destroyed. This kind of locking would normally reside in the + * znode itself but in this case that's impossible because the znode and SA + * buffer may not yet exist. Therefore the locking is handled externally + * with an array of mutexs and AVLs trees which contain per-object locks. + * + * In zfs_znode_hold_enter() a per-object lock is created as needed, inserted + * in to the correct AVL tree and finally the per-object lock is held. In + * zfs_znode_hold_exit() the process is reversed. The per-object lock is + * released, removed from the AVL tree and destroyed if there are no waiters. 
+ * + * This scheme has two important properties: + * + * 1) No memory allocations are performed while holding one of the z_hold_locks. + * This ensures evict(), which can be called from direct memory reclaim, will + * never block waiting on a z_hold_locks which just happens to have hashed + * to the same index. + * + * 2) All locks used to serialize access to an object are per-object and never + * shared. This minimizes lock contention without creating a large number + * of dedicated locks. + * + * On the downside it does require znode_lock_t structures to be frequently + * allocated and freed. However, because these are backed by a kmem cache + * and very short lived this cost is minimal. + */ +int +zfs_znode_hold_compare(const void *a, const void *b) +{ + const znode_hold_t *zh_a = (const znode_hold_t *)a; + const znode_hold_t *zh_b = (const znode_hold_t *)b; + + return (AVL_CMP(zh_a->zh_obj, zh_b->zh_obj)); +} + +boolean_t +zfs_znode_held(zfsvfs_t *zfsvfs, uint64_t obj) +{ + znode_hold_t *zh, search; + int i = ZFS_OBJ_HASH(zfsvfs, obj); + boolean_t held; + + search.zh_obj = obj; + + mutex_enter(&zfsvfs->z_hold_locks[i]); + zh = avl_find(&zfsvfs->z_hold_trees[i], &search, NULL); + held = (zh && MUTEX_HELD(&zh->zh_lock)) ? B_TRUE : B_FALSE; + mutex_exit(&zfsvfs->z_hold_locks[i]); + + return (held); +} + +static znode_hold_t * +zfs_znode_hold_enter(zfsvfs_t *zfsvfs, uint64_t obj) +{ + znode_hold_t *zh, *zh_new, search; + int i = ZFS_OBJ_HASH(zfsvfs, obj); + boolean_t found = B_FALSE; + + zh_new = kmem_cache_alloc(znode_hold_cache, KM_SLEEP); + zh_new->zh_obj = obj; + search.zh_obj = obj; + + mutex_enter(&zfsvfs->z_hold_locks[i]); + zh = avl_find(&zfsvfs->z_hold_trees[i], &search, NULL); + if (likely(zh == NULL)) { + zh = zh_new; + avl_add(&zfsvfs->z_hold_trees[i], zh); + } else { + ASSERT3U(zh->zh_obj, ==, obj); + found = B_TRUE; + } + zfs_refcount_add(&zh->zh_refcount, NULL); + mutex_exit(&zfsvfs->z_hold_locks[i]); + + if (found == B_TRUE) + kmem_cache_free(znode_hold_cache, zh_new); + + ASSERT(MUTEX_NOT_HELD(&zh->zh_lock)); + ASSERT3S(zfs_refcount_count(&zh->zh_refcount), >, 0); + mutex_enter(&zh->zh_lock); + + return (zh); +} + +static void +zfs_znode_hold_exit(zfsvfs_t *zfsvfs, znode_hold_t *zh) +{ + int i = ZFS_OBJ_HASH(zfsvfs, zh->zh_obj); + boolean_t remove = B_FALSE; + + ASSERT(zfs_znode_held(zfsvfs, zh->zh_obj)); + ASSERT3S(zfs_refcount_count(&zh->zh_refcount), >, 0); + mutex_exit(&zh->zh_lock); + + mutex_enter(&zfsvfs->z_hold_locks[i]); + if (zfs_refcount_remove(&zh->zh_refcount, NULL) == 0) { + avl_remove(&zfsvfs->z_hold_trees[i], zh); + remove = B_TRUE; + } + mutex_exit(&zfsvfs->z_hold_locks[i]); + + if (remove == B_TRUE) + kmem_cache_free(znode_hold_cache, zh); +} + +static void +zfs_znode_sa_init(zfsvfs_t *zfsvfs, znode_t *zp, + dmu_buf_t *db, dmu_object_type_t obj_type, sa_handle_t *sa_hdl) +{ + ASSERT(zfs_znode_held(zfsvfs, zp->z_id)); + + mutex_enter(&zp->z_lock); + + ASSERT(zp->z_sa_hdl == NULL); + ASSERT(zp->z_acl_cached == NULL); + if (sa_hdl == NULL) { + VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, zp, + SA_HDL_SHARED, &zp->z_sa_hdl)); + } else { + zp->z_sa_hdl = sa_hdl; + sa_set_userp(sa_hdl, zp); + } + + zp->z_is_sa = (obj_type == DMU_OT_SA) ? 
B_TRUE : B_FALSE; + + mutex_exit(&zp->z_lock); +} + +void +zfs_znode_dmu_fini(znode_t *zp) +{ + ASSERT(zfs_znode_held(ZTOZSB(zp), zp->z_id) || zp->z_unlinked || + RW_WRITE_HELD(&ZTOZSB(zp)->z_teardown_inactive_lock)); + + sa_handle_destroy(zp->z_sa_hdl); + zp->z_sa_hdl = NULL; +} + +/* + * Called by new_inode() to allocate a new inode. + */ +int +zfs_inode_alloc(struct super_block *sb, struct inode **ip) +{ + znode_t *zp; + + zp = kmem_cache_alloc(znode_cache, KM_SLEEP); + *ip = ZTOI(zp); + + return (0); +} + +/* + * Called in multiple places when an inode should be destroyed. + */ +void +zfs_inode_destroy(struct inode *ip) +{ + znode_t *zp = ITOZ(ip); + zfsvfs_t *zfsvfs = ZTOZSB(zp); + + mutex_enter(&zfsvfs->z_znodes_lock); + if (list_link_active(&zp->z_link_node)) { + list_remove(&zfsvfs->z_all_znodes, zp); + zfsvfs->z_nr_znodes--; + } + mutex_exit(&zfsvfs->z_znodes_lock); + + if (zp->z_acl_cached) { + zfs_acl_free(zp->z_acl_cached); + zp->z_acl_cached = NULL; + } + + if (zp->z_xattr_cached) { + nvlist_free(zp->z_xattr_cached); + zp->z_xattr_cached = NULL; + } + + kmem_cache_free(znode_cache, zp); +} + +static void +zfs_inode_set_ops(zfsvfs_t *zfsvfs, struct inode *ip) +{ + uint64_t rdev = 0; + + switch (ip->i_mode & S_IFMT) { + case S_IFREG: + ip->i_op = &zpl_inode_operations; + ip->i_fop = &zpl_file_operations; + ip->i_mapping->a_ops = &zpl_address_space_operations; + break; + + case S_IFDIR: + ip->i_op = &zpl_dir_inode_operations; + ip->i_fop = &zpl_dir_file_operations; + ITOZ(ip)->z_zn_prefetch = B_TRUE; + break; + + case S_IFLNK: + ip->i_op = &zpl_symlink_inode_operations; + break; + + /* + * rdev is only stored in a SA only for device files. + */ + case S_IFCHR: + case S_IFBLK: + (void) sa_lookup(ITOZ(ip)->z_sa_hdl, SA_ZPL_RDEV(zfsvfs), &rdev, + sizeof (rdev)); + /*FALLTHROUGH*/ + case S_IFIFO: + case S_IFSOCK: + init_special_inode(ip, ip->i_mode, rdev); + ip->i_op = &zpl_special_inode_operations; + break; + + default: + zfs_panic_recover("inode %llu has invalid mode: 0x%x\n", + (u_longlong_t)ip->i_ino, ip->i_mode); + + /* Assume the inode is a file and attempt to continue */ + ip->i_mode = S_IFREG | 0644; + ip->i_op = &zpl_inode_operations; + ip->i_fop = &zpl_file_operations; + ip->i_mapping->a_ops = &zpl_address_space_operations; + break; + } +} + +void +zfs_set_inode_flags(znode_t *zp, struct inode *ip) +{ + /* + * Linux and Solaris have different sets of file attributes, so we + * restrict this conversion to the intersection of the two. + */ +#ifdef HAVE_INODE_SET_FLAGS + unsigned int flags = 0; + if (zp->z_pflags & ZFS_IMMUTABLE) + flags |= S_IMMUTABLE; + if (zp->z_pflags & ZFS_APPENDONLY) + flags |= S_APPEND; + + inode_set_flags(ip, flags, S_IMMUTABLE|S_APPEND); +#else + if (zp->z_pflags & ZFS_IMMUTABLE) + ip->i_flags |= S_IMMUTABLE; + else + ip->i_flags &= ~S_IMMUTABLE; + + if (zp->z_pflags & ZFS_APPENDONLY) + ip->i_flags |= S_APPEND; + else + ip->i_flags &= ~S_APPEND; +#endif +} + +/* + * Update the embedded inode given the znode. We should work toward + * eliminating this function as soon as possible by removing values + * which are duplicated between the znode and inode. If the generic + * inode has the correct field it should be used, and the ZFS code + * updated to access the inode. This can be done incrementally. + */ +void +zfs_inode_update(znode_t *zp) +{ + zfsvfs_t *zfsvfs; + struct inode *ip; + uint32_t blksize; + u_longlong_t i_blocks; + + ASSERT(zp != NULL); + zfsvfs = ZTOZSB(zp); + ip = ZTOI(zp); + + /* Skip .zfs control nodes which do not exist on disk. 
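+	 * Those are purely in-memory inodes; the dmu_object_size_from_db()
+	 * call below needs a backing DMU object, which they do not have.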
*/ + if (zfsctl_is_node(ip)) + return; + + dmu_object_size_from_db(sa_get_db(zp->z_sa_hdl), &blksize, &i_blocks); + + spin_lock(&ip->i_lock); + ip->i_blocks = i_blocks; + i_size_write(ip, zp->z_size); + spin_unlock(&ip->i_lock); +} + + +/* + * Construct a znode+inode and initialize. + * + * This does not do a call to dmu_set_user() that is + * up to the caller to do, in case you don't want to + * return the znode + */ +static znode_t * +zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz, + dmu_object_type_t obj_type, sa_handle_t *hdl) +{ + znode_t *zp; + struct inode *ip; + uint64_t mode; + uint64_t parent; + uint64_t tmp_gen; + uint64_t links; + uint64_t z_uid, z_gid; + uint64_t atime[2], mtime[2], ctime[2]; + uint64_t projid = ZFS_DEFAULT_PROJID; + sa_bulk_attr_t bulk[11]; + int count = 0; + + ASSERT(zfsvfs != NULL); + + ip = new_inode(zfsvfs->z_sb); + if (ip == NULL) + return (NULL); + + zp = ITOZ(ip); + ASSERT(zp->z_dirlocks == NULL); + ASSERT3P(zp->z_acl_cached, ==, NULL); + ASSERT3P(zp->z_xattr_cached, ==, NULL); + zp->z_unlinked = B_FALSE; + zp->z_atime_dirty = B_FALSE; + zp->z_moved = B_FALSE; + zp->z_is_mapped = B_FALSE; + zp->z_is_ctldir = B_FALSE; + zp->z_is_stale = B_FALSE; + zp->z_suspended = B_FALSE; + zp->z_sa_hdl = NULL; + zp->z_mapcnt = 0; + zp->z_id = db->db_object; + zp->z_blksz = blksz; + zp->z_seq = 0x7A4653; + zp->z_sync_cnt = 0; + + zfs_znode_sa_init(zfsvfs, zp, db, obj_type, hdl); + + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL, &tmp_gen, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, + &zp->z_size, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, &links, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, + &zp->z_pflags, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL, + &parent, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, &z_uid, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, &z_gid, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); + + if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count) != 0 || tmp_gen == 0 || + (dmu_objset_projectquota_enabled(zfsvfs->z_os) && + (zp->z_pflags & ZFS_PROJID) && + sa_lookup(zp->z_sa_hdl, SA_ZPL_PROJID(zfsvfs), &projid, 8) != 0)) { + if (hdl == NULL) + sa_handle_destroy(zp->z_sa_hdl); + zp->z_sa_hdl = NULL; + goto error; + } + + zp->z_projid = projid; + zp->z_mode = ip->i_mode = mode; + ip->i_generation = (uint32_t)tmp_gen; + ip->i_blkbits = SPA_MINBLOCKSHIFT; + set_nlink(ip, (uint32_t)links); + zfs_uid_write(ip, z_uid); + zfs_gid_write(ip, z_gid); + zfs_set_inode_flags(zp, ip); + + /* Cache the xattr parent id */ + if (zp->z_pflags & ZFS_XATTR) + zp->z_xattr_parent = parent; + + ZFS_TIME_DECODE(&ip->i_atime, atime); + ZFS_TIME_DECODE(&ip->i_mtime, mtime); + ZFS_TIME_DECODE(&ip->i_ctime, ctime); + + ip->i_ino = zp->z_id; + zfs_inode_update(zp); + zfs_inode_set_ops(zfsvfs, ip); + + /* + * The only way insert_inode_locked() can fail is if the ip->i_ino + * number is already hashed for this super block. This can never + * happen because the inode numbers map 1:1 with the object numbers. + * + * The one exception is rolling back a mounted file system, but in + * this case all the active inode are unhashed during the rollback. 
+ */ + VERIFY3S(insert_inode_locked(ip), ==, 0); + + mutex_enter(&zfsvfs->z_znodes_lock); + list_insert_tail(&zfsvfs->z_all_znodes, zp); + zfsvfs->z_nr_znodes++; + membar_producer(); + mutex_exit(&zfsvfs->z_znodes_lock); + + unlock_new_inode(ip); + return (zp); + +error: + iput(ip); + return (NULL); +} + +/* + * Safely mark an inode dirty. Inodes which are part of a read-only + * file system or snapshot may not be dirtied. + */ +void +zfs_mark_inode_dirty(struct inode *ip) +{ + zfsvfs_t *zfsvfs = ITOZSB(ip); + + if (zfs_is_readonly(zfsvfs) || dmu_objset_is_snapshot(zfsvfs->z_os)) + return; + + mark_inode_dirty(ip); +} + +static uint64_t empty_xattr; +static uint64_t pad[4]; +static zfs_acl_phys_t acl_phys; +/* + * Create a new DMU object to hold a zfs znode. + * + * IN: dzp - parent directory for new znode + * vap - file attributes for new znode + * tx - dmu transaction id for zap operations + * cr - credentials of caller + * flag - flags: + * IS_ROOT_NODE - new object will be root + * IS_TMPFILE - new object is of O_TMPFILE + * IS_XATTR - new object is an attribute + * acl_ids - ACL related attributes + * + * OUT: zpp - allocated znode (set to dzp if IS_ROOT_NODE) + * + */ +void +zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr, + uint_t flag, znode_t **zpp, zfs_acl_ids_t *acl_ids) +{ + uint64_t crtime[2], atime[2], mtime[2], ctime[2]; + uint64_t mode, size, links, parent, pflags; + uint64_t projid = ZFS_DEFAULT_PROJID; + uint64_t rdev = 0; + zfsvfs_t *zfsvfs = ZTOZSB(dzp); + dmu_buf_t *db; + inode_timespec_t now; + uint64_t gen, obj; + int bonuslen; + int dnodesize; + sa_handle_t *sa_hdl; + dmu_object_type_t obj_type; + sa_bulk_attr_t *sa_attrs; + int cnt = 0; + zfs_acl_locator_cb_t locate = { 0 }; + znode_hold_t *zh; + + if (zfsvfs->z_replay) { + obj = vap->va_nodeid; + now = vap->va_ctime; /* see zfs_replay_create() */ + gen = vap->va_nblocks; /* ditto */ + dnodesize = vap->va_fsid; /* ditto */ + } else { + obj = 0; + gethrestime(&now); + gen = dmu_tx_get_txg(tx); + dnodesize = dmu_objset_dnodesize(zfsvfs->z_os); + } + + if (dnodesize == 0) + dnodesize = DNODE_MIN_SIZE; + + obj_type = zfsvfs->z_use_sa ? DMU_OT_SA : DMU_OT_ZNODE; + + bonuslen = (obj_type == DMU_OT_SA) ? + DN_BONUS_SIZE(dnodesize) : ZFS_OLD_ZNODE_PHYS_SIZE; + + /* + * Create a new DMU object. + */ + /* + * There's currently no mechanism for pre-reading the blocks that will + * be needed to allocate a new object, so we accept the small chance + * that there will be an i/o error and we will fail one of the + * assertions below. + */ + if (S_ISDIR(vap->va_mode)) { + if (zfsvfs->z_replay) { + VERIFY0(zap_create_claim_norm_dnsize(zfsvfs->z_os, obj, + zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS, + obj_type, bonuslen, dnodesize, tx)); + } else { + obj = zap_create_norm_dnsize(zfsvfs->z_os, + zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS, + obj_type, bonuslen, dnodesize, tx); + } + } else { + if (zfsvfs->z_replay) { + VERIFY0(dmu_object_claim_dnsize(zfsvfs->z_os, obj, + DMU_OT_PLAIN_FILE_CONTENTS, 0, + obj_type, bonuslen, dnodesize, tx)); + } else { + obj = dmu_object_alloc_dnsize(zfsvfs->z_os, + DMU_OT_PLAIN_FILE_CONTENTS, 0, + obj_type, bonuslen, dnodesize, tx); + } + } + + zh = zfs_znode_hold_enter(zfsvfs, obj); + VERIFY0(sa_buf_hold(zfsvfs->z_os, obj, NULL, &db)); + + /* + * If this is the root, fix up the half-initialized parent pointer + * to reference the just-allocated physical data area. + */ + if (flag & IS_ROOT_NODE) { + dzp->z_id = obj; + } + + /* + * If parent is an xattr, so am I. 
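+	 * Propagating ZFS_XATTR keeps the attribute namespace
+	 * self-contained, which the namespace checks in zfs_link() and
+	 * zfs_rename() depend on.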
+ */ + if (dzp->z_pflags & ZFS_XATTR) { + flag |= IS_XATTR; + } + + if (zfsvfs->z_use_fuids) + pflags = ZFS_ARCHIVE | ZFS_AV_MODIFIED; + else + pflags = 0; + + if (S_ISDIR(vap->va_mode)) { + size = 2; /* contents ("." and "..") */ + links = 2; + } else { + size = 0; + links = (flag & IS_TMPFILE) ? 0 : 1; + } + + if (S_ISBLK(vap->va_mode) || S_ISCHR(vap->va_mode)) + rdev = vap->va_rdev; + + parent = dzp->z_id; + mode = acl_ids->z_mode; + if (flag & IS_XATTR) + pflags |= ZFS_XATTR; + + if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode)) { + /* + * With ZFS_PROJID flag, we can easily know whether there is + * project ID stored on disk or not. See zfs_space_delta_cb(). + */ + if (obj_type != DMU_OT_ZNODE && + dmu_objset_projectquota_enabled(zfsvfs->z_os)) + pflags |= ZFS_PROJID; + + /* + * Inherit project ID from parent if required. + */ + projid = zfs_inherit_projid(dzp); + if (dzp->z_pflags & ZFS_PROJINHERIT) + pflags |= ZFS_PROJINHERIT; + } + + /* + * No execs denied will be determined when zfs_mode_compute() is called. + */ + pflags |= acl_ids->z_aclp->z_hints & + (ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE|ZFS_ACL_AUTO_INHERIT| + ZFS_ACL_DEFAULTED|ZFS_ACL_PROTECTED); + + ZFS_TIME_ENCODE(&now, crtime); + ZFS_TIME_ENCODE(&now, ctime); + + if (vap->va_mask & ATTR_ATIME) { + ZFS_TIME_ENCODE(&vap->va_atime, atime); + } else { + ZFS_TIME_ENCODE(&now, atime); + } + + if (vap->va_mask & ATTR_MTIME) { + ZFS_TIME_ENCODE(&vap->va_mtime, mtime); + } else { + ZFS_TIME_ENCODE(&now, mtime); + } + + /* Now add in all of the "SA" attributes */ + VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, NULL, SA_HDL_SHARED, + &sa_hdl)); + + /* + * Setup the array of attributes to be replaced/set on the new file + * + * order for DMU_OT_ZNODE is critical since it needs to be constructed + * in the old znode_phys_t format. 
Don't change this ordering + */ + sa_attrs = kmem_alloc(sizeof (sa_bulk_attr_t) * ZPL_END, KM_SLEEP); + + if (obj_type == DMU_OT_ZNODE) { + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs), + NULL, &atime, 16); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs), + NULL, &mtime, 16); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs), + NULL, &ctime, 16); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs), + NULL, &crtime, 16); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs), + NULL, &gen, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs), + NULL, &mode, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs), + NULL, &size, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs), + NULL, &parent, 8); + } else { + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs), + NULL, &mode, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs), + NULL, &size, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs), + NULL, &gen, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs), + NULL, &acl_ids->z_fuid, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs), + NULL, &acl_ids->z_fgid, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs), + NULL, &parent, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs), + NULL, &pflags, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs), + NULL, &atime, 16); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs), + NULL, &mtime, 16); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs), + NULL, &ctime, 16); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs), + NULL, &crtime, 16); + } + + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_LINKS(zfsvfs), NULL, &links, 8); + + if (obj_type == DMU_OT_ZNODE) { + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_XATTR(zfsvfs), NULL, + &empty_xattr, 8); + } else if (dmu_objset_projectquota_enabled(zfsvfs->z_os) && + pflags & ZFS_PROJID) { + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PROJID(zfsvfs), + NULL, &projid, 8); + } + if (obj_type == DMU_OT_ZNODE || + (S_ISBLK(vap->va_mode) || S_ISCHR(vap->va_mode))) { + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_RDEV(zfsvfs), + NULL, &rdev, 8); + } + if (obj_type == DMU_OT_ZNODE) { + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs), + NULL, &pflags, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs), NULL, + &acl_ids->z_fuid, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs), NULL, + &acl_ids->z_fgid, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PAD(zfsvfs), NULL, pad, + sizeof (uint64_t) * 4); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ZNODE_ACL(zfsvfs), NULL, + &acl_phys, sizeof (zfs_acl_phys_t)); + } else if (acl_ids->z_aclp->z_version >= ZFS_ACL_VERSION_FUID) { + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_COUNT(zfsvfs), NULL, + &acl_ids->z_aclp->z_acl_count, 8); + locate.cb_aclp = acl_ids->z_aclp; + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_ACES(zfsvfs), + zfs_acl_data_locator, &locate, + acl_ids->z_aclp->z_acl_bytes); + mode = zfs_mode_compute(mode, acl_ids->z_aclp, &pflags, + acl_ids->z_fuid, acl_ids->z_fgid); + } + + VERIFY(sa_replace_all_by_template(sa_hdl, sa_attrs, cnt, tx) == 0); + + if (!(flag & IS_ROOT_NODE)) { + /* + * The call to zfs_znode_alloc() may fail if memory is low + * via the call path: alloc_inode() -> inode_init_always() -> + * security_inode_alloc() -> inode_alloc_security(). Since + * the existing code is written such that zfs_mknode() can + * not fail retry until sufficient memory has been reclaimed. 
+ */ + do { + *zpp = zfs_znode_alloc(zfsvfs, db, 0, obj_type, sa_hdl); + } while (*zpp == NULL); + + VERIFY(*zpp != NULL); + VERIFY(dzp != NULL); + } else { + /* + * If we are creating the root node, the "parent" we + * passed in is the znode for the root. + */ + *zpp = dzp; + + (*zpp)->z_sa_hdl = sa_hdl; + } + + (*zpp)->z_pflags = pflags; + (*zpp)->z_mode = ZTOI(*zpp)->i_mode = mode; + (*zpp)->z_dnodesize = dnodesize; + (*zpp)->z_projid = projid; + + if (obj_type == DMU_OT_ZNODE || + acl_ids->z_aclp->z_version < ZFS_ACL_VERSION_FUID) { + VERIFY0(zfs_aclset_common(*zpp, acl_ids->z_aclp, cr, tx)); + } + kmem_free(sa_attrs, sizeof (sa_bulk_attr_t) * ZPL_END); + zfs_znode_hold_exit(zfsvfs, zh); +} + +/* + * Update in-core attributes. It is assumed the caller will be doing an + * sa_bulk_update to push the changes out. + */ +void +zfs_xvattr_set(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx) +{ + xoptattr_t *xoap; + boolean_t update_inode = B_FALSE; + + xoap = xva_getxoptattr(xvap); + ASSERT(xoap); + + if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) { + uint64_t times[2]; + ZFS_TIME_ENCODE(&xoap->xoa_createtime, times); + (void) sa_update(zp->z_sa_hdl, SA_ZPL_CRTIME(ZTOZSB(zp)), + ×, sizeof (times), tx); + XVA_SET_RTN(xvap, XAT_CREATETIME); + } + if (XVA_ISSET_REQ(xvap, XAT_READONLY)) { + ZFS_ATTR_SET(zp, ZFS_READONLY, xoap->xoa_readonly, + zp->z_pflags, tx); + XVA_SET_RTN(xvap, XAT_READONLY); + } + if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) { + ZFS_ATTR_SET(zp, ZFS_HIDDEN, xoap->xoa_hidden, + zp->z_pflags, tx); + XVA_SET_RTN(xvap, XAT_HIDDEN); + } + if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) { + ZFS_ATTR_SET(zp, ZFS_SYSTEM, xoap->xoa_system, + zp->z_pflags, tx); + XVA_SET_RTN(xvap, XAT_SYSTEM); + } + if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) { + ZFS_ATTR_SET(zp, ZFS_ARCHIVE, xoap->xoa_archive, + zp->z_pflags, tx); + XVA_SET_RTN(xvap, XAT_ARCHIVE); + } + if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { + ZFS_ATTR_SET(zp, ZFS_IMMUTABLE, xoap->xoa_immutable, + zp->z_pflags, tx); + XVA_SET_RTN(xvap, XAT_IMMUTABLE); + + update_inode = B_TRUE; + } + if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { + ZFS_ATTR_SET(zp, ZFS_NOUNLINK, xoap->xoa_nounlink, + zp->z_pflags, tx); + XVA_SET_RTN(xvap, XAT_NOUNLINK); + } + if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { + ZFS_ATTR_SET(zp, ZFS_APPENDONLY, xoap->xoa_appendonly, + zp->z_pflags, tx); + XVA_SET_RTN(xvap, XAT_APPENDONLY); + + update_inode = B_TRUE; + } + if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { + ZFS_ATTR_SET(zp, ZFS_NODUMP, xoap->xoa_nodump, + zp->z_pflags, tx); + XVA_SET_RTN(xvap, XAT_NODUMP); + } + if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) { + ZFS_ATTR_SET(zp, ZFS_OPAQUE, xoap->xoa_opaque, + zp->z_pflags, tx); + XVA_SET_RTN(xvap, XAT_OPAQUE); + } + if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { + ZFS_ATTR_SET(zp, ZFS_AV_QUARANTINED, + xoap->xoa_av_quarantined, zp->z_pflags, tx); + XVA_SET_RTN(xvap, XAT_AV_QUARANTINED); + } + if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { + ZFS_ATTR_SET(zp, ZFS_AV_MODIFIED, xoap->xoa_av_modified, + zp->z_pflags, tx); + XVA_SET_RTN(xvap, XAT_AV_MODIFIED); + } + if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) { + zfs_sa_set_scanstamp(zp, xvap, tx); + XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP); + } + if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { + ZFS_ATTR_SET(zp, ZFS_REPARSE, xoap->xoa_reparse, + zp->z_pflags, tx); + XVA_SET_RTN(xvap, XAT_REPARSE); + } + if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) { + ZFS_ATTR_SET(zp, ZFS_OFFLINE, xoap->xoa_offline, + zp->z_pflags, tx); + XVA_SET_RTN(xvap, XAT_OFFLINE); + } + if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) { + ZFS_ATTR_SET(zp, ZFS_SPARSE, xoap->xoa_sparse, 
+ zp->z_pflags, tx); + XVA_SET_RTN(xvap, XAT_SPARSE); + } + if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) { + ZFS_ATTR_SET(zp, ZFS_PROJINHERIT, xoap->xoa_projinherit, + zp->z_pflags, tx); + XVA_SET_RTN(xvap, XAT_PROJINHERIT); + } + + if (update_inode) + zfs_set_inode_flags(zp, ZTOI(zp)); +} + +int +zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp) +{ + dmu_object_info_t doi; + dmu_buf_t *db; + znode_t *zp; + znode_hold_t *zh; + int err; + sa_handle_t *hdl; + + *zpp = NULL; + +again: + zh = zfs_znode_hold_enter(zfsvfs, obj_num); + + err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db); + if (err) { + zfs_znode_hold_exit(zfsvfs, zh); + return (err); + } + + dmu_object_info_from_db(db, &doi); + if (doi.doi_bonus_type != DMU_OT_SA && + (doi.doi_bonus_type != DMU_OT_ZNODE || + (doi.doi_bonus_type == DMU_OT_ZNODE && + doi.doi_bonus_size < sizeof (znode_phys_t)))) { + sa_buf_rele(db, NULL); + zfs_znode_hold_exit(zfsvfs, zh); + return (SET_ERROR(EINVAL)); + } + + hdl = dmu_buf_get_user(db); + if (hdl != NULL) { + zp = sa_get_userdata(hdl); + + + /* + * Since "SA" does immediate eviction we + * should never find a sa handle that doesn't + * know about the znode. + */ + + ASSERT3P(zp, !=, NULL); + + mutex_enter(&zp->z_lock); + ASSERT3U(zp->z_id, ==, obj_num); + /* + * If igrab() returns NULL the VFS has independently + * determined the inode should be evicted and has + * called iput_final() to start the eviction process. + * The SA handle is still valid but because the VFS + * requires that the eviction succeed we must drop + * our locks and references to allow the eviction to + * complete. The zfs_zget() may then be retried. + * + * This unlikely case could be optimized by registering + * a sops->drop_inode() callback. The callback would + * need to detect the active SA hold thereby informing + * the VFS that this inode should not be evicted. + */ + if (igrab(ZTOI(zp)) == NULL) { + mutex_exit(&zp->z_lock); + sa_buf_rele(db, NULL); + zfs_znode_hold_exit(zfsvfs, zh); + /* inode might need this to finish evict */ + cond_resched(); + goto again; + } + *zpp = zp; + err = 0; + mutex_exit(&zp->z_lock); + sa_buf_rele(db, NULL); + zfs_znode_hold_exit(zfsvfs, zh); + return (err); + } + + /* + * Not found create new znode/vnode but only if file exists. + * + * There is a small window where zfs_vget() could + * find this object while a file create is still in + * progress. This is checked for in zfs_znode_alloc() + * + * if zfs_znode_alloc() fails it will drop the hold on the + * bonus buffer. + */ + zp = zfs_znode_alloc(zfsvfs, db, doi.doi_data_block_size, + doi.doi_bonus_type, NULL); + if (zp == NULL) { + err = SET_ERROR(ENOENT); + } else { + *zpp = zp; + } + zfs_znode_hold_exit(zfsvfs, zh); + return (err); +} + +int +zfs_rezget(znode_t *zp) +{ + zfsvfs_t *zfsvfs = ZTOZSB(zp); + dmu_object_info_t doi; + dmu_buf_t *db; + uint64_t obj_num = zp->z_id; + uint64_t mode; + uint64_t links; + sa_bulk_attr_t bulk[10]; + int err; + int count = 0; + uint64_t gen; + uint64_t z_uid, z_gid; + uint64_t atime[2], mtime[2], ctime[2]; + uint64_t projid = ZFS_DEFAULT_PROJID; + znode_hold_t *zh; + + /* + * skip ctldir, otherwise they will always get invalidated. This will + * cause funny behaviour for the mounted snapdirs. Especially for + * Linux >= 3.18, d_invalidate will detach the mountpoint and prevent + * anyone automount it again as long as someone is still using the + * detached mount. 
+ */ + if (zp->z_is_ctldir) + return (0); + + zh = zfs_znode_hold_enter(zfsvfs, obj_num); + + mutex_enter(&zp->z_acl_lock); + if (zp->z_acl_cached) { + zfs_acl_free(zp->z_acl_cached); + zp->z_acl_cached = NULL; + } + mutex_exit(&zp->z_acl_lock); + + rw_enter(&zp->z_xattr_lock, RW_WRITER); + if (zp->z_xattr_cached) { + nvlist_free(zp->z_xattr_cached); + zp->z_xattr_cached = NULL; + } + rw_exit(&zp->z_xattr_lock); + + ASSERT(zp->z_sa_hdl == NULL); + err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db); + if (err) { + zfs_znode_hold_exit(zfsvfs, zh); + return (err); + } + + dmu_object_info_from_db(db, &doi); + if (doi.doi_bonus_type != DMU_OT_SA && + (doi.doi_bonus_type != DMU_OT_ZNODE || + (doi.doi_bonus_type == DMU_OT_ZNODE && + doi.doi_bonus_size < sizeof (znode_phys_t)))) { + sa_buf_rele(db, NULL); + zfs_znode_hold_exit(zfsvfs, zh); + return (SET_ERROR(EINVAL)); + } + + zfs_znode_sa_init(zfsvfs, zp, db, doi.doi_bonus_type, NULL); + + /* reload cached values */ + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL, + &gen, sizeof (gen)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, + &zp->z_size, sizeof (zp->z_size)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, + &links, sizeof (links)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, + &zp->z_pflags, sizeof (zp->z_pflags)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, + &z_uid, sizeof (z_uid)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, + &z_gid, sizeof (z_gid)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, + &mode, sizeof (mode)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, + &atime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, + &mtime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, + &ctime, 16); + + if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) { + zfs_znode_dmu_fini(zp); + zfs_znode_hold_exit(zfsvfs, zh); + return (SET_ERROR(EIO)); + } + + if (dmu_objset_projectquota_enabled(zfsvfs->z_os)) { + err = sa_lookup(zp->z_sa_hdl, SA_ZPL_PROJID(zfsvfs), + &projid, 8); + if (err != 0 && err != ENOENT) { + zfs_znode_dmu_fini(zp); + zfs_znode_hold_exit(zfsvfs, zh); + return (SET_ERROR(err)); + } + } + + zp->z_projid = projid; + zp->z_mode = ZTOI(zp)->i_mode = mode; + zfs_uid_write(ZTOI(zp), z_uid); + zfs_gid_write(ZTOI(zp), z_gid); + + ZFS_TIME_DECODE(&ZTOI(zp)->i_atime, atime); + ZFS_TIME_DECODE(&ZTOI(zp)->i_mtime, mtime); + ZFS_TIME_DECODE(&ZTOI(zp)->i_ctime, ctime); + + if ((uint32_t)gen != ZTOI(zp)->i_generation) { + zfs_znode_dmu_fini(zp); + zfs_znode_hold_exit(zfsvfs, zh); + return (SET_ERROR(EIO)); + } + + set_nlink(ZTOI(zp), (uint32_t)links); + zfs_set_inode_flags(zp, ZTOI(zp)); + + zp->z_blksz = doi.doi_data_block_size; + zp->z_atime_dirty = B_FALSE; + zfs_inode_update(zp); + + /* + * If the file has zero links, then it has been unlinked on the send + * side and it must be in the received unlinked set. + * We call zfs_znode_dmu_fini() now to prevent any accesses to the + * stale data and to prevent automatic removal of the file in + * zfs_zinactive(). The file will be removed either when it is removed + * on the send side and the next incremental stream is received or + * when the unlinked set gets processed. 
+ */ + zp->z_unlinked = (ZTOI(zp)->i_nlink == 0); + if (zp->z_unlinked) + zfs_znode_dmu_fini(zp); + + zfs_znode_hold_exit(zfsvfs, zh); + + return (0); +} + +void +zfs_znode_delete(znode_t *zp, dmu_tx_t *tx) +{ + zfsvfs_t *zfsvfs = ZTOZSB(zp); + objset_t *os = zfsvfs->z_os; + uint64_t obj = zp->z_id; + uint64_t acl_obj = zfs_external_acl(zp); + znode_hold_t *zh; + + zh = zfs_znode_hold_enter(zfsvfs, obj); + if (acl_obj) { + VERIFY(!zp->z_is_sa); + VERIFY(0 == dmu_object_free(os, acl_obj, tx)); + } + VERIFY(0 == dmu_object_free(os, obj, tx)); + zfs_znode_dmu_fini(zp); + zfs_znode_hold_exit(zfsvfs, zh); +} + +void +zfs_zinactive(znode_t *zp) +{ + zfsvfs_t *zfsvfs = ZTOZSB(zp); + uint64_t z_id = zp->z_id; + znode_hold_t *zh; + + ASSERT(zp->z_sa_hdl); + + /* + * Don't allow a zfs_zget() while were trying to release this znode. + */ + zh = zfs_znode_hold_enter(zfsvfs, z_id); + + mutex_enter(&zp->z_lock); + + /* + * If this was the last reference to a file with no links, remove + * the file from the file system unless the file system is mounted + * read-only. That can happen, for example, if the file system was + * originally read-write, the file was opened, then unlinked and + * the file system was made read-only before the file was finally + * closed. The file will remain in the unlinked set. + */ + if (zp->z_unlinked) { + ASSERT(!zfsvfs->z_issnap); + if (!zfs_is_readonly(zfsvfs) && !zfs_unlink_suspend_progress) { + mutex_exit(&zp->z_lock); + zfs_znode_hold_exit(zfsvfs, zh); + zfs_rmnode(zp); + return; + } + } + + mutex_exit(&zp->z_lock); + zfs_znode_dmu_fini(zp); + + zfs_znode_hold_exit(zfsvfs, zh); +} + +#if defined(HAVE_INODE_TIMESPEC64_TIMES) +#define zfs_compare_timespec timespec64_compare +#else +#define zfs_compare_timespec timespec_compare +#endif + +/* + * Determine whether the znode's atime must be updated. The logic mostly + * duplicates the Linux kernel's relatime_need_update() functionality. + * This function is only called if the underlying filesystem actually has + * atime updates enabled. + */ +boolean_t +zfs_relatime_need_update(const struct inode *ip) +{ + inode_timespec_t now; + + gethrestime(&now); + /* + * In relatime mode, only update the atime if the previous atime + * is earlier than either the ctime or mtime or if at least a day + * has passed since the last update of atime. + */ + if (zfs_compare_timespec(&ip->i_mtime, &ip->i_atime) >= 0) + return (B_TRUE); + + if (zfs_compare_timespec(&ip->i_ctime, &ip->i_atime) >= 0) + return (B_TRUE); + + if ((hrtime_t)now.tv_sec - (hrtime_t)ip->i_atime.tv_sec >= 24*60*60) + return (B_TRUE); + + return (B_FALSE); +} + +/* + * Prepare to update znode time stamps. + * + * IN: zp - znode requiring timestamp update + * flag - ATTR_MTIME, ATTR_CTIME flags + * + * OUT: zp - z_seq + * mtime - new mtime + * ctime - new ctime + * + * Note: We don't update atime here, because we rely on Linux VFS to do + * atime updating. + */ +void +zfs_tstamp_update_setup(znode_t *zp, uint_t flag, uint64_t mtime[2], + uint64_t ctime[2]) +{ + inode_timespec_t now; + + gethrestime(&now); + + zp->z_seq++; + + if (flag & ATTR_MTIME) { + ZFS_TIME_ENCODE(&now, mtime); + ZFS_TIME_DECODE(&(ZTOI(zp)->i_mtime), mtime); + if (ZTOZSB(zp)->z_use_fuids) { + zp->z_pflags |= (ZFS_ARCHIVE | + ZFS_AV_MODIFIED); + } + } + + if (flag & ATTR_CTIME) { + ZFS_TIME_ENCODE(&now, ctime); + ZFS_TIME_DECODE(&(ZTOI(zp)->i_ctime), ctime); + if (ZTOZSB(zp)->z_use_fuids) + zp->z_pflags |= ZFS_ARCHIVE; + } +} + +/* + * Grow the block size for a file. 
+ * + * IN: zp - znode of file to free data in. + * size - requested block size + * tx - open transaction. + * + * NOTE: this function assumes that the znode is write locked. + */ +void +zfs_grow_blocksize(znode_t *zp, uint64_t size, dmu_tx_t *tx) +{ + int error; + u_longlong_t dummy; + + if (size <= zp->z_blksz) + return; + /* + * If the file size is already greater than the current blocksize, + * we will not grow. If there is more than one block in a file, + * the blocksize cannot change. + */ + if (zp->z_blksz && zp->z_size > zp->z_blksz) + return; + + error = dmu_object_set_blocksize(ZTOZSB(zp)->z_os, zp->z_id, + size, 0, tx); + + if (error == ENOTSUP) + return; + ASSERT0(error); + + /* What blocksize did we actually get? */ + dmu_object_size_from_db(sa_get_db(zp->z_sa_hdl), &zp->z_blksz, &dummy); +} + +/* + * Increase the file length + * + * IN: zp - znode of file to free data in. + * end - new end-of-file + * + * RETURN: 0 on success, error code on failure + */ +static int +zfs_extend(znode_t *zp, uint64_t end) +{ + zfsvfs_t *zfsvfs = ZTOZSB(zp); + dmu_tx_t *tx; + locked_range_t *lr; + uint64_t newblksz; + int error; + + /* + * We will change zp_size, lock the whole file. + */ + lr = rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER); + + /* + * Nothing to do if file already at desired length. + */ + if (end <= zp->z_size) { + rangelock_exit(lr); + return (0); + } + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + zfs_sa_upgrade_txholds(tx, zp); + if (end > zp->z_blksz && + (!ISP2(zp->z_blksz) || zp->z_blksz < zfsvfs->z_max_blksz)) { + /* + * We are growing the file past the current block size. + */ + if (zp->z_blksz > ZTOZSB(zp)->z_max_blksz) { + /* + * File's blocksize is already larger than the + * "recordsize" property. Only let it grow to + * the next power of 2. + */ + ASSERT(!ISP2(zp->z_blksz)); + newblksz = MIN(end, 1 << highbit64(zp->z_blksz)); + } else { + newblksz = MIN(end, ZTOZSB(zp)->z_max_blksz); + } + dmu_tx_hold_write(tx, zp->z_id, 0, newblksz); + } else { + newblksz = 0; + } + + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + rangelock_exit(lr); + return (error); + } + + if (newblksz) + zfs_grow_blocksize(zp, newblksz, tx); + + zp->z_size = end; + + VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(ZTOZSB(zp)), + &zp->z_size, sizeof (zp->z_size), tx)); + + rangelock_exit(lr); + + dmu_tx_commit(tx); + + return (0); +} + +/* + * zfs_zero_partial_page - Modeled after update_pages() but + * with different arguments and semantics for use by zfs_freesp(). + * + * Zeroes a piece of a single page cache entry for zp at offset + * start and length len. + * + * Caller must acquire a range lock on the file for the region + * being zeroed in order that the ARC and page cache stay in sync. + */ +static void +zfs_zero_partial_page(znode_t *zp, uint64_t start, uint64_t len) +{ + struct address_space *mp = ZTOI(zp)->i_mapping; + struct page *pp; + int64_t off; + void *pb; + + ASSERT((start & PAGE_MASK) == ((start + len - 1) & PAGE_MASK)); + + off = start & (PAGE_SIZE - 1); + start &= PAGE_MASK; + + pp = find_lock_page(mp, start >> PAGE_SHIFT); + if (pp) { + if (mapping_writably_mapped(mp)) + flush_dcache_page(pp); + + pb = kmap(pp); + bzero(pb + off, len); + kunmap(pp); + + if (mapping_writably_mapped(mp)) + flush_dcache_page(pp); + + mark_page_accessed(pp); + SetPageUptodate(pp); + ClearPageError(pp); + unlock_page(pp); + put_page(pp); + } +} + +/* + * Free space in a file. 
+ * + * IN: zp - znode of file to free data in. + * off - start of section to free. + * len - length of section to free. + * + * RETURN: 0 on success, error code on failure + */ +static int +zfs_free_range(znode_t *zp, uint64_t off, uint64_t len) +{ + zfsvfs_t *zfsvfs = ZTOZSB(zp); + locked_range_t *lr; + int error; + + /* + * Lock the range being freed. + */ + lr = rangelock_enter(&zp->z_rangelock, off, len, RL_WRITER); + + /* + * Nothing to do if file already at desired length. + */ + if (off >= zp->z_size) { + rangelock_exit(lr); + return (0); + } + + if (off + len > zp->z_size) + len = zp->z_size - off; + + error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, off, len); + + /* + * Zero partial page cache entries. This must be done under a + * range lock in order to keep the ARC and page cache in sync. + */ + if (zp->z_is_mapped) { + loff_t first_page, last_page, page_len; + loff_t first_page_offset, last_page_offset; + + /* first possible full page in hole */ + first_page = (off + PAGE_SIZE - 1) >> PAGE_SHIFT; + /* last page of hole */ + last_page = (off + len) >> PAGE_SHIFT; + + /* offset of first_page */ + first_page_offset = first_page << PAGE_SHIFT; + /* offset of last_page */ + last_page_offset = last_page << PAGE_SHIFT; + + /* truncate whole pages */ + if (last_page_offset > first_page_offset) { + truncate_inode_pages_range(ZTOI(zp)->i_mapping, + first_page_offset, last_page_offset - 1); + } + + /* truncate sub-page ranges */ + if (first_page > last_page) { + /* entire punched area within a single page */ + zfs_zero_partial_page(zp, off, len); + } else { + /* beginning of punched area at the end of a page */ + page_len = first_page_offset - off; + if (page_len > 0) + zfs_zero_partial_page(zp, off, page_len); + + /* end of punched area at the beginning of a page */ + page_len = off + len - last_page_offset; + if (page_len > 0) + zfs_zero_partial_page(zp, last_page_offset, + page_len); + } + } + rangelock_exit(lr); + + return (error); +} + +/* + * Truncate a file + * + * IN: zp - znode of file to free data in. + * end - new end-of-file. + * + * RETURN: 0 on success, error code on failure + */ +static int +zfs_trunc(znode_t *zp, uint64_t end) +{ + zfsvfs_t *zfsvfs = ZTOZSB(zp); + dmu_tx_t *tx; + locked_range_t *lr; + int error; + sa_bulk_attr_t bulk[2]; + int count = 0; + + /* + * We will change zp_size, lock the whole file. + */ + lr = rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER); + + /* + * Nothing to do if file already at desired length. + */ + if (end >= zp->z_size) { + rangelock_exit(lr); + return (0); + } + + error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, end, + DMU_OBJECT_END); + if (error) { + rangelock_exit(lr); + return (error); + } + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + zfs_sa_upgrade_txholds(tx, zp); + dmu_tx_mark_netfree(tx); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + rangelock_exit(lr); + return (error); + } + + zp->z_size = end; + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), + NULL, &zp->z_size, sizeof (zp->z_size)); + + if (end == 0) { + zp->z_pflags &= ~ZFS_SPARSE; + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), + NULL, &zp->z_pflags, 8); + } + VERIFY(sa_bulk_update(zp->z_sa_hdl, bulk, count, tx) == 0); + + dmu_tx_commit(tx); + rangelock_exit(lr); + + return (0); +} + +/* + * Free space in a file + * + * IN: zp - znode of file to free data in. + * off - start of range + * len - end of range (0 => EOF) + * flag - current file open mode flags. 
+ * log - TRUE if this action should be logged + * + * RETURN: 0 on success, error code on failure + */ +int +zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log) +{ + dmu_tx_t *tx; + zfsvfs_t *zfsvfs = ZTOZSB(zp); + zilog_t *zilog = zfsvfs->z_log; + uint64_t mode; + uint64_t mtime[2], ctime[2]; + sa_bulk_attr_t bulk[3]; + int count = 0; + int error; + + if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), &mode, + sizeof (mode))) != 0) + return (error); + + if (off > zp->z_size) { + error = zfs_extend(zp, off+len); + if (error == 0 && log) + goto log; + goto out; + } + + if (len == 0) { + error = zfs_trunc(zp, off); + } else { + if ((error = zfs_free_range(zp, off, len)) == 0 && + off + len > zp->z_size) + error = zfs_extend(zp, off+len); + } + if (error || !log) + goto out; +log: + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + zfs_sa_upgrade_txholds(tx, zp); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + goto out; + } + + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, ctime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), + NULL, &zp->z_pflags, 8); + zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime); + error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); + ASSERT(error == 0); + + zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len); + + dmu_tx_commit(tx); + + zfs_inode_update(zp); + error = 0; + +out: + /* + * Truncate the page cache - for file truncate operations, use + * the purpose-built API for truncations. For punching operations, + * the truncation is handled under a range lock in zfs_free_range. + */ + if (len == 0) + truncate_setsize(ZTOI(zp), off); + return (error); +} + +void +zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx) +{ + struct super_block *sb; + zfsvfs_t *zfsvfs; + uint64_t moid, obj, sa_obj, version; + uint64_t sense = ZFS_CASE_SENSITIVE; + uint64_t norm = 0; + nvpair_t *elem; + int size; + int error; + int i; + znode_t *rootzp = NULL; + vattr_t vattr; + znode_t *zp; + zfs_acl_ids_t acl_ids; + + /* + * First attempt to create master node. + */ + /* + * In an empty objset, there are no blocks to read and thus + * there can be no i/o errors (which we assert below). + */ + moid = MASTER_NODE_OBJ; + error = zap_create_claim(os, moid, DMU_OT_MASTER_NODE, + DMU_OT_NONE, 0, tx); + ASSERT(error == 0); + + /* + * Set starting attributes. 
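+ *
+ * (Editorial example; the property names are assumed from the
+ * zfs_prop_to_name() comparisons below and are not part of the original
+ * comment. zplprops is an nvlist of uint64_t ZPL properties such as:
+ *
+ *	"version"         -> ZPL version to create the filesystem with
+ *	"normalization"   -> unicode normalization flags (norm)
+ *	"casesensitivity" -> ZFS_CASE_SENSITIVE / _INSENSITIVE / _MIXED
+ *
+ * Everything other than the version property is also written into the
+ * master node with zap_update().)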
+ */ + version = zfs_zpl_version_map(spa_version(dmu_objset_spa(os))); + elem = NULL; + while ((elem = nvlist_next_nvpair(zplprops, elem)) != NULL) { + /* For the moment we expect all zpl props to be uint64_ts */ + uint64_t val; + char *name; + + ASSERT(nvpair_type(elem) == DATA_TYPE_UINT64); + VERIFY(nvpair_value_uint64(elem, &val) == 0); + name = nvpair_name(elem); + if (strcmp(name, zfs_prop_to_name(ZFS_PROP_VERSION)) == 0) { + if (val < version) + version = val; + } else { + error = zap_update(os, moid, name, 8, 1, &val, tx); + } + ASSERT(error == 0); + if (strcmp(name, zfs_prop_to_name(ZFS_PROP_NORMALIZE)) == 0) + norm = val; + else if (strcmp(name, zfs_prop_to_name(ZFS_PROP_CASE)) == 0) + sense = val; + } + ASSERT(version != 0); + error = zap_update(os, moid, ZPL_VERSION_STR, 8, 1, &version, tx); + + /* + * Create zap object used for SA attribute registration + */ + + if (version >= ZPL_VERSION_SA) { + sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE, + DMU_OT_NONE, 0, tx); + error = zap_add(os, moid, ZFS_SA_ATTRS, 8, 1, &sa_obj, tx); + ASSERT(error == 0); + } else { + sa_obj = 0; + } + /* + * Create a delete queue. + */ + obj = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx); + + error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &obj, tx); + ASSERT(error == 0); + + /* + * Create root znode. Create minimal znode/inode/zfsvfs/sb + * to allow zfs_mknode to work. + */ + vattr.va_mask = ATTR_MODE|ATTR_UID|ATTR_GID; + vattr.va_mode = S_IFDIR|0755; + vattr.va_uid = crgetuid(cr); + vattr.va_gid = crgetgid(cr); + + rootzp = kmem_cache_alloc(znode_cache, KM_SLEEP); + rootzp->z_unlinked = B_FALSE; + rootzp->z_atime_dirty = B_FALSE; + rootzp->z_moved = B_FALSE; + rootzp->z_is_sa = USE_SA(version, os); + rootzp->z_pflags = 0; + + zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP); + zfsvfs->z_os = os; + zfsvfs->z_parent = zfsvfs; + zfsvfs->z_version = version; + zfsvfs->z_use_fuids = USE_FUIDS(version, os); + zfsvfs->z_use_sa = USE_SA(version, os); + zfsvfs->z_norm = norm; + + sb = kmem_zalloc(sizeof (struct super_block), KM_SLEEP); + sb->s_fs_info = zfsvfs; + + ZTOI(rootzp)->i_sb = sb; + + error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END, + &zfsvfs->z_attr_table); + + ASSERT(error == 0); + + /* + * Fold case on file systems that are always or sometimes case + * insensitive. 
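+ *
+ * (Illustrative note, not from the original comment: for these settings
+ * the U8_TEXTPREP_TOUPPER flag is OR'd into z_norm below, so name lookups
+ * fold e.g. "Makefile" and "MAKEFILE" to the same normalized key.)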
+ */ + if (sense == ZFS_CASE_INSENSITIVE || sense == ZFS_CASE_MIXED) + zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER; + + mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL); + list_create(&zfsvfs->z_all_znodes, sizeof (znode_t), + offsetof(znode_t, z_link_node)); + + size = MIN(1 << (highbit64(zfs_object_mutex_size)-1), ZFS_OBJ_MTX_MAX); + zfsvfs->z_hold_size = size; + zfsvfs->z_hold_trees = vmem_zalloc(sizeof (avl_tree_t) * size, + KM_SLEEP); + zfsvfs->z_hold_locks = vmem_zalloc(sizeof (kmutex_t) * size, KM_SLEEP); + for (i = 0; i != size; i++) { + avl_create(&zfsvfs->z_hold_trees[i], zfs_znode_hold_compare, + sizeof (znode_hold_t), offsetof(znode_hold_t, zh_node)); + mutex_init(&zfsvfs->z_hold_locks[i], NULL, MUTEX_DEFAULT, NULL); + } + + VERIFY(0 == zfs_acl_ids_create(rootzp, IS_ROOT_NODE, &vattr, + cr, NULL, &acl_ids)); + zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, &acl_ids); + ASSERT3P(zp, ==, rootzp); + error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &rootzp->z_id, tx); + ASSERT(error == 0); + zfs_acl_ids_free(&acl_ids); + + atomic_set(&ZTOI(rootzp)->i_count, 0); + sa_handle_destroy(rootzp->z_sa_hdl); + kmem_cache_free(znode_cache, rootzp); + + for (i = 0; i != size; i++) { + avl_destroy(&zfsvfs->z_hold_trees[i]); + mutex_destroy(&zfsvfs->z_hold_locks[i]); + } + + mutex_destroy(&zfsvfs->z_znodes_lock); + + vmem_free(zfsvfs->z_hold_trees, sizeof (avl_tree_t) * size); + vmem_free(zfsvfs->z_hold_locks, sizeof (kmutex_t) * size); + kmem_free(sb, sizeof (struct super_block)); + kmem_free(zfsvfs, sizeof (zfsvfs_t)); +} +#endif /* _KERNEL */ + +static int +zfs_sa_setup(objset_t *osp, sa_attr_type_t **sa_table) +{ + uint64_t sa_obj = 0; + int error; + + error = zap_lookup(osp, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, &sa_obj); + if (error != 0 && error != ENOENT) + return (error); + + error = sa_setup(osp, sa_obj, zfs_attr_table, ZPL_END, sa_table); + return (error); +} + +static int +zfs_grab_sa_handle(objset_t *osp, uint64_t obj, sa_handle_t **hdlp, + dmu_buf_t **db, void *tag) +{ + dmu_object_info_t doi; + int error; + + if ((error = sa_buf_hold(osp, obj, tag, db)) != 0) + return (error); + + dmu_object_info_from_db(*db, &doi); + if ((doi.doi_bonus_type != DMU_OT_SA && + doi.doi_bonus_type != DMU_OT_ZNODE) || + (doi.doi_bonus_type == DMU_OT_ZNODE && + doi.doi_bonus_size < sizeof (znode_phys_t))) { + sa_buf_rele(*db, tag); + return (SET_ERROR(ENOTSUP)); + } + + error = sa_handle_get(osp, obj, NULL, SA_HDL_PRIVATE, hdlp); + if (error != 0) { + sa_buf_rele(*db, tag); + return (error); + } + + return (0); +} + +void +zfs_release_sa_handle(sa_handle_t *hdl, dmu_buf_t *db, void *tag) +{ + sa_handle_destroy(hdl); + sa_buf_rele(db, tag); +} + +/* + * Given an object number, return its parent object number and whether + * or not the object is an extended attribute directory. 
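+ *
+ * (Editorial sketch based on the caller below: zfs_obj_to_path_impl()
+ * invokes this helper repeatedly to walk an object's parent chain,
+ *
+ *	obj -> pobj -> ... -> root	(the root's parent is itself)
+ *
+ * prepending one path component per step and stopping once pobj == obj.)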
+ */ +static int +zfs_obj_to_pobj(objset_t *osp, sa_handle_t *hdl, sa_attr_type_t *sa_table, + uint64_t *pobjp, int *is_xattrdir) +{ + uint64_t parent; + uint64_t pflags; + uint64_t mode; + uint64_t parent_mode; + sa_bulk_attr_t bulk[3]; + sa_handle_t *sa_hdl; + dmu_buf_t *sa_db; + int count = 0; + int error; + + SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_PARENT], NULL, + &parent, sizeof (parent)); + SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_FLAGS], NULL, + &pflags, sizeof (pflags)); + SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL, + &mode, sizeof (mode)); + + if ((error = sa_bulk_lookup(hdl, bulk, count)) != 0) + return (error); + + /* + * When a link is removed its parent pointer is not changed and will + * be invalid. There are two cases where a link is removed but the + * file stays around, when it goes to the delete queue and when there + * are additional links. + */ + error = zfs_grab_sa_handle(osp, parent, &sa_hdl, &sa_db, FTAG); + if (error != 0) + return (error); + + error = sa_lookup(sa_hdl, ZPL_MODE, &parent_mode, sizeof (parent_mode)); + zfs_release_sa_handle(sa_hdl, sa_db, FTAG); + if (error != 0) + return (error); + + *is_xattrdir = ((pflags & ZFS_XATTR) != 0) && S_ISDIR(mode); + + /* + * Extended attributes can be applied to files, directories, etc. + * Otherwise the parent must be a directory. + */ + if (!*is_xattrdir && !S_ISDIR(parent_mode)) + return (SET_ERROR(EINVAL)); + + *pobjp = parent; + + return (0); +} + +/* + * Given an object number, return some zpl level statistics + */ +static int +zfs_obj_to_stats_impl(sa_handle_t *hdl, sa_attr_type_t *sa_table, + zfs_stat_t *sb) +{ + sa_bulk_attr_t bulk[4]; + int count = 0; + + SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL, + &sb->zs_mode, sizeof (sb->zs_mode)); + SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_GEN], NULL, + &sb->zs_gen, sizeof (sb->zs_gen)); + SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_LINKS], NULL, + &sb->zs_links, sizeof (sb->zs_links)); + SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_CTIME], NULL, + &sb->zs_ctime, sizeof (sb->zs_ctime)); + + return (sa_bulk_lookup(hdl, bulk, count)); +} + +static int +zfs_obj_to_path_impl(objset_t *osp, uint64_t obj, sa_handle_t *hdl, + sa_attr_type_t *sa_table, char *buf, int len) +{ + sa_handle_t *sa_hdl; + sa_handle_t *prevhdl = NULL; + dmu_buf_t *prevdb = NULL; + dmu_buf_t *sa_db = NULL; + char *path = buf + len - 1; + int error; + + *path = '\0'; + sa_hdl = hdl; + + uint64_t deleteq_obj; + VERIFY0(zap_lookup(osp, MASTER_NODE_OBJ, + ZFS_UNLINKED_SET, sizeof (uint64_t), 1, &deleteq_obj)); + error = zap_lookup_int(osp, deleteq_obj, obj); + if (error == 0) { + return (ESTALE); + } else if (error != ENOENT) { + return (error); + } + error = 0; + + for (;;) { + uint64_t pobj = 0; + char component[MAXNAMELEN + 2]; + size_t complen; + int is_xattrdir = 0; + + if (prevdb) + zfs_release_sa_handle(prevhdl, prevdb, FTAG); + + if ((error = zfs_obj_to_pobj(osp, sa_hdl, sa_table, &pobj, + &is_xattrdir)) != 0) + break; + + if (pobj == obj) { + if (path[0] != '/') + *--path = '/'; + break; + } + + component[0] = '/'; + if (is_xattrdir) { + (void) sprintf(component + 1, "<xattrdir>"); + } else { + error = zap_value_search(osp, pobj, obj, + ZFS_DIRENT_OBJ(-1ULL), component + 1); + if (error != 0) + break; + } + + complen = strlen(component); + path -= complen; + ASSERT(path >= buf); + bcopy(component, path, complen); + obj = pobj; + + if (sa_hdl != hdl) { + prevhdl = sa_hdl; + prevdb = sa_db; + } + error = zfs_grab_sa_handle(osp, obj, &sa_hdl, &sa_db, FTAG); + if (error != 
0) { + sa_hdl = prevhdl; + sa_db = prevdb; + break; + } + } + + if (sa_hdl != NULL && sa_hdl != hdl) { + ASSERT(sa_db != NULL); + zfs_release_sa_handle(sa_hdl, sa_db, FTAG); + } + + if (error == 0) + (void) memmove(buf, path, buf + len - path); + + return (error); +} + +int +zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len) +{ + sa_attr_type_t *sa_table; + sa_handle_t *hdl; + dmu_buf_t *db; + int error; + + error = zfs_sa_setup(osp, &sa_table); + if (error != 0) + return (error); + + error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG); + if (error != 0) + return (error); + + error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len); + + zfs_release_sa_handle(hdl, db, FTAG); + return (error); +} + +int +zfs_obj_to_stats(objset_t *osp, uint64_t obj, zfs_stat_t *sb, + char *buf, int len) +{ + char *path = buf + len - 1; + sa_attr_type_t *sa_table; + sa_handle_t *hdl; + dmu_buf_t *db; + int error; + + *path = '\0'; + + error = zfs_sa_setup(osp, &sa_table); + if (error != 0) + return (error); + + error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG); + if (error != 0) + return (error); + + error = zfs_obj_to_stats_impl(hdl, sa_table, sb); + if (error != 0) { + zfs_release_sa_handle(hdl, db, FTAG); + return (error); + } + + error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len); + + zfs_release_sa_handle(hdl, db, FTAG); + return (error); +} + +#if defined(_KERNEL) +EXPORT_SYMBOL(zfs_create_fs); +EXPORT_SYMBOL(zfs_obj_to_path); + +/* CSTYLED */ +module_param(zfs_object_mutex_size, uint, 0644); +MODULE_PARM_DESC(zfs_object_mutex_size, "Size of znode hold array"); +module_param(zfs_unlink_suspend_progress, int, 0644); +MODULE_PARM_DESC(zfs_unlink_suspend_progress, "Set to prevent async unlinks " +"(debug - leaks space into the unlinked set)"); +#endif diff --git a/module/os/linux/zfs/zio_crypt.c b/module/os/linux/zfs/zio_crypt.c new file mode 100644 index 000000000..486622c8a --- /dev/null +++ b/module/os/linux/zfs/zio_crypt.c @@ -0,0 +1,2036 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2017, Datto, Inc. All rights reserved. + */ + +#include <sys/zio_crypt.h> +#include <sys/dmu.h> +#include <sys/dmu_objset.h> +#include <sys/dnode.h> +#include <sys/fs/zfs.h> +#include <sys/zio.h> +#include <sys/zil.h> +#include <sys/sha2.h> +#include <sys/hkdf.h> +#include <sys/qat.h> + +/* + * This file is responsible for handling all of the details of generating + * encryption parameters and performing encryption and authentication. + * + * BLOCK ENCRYPTION PARAMETERS: + * Encryption /Authentication Algorithm Suite (crypt): + * The encryption algorithm, mode, and key length we are going to use. We + * currently support AES in either GCM or CCM modes with 128, 192, and 256 bit + * keys. All authentication is currently done with SHA512-HMAC. + * + * Plaintext: + * The unencrypted data that we want to encrypt. + * + * Initialization Vector (IV): + * An initialization vector for the encryption algorithms. 
This is used to + * "tweak" the encryption algorithms so that two blocks of the same data are + * encrypted into different ciphertext outputs, thus obfuscating block patterns. + * The supported encryption modes (AES-GCM and AES-CCM) require that an IV is + * never reused with the same encryption key. This value is stored unencrypted + * and must simply be provided to the decryption function. We use a 96 bit IV + * (as recommended by NIST) for all block encryption. For non-dedup blocks we + * derive the IV randomly. The first 64 bits of the IV are stored in the second + * word of DVA[2] and the remaining 32 bits are stored in the upper 32 bits of + * blk_fill. This is safe because encrypted blocks can't use the upper 32 bits + * of blk_fill. We only encrypt level 0 blocks, which normally have a fill count + * of 1. The only exception is for DMU_OT_DNODE objects, where the fill count of + * level 0 blocks is the number of allocated dnodes in that block. The on-disk + * format supports at most 2^15 slots per L0 dnode block, because the maximum + * block size is 16MB (2^24). In either case, for level 0 blocks this number + * will still be smaller than UINT32_MAX so it is safe to store the IV in the + * top 32 bits of blk_fill, while leaving the bottom 32 bits of the fill count + * for the dnode code. + * + * Master key: + * This is the most important secret data of an encrypted dataset. It is used + * along with the salt to generate that actual encryption keys via HKDF. We + * do not use the master key to directly encrypt any data because there are + * theoretical limits on how much data can actually be safely encrypted with + * any encryption mode. The master key is stored encrypted on disk with the + * user's wrapping key. Its length is determined by the encryption algorithm. + * For details on how this is stored see the block comment in dsl_crypt.c + * + * Salt: + * Used as an input to the HKDF function, along with the master key. We use a + * 64 bit salt, stored unencrypted in the first word of DVA[2]. Any given salt + * can be used for encrypting many blocks, so we cache the current salt and the + * associated derived key in zio_crypt_t so we do not need to derive it again + * needlessly. + * + * Encryption Key: + * A secret binary key, generated from an HKDF function used to encrypt and + * decrypt data. + * + * Message Authentication Code (MAC) + * The MAC is an output of authenticated encryption modes such as AES-GCM and + * AES-CCM. Its purpose is to ensure that an attacker cannot modify encrypted + * data on disk and return garbage to the application. Effectively, it is a + * checksum that can not be reproduced by an attacker. We store the MAC in the + * second 128 bits of blk_cksum, leaving the first 128 bits for a truncated + * regular checksum of the ciphertext which can be used for scrubbing. + * + * OBJECT AUTHENTICATION: + * Some object types, such as DMU_OT_MASTER_NODE cannot be encrypted because + * they contain some info that always needs to be readable. To prevent this + * data from being altered, we authenticate this data using SHA512-HMAC. This + * will produce a MAC (similar to the one produced via encryption) which can + * be used to verify the object was not modified. HMACs do not require key + * rotation or IVs, so we can keep up to the full 3 copies of authenticated + * data. + * + * ZIL ENCRYPTION: + * ZIL blocks have their bp written to disk ahead of the associated data, so we + * cannot store the MAC there as we normally do. 
+ * For these blocks the MAC is
+ * stored in the embedded checksum within the zil_chain_t header. The salt and
+ * IV are generated for the block on bp allocation instead of at encryption
+ * time. In addition, ZIL blocks have some pieces that must be left in plaintext
+ * for claiming even though all of the sensitive user data still needs to be
+ * encrypted. The function zio_crypt_init_uios_zil() handles parsing which
+ * pieces of the block need to be encrypted. All data that is not encrypted is
+ * authenticated using the AAD mechanisms that the supported encryption modes
+ * provide for. In order to preserve the semantics of the ZIL for encrypted
+ * datasets, the ZIL is not protected at the objset level as described below.
+ *
+ * DNODE ENCRYPTION:
+ * Similarly to ZIL blocks, the core part of each dnode_phys_t needs to be left
+ * in plaintext for scrubbing and claiming, but the bonus buffers might contain
+ * sensitive user data. The function zio_crypt_init_uios_dnode() handles parsing
+ * which pieces of the block need to be encrypted. For more details about
+ * dnode authentication and encryption, see zio_crypt_init_uios_dnode().
+ *
+ * OBJECT SET AUTHENTICATION:
+ * Up to this point, everything we have encrypted and authenticated has been
+ * at level 0 (or -2 for the ZIL). If we did not do any further work the
+ * on-disk format would be susceptible to attacks that deleted or rearranged
+ * the order of level 0 blocks. Ideally, the cleanest solution would be to
+ * maintain a tree of authentication MACs going up the bp tree. However, this
+ * presents a problem for raw sends. Send files do not send information about
+ * indirect blocks so there would be no convenient way to transfer the MACs and
+ * they cannot be recalculated on the receive side without the master key which
+ * would defeat one of the purposes of raw sends in the first place. Instead,
+ * for the indirect levels of the bp tree, we use a regular SHA512 of the MACs
+ * from the level below. We also include some portable fields from blk_prop such
+ * as the lsize and compression algorithm to prevent the data from being
+ * misinterpreted.
+ *
+ * At the objset level, we maintain 2 separate 256 bit MACs in the
+ * objset_phys_t. The first one is "portable" and is the logical root of the
+ * MAC tree maintained in the metadnode's bps. The second is "local" and is
+ * used as the root MAC for the user accounting objects, which are also not
+ * transferred via "zfs send". The portable MAC is sent in the DRR_BEGIN payload
+ * of the send file. The useraccounting code ensures that the useraccounting
+ * info is not present upon a receive, so the local MAC can simply be cleared
+ * out at that time. For more info about objset_phys_t authentication, see
+ * zio_crypt_do_objset_hmacs().
+ *
+ * CONSIDERATIONS FOR DEDUP:
+ * In order for dedup to work, blocks that we want to dedup with one another
+ * need to use the same IV and encryption key, so that they will have the same
+ * ciphertext. Normally, one should never reuse an IV with the same encryption
+ * key or else AES-GCM and AES-CCM can both actually leak the plaintext of both
+ * blocks. In this case, however, since we are using the same plaintext as
+ * well, all that we end up with is a duplicate of the original ciphertext we
+ * already had. As a result, an attacker with read access to the raw disk will
+ * be able to tell which blocks are the same, but this information is given away
+ * by dedup anyway.
In order to get the same IVs and encryption keys for + * equivalent blocks of data we use an HMAC of the plaintext. We use an HMAC + * here so that a reproducible checksum of the plaintext is never available to + * the attacker. The HMAC key is kept alongside the master key, encrypted on + * disk. The first 64 bits of the HMAC are used in place of the random salt, and + * the next 96 bits are used as the IV. As a result of this mechanism, dedup + * will only work within a clone family since encrypted dedup requires use of + * the same master and HMAC keys. + */ + +/* + * After encrypting many blocks with the same key we may start to run up + * against the theoretical limits of how much data can securely be encrypted + * with a single key using the supported encryption modes. The most obvious + * limitation is that our risk of generating 2 equivalent 96 bit IVs increases + * the more IVs we generate (which both GCM and CCM modes strictly forbid). + * This risk actually grows surprisingly quickly over time according to the + * Birthday Problem. With a total IV space of 2^(96 bits), and assuming we have + * generated n IVs with a cryptographically secure RNG, the approximate + * probability p(n) of a collision is given as: + * + * p(n) ~= e^(-n*(n-1)/(2*(2^96))) + * + * [http://www.math.cornell.edu/~mec/2008-2009/TianyiZheng/Birthday.html] + * + * Assuming that we want to ensure that p(n) never goes over 1 / 1 trillion + * we must not write more than 398,065,730 blocks with the same encryption key. + * Therefore, we rotate our keys after 400,000,000 blocks have been written by + * generating a new random 64 bit salt for our HKDF encryption key generation + * function. + */ +#define ZFS_KEY_MAX_SALT_USES_DEFAULT 400000000 +#define ZFS_CURRENT_MAX_SALT_USES \ + (MIN(zfs_key_max_salt_uses, ZFS_KEY_MAX_SALT_USES_DEFAULT)) +unsigned long zfs_key_max_salt_uses = ZFS_KEY_MAX_SALT_USES_DEFAULT; + +typedef struct blkptr_auth_buf { + uint64_t bab_prop; /* blk_prop - portable mask */ + uint8_t bab_mac[ZIO_DATA_MAC_LEN]; /* MAC from blk_cksum */
+ uint64_t bab_pad; /* reserved for future use */ +} blkptr_auth_buf_t; + +zio_crypt_info_t zio_crypt_table[ZIO_CRYPT_FUNCTIONS] = { + {"", ZC_TYPE_NONE, 0, "inherit"}, + {"", ZC_TYPE_NONE, 0, "on"}, + {"", ZC_TYPE_NONE, 0, "off"}, + {SUN_CKM_AES_CCM, ZC_TYPE_CCM, 16, "aes-128-ccm"}, + {SUN_CKM_AES_CCM, ZC_TYPE_CCM, 24, "aes-192-ccm"}, + {SUN_CKM_AES_CCM, ZC_TYPE_CCM, 32, "aes-256-ccm"}, + {SUN_CKM_AES_GCM, ZC_TYPE_GCM, 16, "aes-128-gcm"}, + {SUN_CKM_AES_GCM, ZC_TYPE_GCM, 24, "aes-192-gcm"}, + {SUN_CKM_AES_GCM, ZC_TYPE_GCM, 32, "aes-256-gcm"} +}; + +void +zio_crypt_key_destroy(zio_crypt_key_t *key) +{ + rw_destroy(&key->zk_salt_lock); + + /* free crypto templates */ + crypto_destroy_ctx_template(key->zk_current_tmpl); + crypto_destroy_ctx_template(key->zk_hmac_tmpl); + + /* zero out sensitive data */ + bzero(key, sizeof (zio_crypt_key_t)); +} + +int +zio_crypt_key_init(uint64_t crypt, zio_crypt_key_t *key) +{ + int ret; + crypto_mechanism_t mech; + uint_t keydata_len; + + ASSERT(key != NULL); + ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS); + + keydata_len = zio_crypt_table[crypt].ci_keylen; + bzero(key, sizeof (zio_crypt_key_t)); + + /* fill keydata buffers and salt with random data */ + ret = random_get_bytes((uint8_t *)&key->zk_guid, sizeof (uint64_t)); + if (ret != 0) + goto error; + + ret = random_get_bytes(key->zk_master_keydata, keydata_len); + if (ret != 0) + goto error; + + ret = random_get_bytes(key->zk_hmac_keydata, SHA512_HMAC_KEYLEN); + if (ret != 0) + goto error; + + ret = random_get_bytes(key->zk_salt, ZIO_DATA_SALT_LEN); + if (ret != 0) + goto error; + + /* derive the current key from the master key */ + ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0, + key->zk_salt, ZIO_DATA_SALT_LEN, key->zk_current_keydata, + keydata_len); + if (ret != 0) + goto error; + + /* initialize keys for the ICP */ + key->zk_current_key.ck_format = CRYPTO_KEY_RAW; + key->zk_current_key.ck_data = key->zk_current_keydata; + key->zk_current_key.ck_length = CRYPTO_BYTES2BITS(keydata_len); + + key->zk_hmac_key.ck_format = CRYPTO_KEY_RAW; + key->zk_hmac_key.ck_data = &key->zk_hmac_key; + key->zk_hmac_key.ck_length = CRYPTO_BYTES2BITS(SHA512_HMAC_KEYLEN); + + /* + * Initialize the crypto templates. It's ok if this fails because + * this is just an optimization. 
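+ *
+ * (Editorial note, stated as an assumption about the ICP: when template
+ * creation fails the template pointer is simply left NULL below, and the
+ * later crypto calls are expected to fall back to building a fresh
+ * context per operation, which is slower but functionally equivalent.)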
+ */ + mech.cm_type = crypto_mech2id(zio_crypt_table[crypt].ci_mechname); + ret = crypto_create_ctx_template(&mech, &key->zk_current_key, + &key->zk_current_tmpl, KM_SLEEP); + if (ret != CRYPTO_SUCCESS) + key->zk_current_tmpl = NULL; + + mech.cm_type = crypto_mech2id(SUN_CKM_SHA512_HMAC); + ret = crypto_create_ctx_template(&mech, &key->zk_hmac_key, + &key->zk_hmac_tmpl, KM_SLEEP); + if (ret != CRYPTO_SUCCESS) + key->zk_hmac_tmpl = NULL; + + key->zk_crypt = crypt; + key->zk_version = ZIO_CRYPT_KEY_CURRENT_VERSION; + key->zk_salt_count = 0; + rw_init(&key->zk_salt_lock, NULL, RW_DEFAULT, NULL); + + return (0); + +error: + zio_crypt_key_destroy(key); + return (ret); +} + +static int +zio_crypt_key_change_salt(zio_crypt_key_t *key) +{ + int ret = 0; + uint8_t salt[ZIO_DATA_SALT_LEN]; + crypto_mechanism_t mech; + uint_t keydata_len = zio_crypt_table[key->zk_crypt].ci_keylen; + + /* generate a new salt */ + ret = random_get_bytes(salt, ZIO_DATA_SALT_LEN); + if (ret != 0) + goto error; + + rw_enter(&key->zk_salt_lock, RW_WRITER); + + /* someone beat us to the salt rotation, just unlock and return */ + if (key->zk_salt_count < ZFS_CURRENT_MAX_SALT_USES) + goto out_unlock; + + /* derive the current key from the master key and the new salt */ + ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0, + salt, ZIO_DATA_SALT_LEN, key->zk_current_keydata, keydata_len); + if (ret != 0) + goto out_unlock; + + /* assign the salt and reset the usage count */ + bcopy(salt, key->zk_salt, ZIO_DATA_SALT_LEN); + key->zk_salt_count = 0; + + /* destroy the old context template and create the new one */ + crypto_destroy_ctx_template(key->zk_current_tmpl); + ret = crypto_create_ctx_template(&mech, &key->zk_current_key, + &key->zk_current_tmpl, KM_SLEEP); + if (ret != CRYPTO_SUCCESS) + key->zk_current_tmpl = NULL; + + rw_exit(&key->zk_salt_lock); + + return (0); + +out_unlock: + rw_exit(&key->zk_salt_lock); +error: + return (ret); +} + +/* See comment above zfs_key_max_salt_uses definition for details */ +int +zio_crypt_key_get_salt(zio_crypt_key_t *key, uint8_t *salt) +{ + int ret; + boolean_t salt_change; + + rw_enter(&key->zk_salt_lock, RW_READER); + + bcopy(key->zk_salt, salt, ZIO_DATA_SALT_LEN); + salt_change = (atomic_inc_64_nv(&key->zk_salt_count) >= + ZFS_CURRENT_MAX_SALT_USES); + + rw_exit(&key->zk_salt_lock); + + if (salt_change) { + ret = zio_crypt_key_change_salt(key); + if (ret != 0) + goto error; + } + + return (0); + +error: + return (ret); +} + +/* + * This function handles all encryption and decryption in zfs. When + * encrypting it expects puio to reference the plaintext and cuio to + * reference the ciphertext. cuio must have enough space for the + * ciphertext + room for a MAC. datalen should be the length of the + * plaintext / ciphertext alone. 
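+ *
+ * (Editorial example of the expected layout, not part of the original
+ * comment; a caller encrypting a single buffer might set up
+ *
+ *	puio:	iov[0] = plaintext	(datalen bytes)
+ *	cuio:	iov[0] = ciphertext	(datalen bytes)
+ *		iov[1] = MAC		(e.g. ZIO_DATA_MAC_LEN bytes)
+ *
+ * The MAC is always carried as the final iovec of cuio, which is how
+ * maclen is derived below.)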
+ */ +static int +zio_do_crypt_uio(boolean_t encrypt, uint64_t crypt, crypto_key_t *key, + crypto_ctx_template_t tmpl, uint8_t *ivbuf, uint_t datalen, + uio_t *puio, uio_t *cuio, uint8_t *authbuf, uint_t auth_len) +{ + int ret; + crypto_data_t plaindata, cipherdata; + CK_AES_CCM_PARAMS ccmp; + CK_AES_GCM_PARAMS gcmp; + crypto_mechanism_t mech; + zio_crypt_info_t crypt_info; + uint_t plain_full_len, maclen; + + ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS); + ASSERT3U(key->ck_format, ==, CRYPTO_KEY_RAW); + + /* lookup the encryption info */ + crypt_info = zio_crypt_table[crypt]; + + /* the mac will always be the last iovec_t in the cipher uio */ + maclen = cuio->uio_iov[cuio->uio_iovcnt - 1].iov_len; + + ASSERT(maclen <= ZIO_DATA_MAC_LEN); + + /* setup encryption mechanism (same as crypt) */ + mech.cm_type = crypto_mech2id(crypt_info.ci_mechname); + + /* + * Strangely, the ICP requires that plain_full_len must include + * the MAC length when decrypting, even though the UIO does not + * need to have the extra space allocated. + */ + if (encrypt) { + plain_full_len = datalen; + } else { + plain_full_len = datalen + maclen; + } + + /* + * setup encryption params (currently only AES CCM and AES GCM + * are supported) + */ + if (crypt_info.ci_crypt_type == ZC_TYPE_CCM) { + ccmp.ulNonceSize = ZIO_DATA_IV_LEN; + ccmp.ulAuthDataSize = auth_len; + ccmp.authData = authbuf; + ccmp.ulMACSize = maclen; + ccmp.nonce = ivbuf; + ccmp.ulDataSize = plain_full_len; + + mech.cm_param = (char *)(&ccmp); + mech.cm_param_len = sizeof (CK_AES_CCM_PARAMS); + } else { + gcmp.ulIvLen = ZIO_DATA_IV_LEN; + gcmp.ulIvBits = CRYPTO_BYTES2BITS(ZIO_DATA_IV_LEN); + gcmp.ulAADLen = auth_len; + gcmp.pAAD = authbuf; + gcmp.ulTagBits = CRYPTO_BYTES2BITS(maclen); + gcmp.pIv = ivbuf; + + mech.cm_param = (char *)(&gcmp); + mech.cm_param_len = sizeof (CK_AES_GCM_PARAMS); + } + + /* populate the cipher and plain data structs. 
*/ + plaindata.cd_format = CRYPTO_DATA_UIO; + plaindata.cd_offset = 0; + plaindata.cd_uio = puio; + plaindata.cd_miscdata = NULL; + plaindata.cd_length = plain_full_len; + + cipherdata.cd_format = CRYPTO_DATA_UIO; + cipherdata.cd_offset = 0; + cipherdata.cd_uio = cuio; + cipherdata.cd_miscdata = NULL; + cipherdata.cd_length = datalen + maclen; + + /* perform the actual encryption */ + if (encrypt) { + ret = crypto_encrypt(&mech, &plaindata, key, tmpl, &cipherdata, + NULL); + if (ret != CRYPTO_SUCCESS) { + ret = SET_ERROR(EIO); + goto error; + } + } else { + ret = crypto_decrypt(&mech, &cipherdata, key, tmpl, &plaindata, + NULL); + if (ret != CRYPTO_SUCCESS) { + ASSERT3U(ret, ==, CRYPTO_INVALID_MAC); + ret = SET_ERROR(ECKSUM); + goto error; + } + } + + return (0); + +error: + return (ret); +} + +int +zio_crypt_key_wrap(crypto_key_t *cwkey, zio_crypt_key_t *key, uint8_t *iv, + uint8_t *mac, uint8_t *keydata_out, uint8_t *hmac_keydata_out) +{ + int ret; + uio_t puio, cuio; + uint64_t aad[3]; + iovec_t plain_iovecs[2], cipher_iovecs[3]; + uint64_t crypt = key->zk_crypt; + uint_t enc_len, keydata_len, aad_len; + + ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS); + ASSERT3U(cwkey->ck_format, ==, CRYPTO_KEY_RAW); + + keydata_len = zio_crypt_table[crypt].ci_keylen; + + /* generate iv for wrapping the master and hmac key */ + ret = random_get_pseudo_bytes(iv, WRAPPING_IV_LEN); + if (ret != 0) + goto error; + + /* initialize uio_ts */ + plain_iovecs[0].iov_base = key->zk_master_keydata; + plain_iovecs[0].iov_len = keydata_len; + plain_iovecs[1].iov_base = key->zk_hmac_keydata; + plain_iovecs[1].iov_len = SHA512_HMAC_KEYLEN; + + cipher_iovecs[0].iov_base = keydata_out; + cipher_iovecs[0].iov_len = keydata_len; + cipher_iovecs[1].iov_base = hmac_keydata_out; + cipher_iovecs[1].iov_len = SHA512_HMAC_KEYLEN; + cipher_iovecs[2].iov_base = mac; + cipher_iovecs[2].iov_len = WRAPPING_MAC_LEN; + + /* + * Although we don't support writing to the old format, we do + * support rewrapping the key so that the user can move and + * quarantine datasets on the old format. 
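+ *
+ * (Illustration only, mirroring the code below: the additional
+ * authenticated data (AAD) for the wrapping operation is laid out as
+ *
+ *	version 0:	aad[0] = LE_64(zk_guid)
+ *	current:	aad[0] = LE_64(zk_guid)
+ *			aad[1] = LE_64(crypt)
+ *			aad[2] = LE_64(zk_version)
+ *
+ * so the crypt suite and key version of a wrapped key cannot be changed
+ * without invalidating the wrapping MAC.)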
+ */ + if (key->zk_version == 0) { + aad_len = sizeof (uint64_t); + aad[0] = LE_64(key->zk_guid); + } else { + ASSERT3U(key->zk_version, ==, ZIO_CRYPT_KEY_CURRENT_VERSION); + aad_len = sizeof (uint64_t) * 3; + aad[0] = LE_64(key->zk_guid); + aad[1] = LE_64(crypt); + aad[2] = LE_64(key->zk_version); + } + + enc_len = zio_crypt_table[crypt].ci_keylen + SHA512_HMAC_KEYLEN; + puio.uio_iov = plain_iovecs; + puio.uio_iovcnt = 2; + puio.uio_segflg = UIO_SYSSPACE; + cuio.uio_iov = cipher_iovecs; + cuio.uio_iovcnt = 3; + cuio.uio_segflg = UIO_SYSSPACE; + + /* encrypt the keys and store the resulting ciphertext and mac */ + ret = zio_do_crypt_uio(B_TRUE, crypt, cwkey, NULL, iv, enc_len, + &puio, &cuio, (uint8_t *)aad, aad_len); + if (ret != 0) + goto error; + + return (0); + +error: + return (ret); +} + +int +zio_crypt_key_unwrap(crypto_key_t *cwkey, uint64_t crypt, uint64_t version, + uint64_t guid, uint8_t *keydata, uint8_t *hmac_keydata, uint8_t *iv, + uint8_t *mac, zio_crypt_key_t *key) +{ + int ret; + crypto_mechanism_t mech; + uio_t puio, cuio; + uint64_t aad[3]; + iovec_t plain_iovecs[2], cipher_iovecs[3]; + uint_t enc_len, keydata_len, aad_len; + + ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS); + ASSERT3U(cwkey->ck_format, ==, CRYPTO_KEY_RAW); + + rw_init(&key->zk_salt_lock, NULL, RW_DEFAULT, NULL); + + keydata_len = zio_crypt_table[crypt].ci_keylen; + + /* initialize uio_ts */ + plain_iovecs[0].iov_base = key->zk_master_keydata; + plain_iovecs[0].iov_len = keydata_len; + plain_iovecs[1].iov_base = key->zk_hmac_keydata; + plain_iovecs[1].iov_len = SHA512_HMAC_KEYLEN; + + cipher_iovecs[0].iov_base = keydata; + cipher_iovecs[0].iov_len = keydata_len; + cipher_iovecs[1].iov_base = hmac_keydata; + cipher_iovecs[1].iov_len = SHA512_HMAC_KEYLEN; + cipher_iovecs[2].iov_base = mac; + cipher_iovecs[2].iov_len = WRAPPING_MAC_LEN; + + if (version == 0) { + aad_len = sizeof (uint64_t); + aad[0] = LE_64(guid); + } else { + ASSERT3U(version, ==, ZIO_CRYPT_KEY_CURRENT_VERSION); + aad_len = sizeof (uint64_t) * 3; + aad[0] = LE_64(guid); + aad[1] = LE_64(crypt); + aad[2] = LE_64(version); + } + + enc_len = keydata_len + SHA512_HMAC_KEYLEN; + puio.uio_iov = plain_iovecs; + puio.uio_segflg = UIO_SYSSPACE; + puio.uio_iovcnt = 2; + cuio.uio_iov = cipher_iovecs; + cuio.uio_iovcnt = 3; + cuio.uio_segflg = UIO_SYSSPACE; + + /* decrypt the keys and store the result in the output buffers */ + ret = zio_do_crypt_uio(B_FALSE, crypt, cwkey, NULL, iv, enc_len, + &puio, &cuio, (uint8_t *)aad, aad_len); + if (ret != 0) + goto error; + + /* generate a fresh salt */ + ret = random_get_bytes(key->zk_salt, ZIO_DATA_SALT_LEN); + if (ret != 0) + goto error; + + /* derive the current key from the master key */ + ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0, + key->zk_salt, ZIO_DATA_SALT_LEN, key->zk_current_keydata, + keydata_len); + if (ret != 0) + goto error; + + /* initialize keys for ICP */ + key->zk_current_key.ck_format = CRYPTO_KEY_RAW; + key->zk_current_key.ck_data = key->zk_current_keydata; + key->zk_current_key.ck_length = CRYPTO_BYTES2BITS(keydata_len); + + key->zk_hmac_key.ck_format = CRYPTO_KEY_RAW; + key->zk_hmac_key.ck_data = key->zk_hmac_keydata; + key->zk_hmac_key.ck_length = CRYPTO_BYTES2BITS(SHA512_HMAC_KEYLEN); + + /* + * Initialize the crypto templates. It's ok if this fails because + * this is just an optimization. 
+ */ + mech.cm_type = crypto_mech2id(zio_crypt_table[crypt].ci_mechname); + ret = crypto_create_ctx_template(&mech, &key->zk_current_key, + &key->zk_current_tmpl, KM_SLEEP); + if (ret != CRYPTO_SUCCESS) + key->zk_current_tmpl = NULL; + + mech.cm_type = crypto_mech2id(SUN_CKM_SHA512_HMAC); + ret = crypto_create_ctx_template(&mech, &key->zk_hmac_key, + &key->zk_hmac_tmpl, KM_SLEEP); + if (ret != CRYPTO_SUCCESS) + key->zk_hmac_tmpl = NULL; + + key->zk_crypt = crypt; + key->zk_version = version; + key->zk_guid = guid; + key->zk_salt_count = 0; + + return (0); + +error: + zio_crypt_key_destroy(key); + return (ret); +} + +int +zio_crypt_generate_iv(uint8_t *ivbuf) +{ + int ret; + + /* randomly generate the IV */ + ret = random_get_pseudo_bytes(ivbuf, ZIO_DATA_IV_LEN); + if (ret != 0) + goto error; + + return (0); + +error: + bzero(ivbuf, ZIO_DATA_IV_LEN); + return (ret); +} + +int +zio_crypt_do_hmac(zio_crypt_key_t *key, uint8_t *data, uint_t datalen, + uint8_t *digestbuf, uint_t digestlen) +{ + int ret; + crypto_mechanism_t mech; + crypto_data_t in_data, digest_data; + uint8_t raw_digestbuf[SHA512_DIGEST_LENGTH]; + + ASSERT3U(digestlen, <=, SHA512_DIGEST_LENGTH); + + /* initialize sha512-hmac mechanism and crypto data */ + mech.cm_type = crypto_mech2id(SUN_CKM_SHA512_HMAC); + mech.cm_param = NULL; + mech.cm_param_len = 0; + + /* initialize the crypto data */ + in_data.cd_format = CRYPTO_DATA_RAW; + in_data.cd_offset = 0; + in_data.cd_length = datalen; + in_data.cd_raw.iov_base = (char *)data; + in_data.cd_raw.iov_len = in_data.cd_length; + + digest_data.cd_format = CRYPTO_DATA_RAW; + digest_data.cd_offset = 0; + digest_data.cd_length = SHA512_DIGEST_LENGTH; + digest_data.cd_raw.iov_base = (char *)raw_digestbuf; + digest_data.cd_raw.iov_len = digest_data.cd_length; + + /* generate the hmac */ + ret = crypto_mac(&mech, &in_data, &key->zk_hmac_key, key->zk_hmac_tmpl, + &digest_data, NULL); + if (ret != CRYPTO_SUCCESS) { + ret = SET_ERROR(EIO); + goto error; + } + + bcopy(raw_digestbuf, digestbuf, digestlen); + + return (0); + +error: + bzero(digestbuf, digestlen); + return (ret); +} + +int +zio_crypt_generate_iv_salt_dedup(zio_crypt_key_t *key, uint8_t *data, + uint_t datalen, uint8_t *ivbuf, uint8_t *salt) +{ + int ret; + uint8_t digestbuf[SHA512_DIGEST_LENGTH]; + + ret = zio_crypt_do_hmac(key, data, datalen, + digestbuf, SHA512_DIGEST_LENGTH); + if (ret != 0) + return (ret); + + bcopy(digestbuf, salt, ZIO_DATA_SALT_LEN); + bcopy(digestbuf + ZIO_DATA_SALT_LEN, ivbuf, ZIO_DATA_IV_LEN); + + return (0); +} + +/* + * The following functions are used to encode and decode encryption parameters + * into blkptr_t and zil_header_t. The ICP wants to use these parameters as + * byte strings, which normally means that these strings would not need to deal + * with byteswapping at all. However, both blkptr_t and zil_header_t may be + * byteswapped by lower layers and so we must "undo" that byteswap here upon + * decoding and encoding in a non-native byteorder. These functions require + * that the byteorder bit is correct before being called. 
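To make the packing concrete before the encode/decode functions that follow, here is a minimal sketch of the native-byte-order case: the 8-byte salt lands in the third DVA's first word, the first 8 bytes of the 12-byte IV land in its second word, and the remaining 4 IV bytes are stored separately (BP_SET_IV2() in the real code). The struct below is illustrative only, not the actual blkptr_t layout.

#include <stdint.h>
#include <string.h>

/* Illustrative stand-ins for the fields an encrypted bp repurposes. */
struct bp_crypt_slots {
	uint64_t dva2_word0;	/* 8-byte salt */
	uint64_t dva2_word1;	/* IV bytes 0..7 */
	uint32_t iv2;		/* IV bytes 8..11 */
};

static void
encode_params_native(struct bp_crypt_slots *bp, const uint8_t salt[8],
    const uint8_t iv[12])
{
	memcpy(&bp->dva2_word0, salt, sizeof (uint64_t));
	memcpy(&bp->dva2_word1, iv, sizeof (uint64_t));
	memcpy(&bp->iv2, iv + sizeof (uint64_t), sizeof (uint32_t));
}

The byteswapped branch of the real code performs the same copies but swaps each word, so the raw salt and IV bytes read back identically regardless of which endianness wrote the block.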
+ */ +void +zio_crypt_encode_params_bp(blkptr_t *bp, uint8_t *salt, uint8_t *iv) +{ + uint64_t val64; + uint32_t val32; + + ASSERT(BP_IS_ENCRYPTED(bp)); + + if (!BP_SHOULD_BYTESWAP(bp)) { + bcopy(salt, &bp->blk_dva[2].dva_word[0], sizeof (uint64_t)); + bcopy(iv, &bp->blk_dva[2].dva_word[1], sizeof (uint64_t)); + bcopy(iv + sizeof (uint64_t), &val32, sizeof (uint32_t)); + BP_SET_IV2(bp, val32); + } else { + bcopy(salt, &val64, sizeof (uint64_t)); + bp->blk_dva[2].dva_word[0] = BSWAP_64(val64); + + bcopy(iv, &val64, sizeof (uint64_t)); + bp->blk_dva[2].dva_word[1] = BSWAP_64(val64); + + bcopy(iv + sizeof (uint64_t), &val32, sizeof (uint32_t)); + BP_SET_IV2(bp, BSWAP_32(val32)); + } +} + +void +zio_crypt_decode_params_bp(const blkptr_t *bp, uint8_t *salt, uint8_t *iv) +{ + uint64_t val64; + uint32_t val32; + + ASSERT(BP_IS_PROTECTED(bp)); + + /* for convenience, so callers don't need to check */ + if (BP_IS_AUTHENTICATED(bp)) { + bzero(salt, ZIO_DATA_SALT_LEN); + bzero(iv, ZIO_DATA_IV_LEN); + return; + } + + if (!BP_SHOULD_BYTESWAP(bp)) { + bcopy(&bp->blk_dva[2].dva_word[0], salt, sizeof (uint64_t)); + bcopy(&bp->blk_dva[2].dva_word[1], iv, sizeof (uint64_t)); + + val32 = (uint32_t)BP_GET_IV2(bp); + bcopy(&val32, iv + sizeof (uint64_t), sizeof (uint32_t)); + } else { + val64 = BSWAP_64(bp->blk_dva[2].dva_word[0]); + bcopy(&val64, salt, sizeof (uint64_t)); + + val64 = BSWAP_64(bp->blk_dva[2].dva_word[1]); + bcopy(&val64, iv, sizeof (uint64_t)); + + val32 = BSWAP_32((uint32_t)BP_GET_IV2(bp)); + bcopy(&val32, iv + sizeof (uint64_t), sizeof (uint32_t)); + } +} + +void +zio_crypt_encode_mac_bp(blkptr_t *bp, uint8_t *mac) +{ + uint64_t val64; + + ASSERT(BP_USES_CRYPT(bp)); + ASSERT3U(BP_GET_TYPE(bp), !=, DMU_OT_OBJSET); + + if (!BP_SHOULD_BYTESWAP(bp)) { + bcopy(mac, &bp->blk_cksum.zc_word[2], sizeof (uint64_t)); + bcopy(mac + sizeof (uint64_t), &bp->blk_cksum.zc_word[3], + sizeof (uint64_t)); + } else { + bcopy(mac, &val64, sizeof (uint64_t)); + bp->blk_cksum.zc_word[2] = BSWAP_64(val64); + + bcopy(mac + sizeof (uint64_t), &val64, sizeof (uint64_t)); + bp->blk_cksum.zc_word[3] = BSWAP_64(val64); + } +} + +void +zio_crypt_decode_mac_bp(const blkptr_t *bp, uint8_t *mac) +{ + uint64_t val64; + + ASSERT(BP_USES_CRYPT(bp) || BP_IS_HOLE(bp)); + + /* for convenience, so callers don't need to check */ + if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) { + bzero(mac, ZIO_DATA_MAC_LEN); + return; + } + + if (!BP_SHOULD_BYTESWAP(bp)) { + bcopy(&bp->blk_cksum.zc_word[2], mac, sizeof (uint64_t)); + bcopy(&bp->blk_cksum.zc_word[3], mac + sizeof (uint64_t), + sizeof (uint64_t)); + } else { + val64 = BSWAP_64(bp->blk_cksum.zc_word[2]); + bcopy(&val64, mac, sizeof (uint64_t)); + + val64 = BSWAP_64(bp->blk_cksum.zc_word[3]); + bcopy(&val64, mac + sizeof (uint64_t), sizeof (uint64_t)); + } +} + +void +zio_crypt_encode_mac_zil(void *data, uint8_t *mac) +{ + zil_chain_t *zilc = data; + + bcopy(mac, &zilc->zc_eck.zec_cksum.zc_word[2], sizeof (uint64_t)); + bcopy(mac + sizeof (uint64_t), &zilc->zc_eck.zec_cksum.zc_word[3], + sizeof (uint64_t)); +} + +void +zio_crypt_decode_mac_zil(const void *data, uint8_t *mac) +{ + /* + * The ZIL MAC is embedded in the block it protects, which will + * not have been byteswapped by the time this function has been called. + * As a result, we don't need to worry about byteswapping the MAC. 
+ */ + const zil_chain_t *zilc = data; + + bcopy(&zilc->zc_eck.zec_cksum.zc_word[2], mac, sizeof (uint64_t)); + bcopy(&zilc->zc_eck.zec_cksum.zc_word[3], mac + sizeof (uint64_t), + sizeof (uint64_t)); +} + +/* + * This routine takes a block of dnodes (src_abd) and copies only the bonus + * buffers to the same offsets in the dst buffer. datalen should be the size + * of both the src_abd and the dst buffer (not just the length of the bonus + * buffers). + */ +void +zio_crypt_copy_dnode_bonus(abd_t *src_abd, uint8_t *dst, uint_t datalen) +{ + uint_t i, max_dnp = datalen >> DNODE_SHIFT; + uint8_t *src; + dnode_phys_t *dnp, *sdnp, *ddnp; + + src = abd_borrow_buf_copy(src_abd, datalen); + + sdnp = (dnode_phys_t *)src; + ddnp = (dnode_phys_t *)dst; + + for (i = 0; i < max_dnp; i += sdnp[i].dn_extra_slots + 1) { + dnp = &sdnp[i]; + if (dnp->dn_type != DMU_OT_NONE && + DMU_OT_IS_ENCRYPTED(dnp->dn_bonustype) && + dnp->dn_bonuslen != 0) { + bcopy(DN_BONUS(dnp), DN_BONUS(&ddnp[i]), + DN_MAX_BONUS_LEN(dnp)); + } + } + + abd_return_buf(src_abd, src, datalen); +} + +/* + * This function decides what fields from blk_prop are included in + * the on-disk various MAC algorithms. + */ +static void +zio_crypt_bp_zero_nonportable_blkprop(blkptr_t *bp, uint64_t version) +{ + /* + * Version 0 did not properly zero out all non-portable fields + * as it should have done. We maintain this code so that we can + * do read-only imports of pools on this version. + */ + if (version == 0) { + BP_SET_DEDUP(bp, 0); + BP_SET_CHECKSUM(bp, 0); + BP_SET_PSIZE(bp, SPA_MINBLOCKSIZE); + return; + } + + ASSERT3U(version, ==, ZIO_CRYPT_KEY_CURRENT_VERSION); + + /* + * The hole_birth feature might set these fields even if this bp + * is a hole. We zero them out here to guarantee that raw sends + * will function with or without the feature. + */ + if (BP_IS_HOLE(bp)) { + bp->blk_prop = 0ULL; + return; + } + + /* + * At L0 we want to verify these fields to ensure that data blocks + * can not be reinterpreted. For instance, we do not want an attacker + * to trick us into returning raw lz4 compressed data to the user + * by modifying the compression bits. At higher levels, we cannot + * enforce this policy since raw sends do not convey any information + * about indirect blocks, so these values might be different on the + * receive side. Fortunately, this does not open any new attack + * vectors, since any alterations that can be made to a higher level + * bp must still verify the correct order of the layer below it. + */ + if (BP_GET_LEVEL(bp) != 0) { + BP_SET_BYTEORDER(bp, 0); + BP_SET_COMPRESS(bp, 0); + + /* + * psize cannot be set to zero or it will trigger + * asserts, but the value doesn't really matter as + * long as it is constant. + */ + BP_SET_PSIZE(bp, SPA_MINBLOCKSIZE); + } + + BP_SET_DEDUP(bp, 0); + BP_SET_CHECKSUM(bp, 0); +} + +static void +zio_crypt_bp_auth_init(uint64_t version, boolean_t should_bswap, blkptr_t *bp, + blkptr_auth_buf_t *bab, uint_t *bab_len) +{ + blkptr_t tmpbp = *bp; + + if (should_bswap) + byteswap_uint64_array(&tmpbp, sizeof (blkptr_t)); + + ASSERT(BP_USES_CRYPT(&tmpbp) || BP_IS_HOLE(&tmpbp)); + ASSERT0(BP_IS_EMBEDDED(&tmpbp)); + + zio_crypt_decode_mac_bp(&tmpbp, bab->bab_mac); + + /* + * We always MAC blk_prop in LE to ensure portability. This + * must be done after decoding the mac, since the endianness + * will get zero'd out here. 
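The authentication buffer being filled in here has a small, fixed format. Sketched in plain C (field names and ordering are illustrative, not the exact blkptr_auth_buf_t definition): it carries the block's 16-byte MAC, the portable blk_prop bits stored little-endian, and a pad word that, as the code just below shows, is only counted in the authenticated length on the current on-disk version.

#include <stdint.h>

#define BAB_MAC_LEN	16	/* ZIO_DATA_MAC_LEN in the real code */

/* Illustrative mirror of blkptr_auth_buf_t. */
typedef struct {
	uint8_t		bab_mac[BAB_MAC_LEN];	/* MAC copied out of the bp */
	uint64_t	bab_prop;		/* portable blk_prop bits, LE */
	uint64_t	bab_pad;		/* authenticated only on v1+ */
} bp_auth_buf_sketch;

static unsigned int
bp_auth_len(uint64_t version)
{
	/* version 0 pre-dates the pad word, so it is not authenticated */
	return (version == 0 ?
	    (unsigned int)(sizeof (bp_auth_buf_sketch) - sizeof (uint64_t)) :
	    (unsigned int)sizeof (bp_auth_buf_sketch));
}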
+ */ + zio_crypt_bp_zero_nonportable_blkprop(&tmpbp, version); + bab->bab_prop = LE_64(tmpbp.blk_prop); + bab->bab_pad = 0ULL; + + /* version 0 did not include the padding */ + *bab_len = sizeof (blkptr_auth_buf_t); + if (version == 0) + *bab_len -= sizeof (uint64_t); +} + +static int +zio_crypt_bp_do_hmac_updates(crypto_context_t ctx, uint64_t version, + boolean_t should_bswap, blkptr_t *bp) +{ + int ret; + uint_t bab_len; + blkptr_auth_buf_t bab; + crypto_data_t cd; + + zio_crypt_bp_auth_init(version, should_bswap, bp, &bab, &bab_len); + cd.cd_format = CRYPTO_DATA_RAW; + cd.cd_offset = 0; + cd.cd_length = bab_len; + cd.cd_raw.iov_base = (char *)&bab; + cd.cd_raw.iov_len = cd.cd_length; + + ret = crypto_mac_update(ctx, &cd, NULL); + if (ret != CRYPTO_SUCCESS) { + ret = SET_ERROR(EIO); + goto error; + } + + return (0); + +error: + return (ret); +} + +static void +zio_crypt_bp_do_indrect_checksum_updates(SHA2_CTX *ctx, uint64_t version, + boolean_t should_bswap, blkptr_t *bp) +{ + uint_t bab_len; + blkptr_auth_buf_t bab; + + zio_crypt_bp_auth_init(version, should_bswap, bp, &bab, &bab_len); + SHA2Update(ctx, &bab, bab_len); +} + +static void +zio_crypt_bp_do_aad_updates(uint8_t **aadp, uint_t *aad_len, uint64_t version, + boolean_t should_bswap, blkptr_t *bp) +{ + uint_t bab_len; + blkptr_auth_buf_t bab; + + zio_crypt_bp_auth_init(version, should_bswap, bp, &bab, &bab_len); + bcopy(&bab, *aadp, bab_len); + *aadp += bab_len; + *aad_len += bab_len; +} + +static int +zio_crypt_do_dnode_hmac_updates(crypto_context_t ctx, uint64_t version, + boolean_t should_bswap, dnode_phys_t *dnp) +{ + int ret, i; + dnode_phys_t *adnp; + boolean_t le_bswap = (should_bswap == ZFS_HOST_BYTEORDER); + crypto_data_t cd; + uint8_t tmp_dncore[offsetof(dnode_phys_t, dn_blkptr)]; + + cd.cd_format = CRYPTO_DATA_RAW; + cd.cd_offset = 0; + + /* authenticate the core dnode (masking out non-portable bits) */ + bcopy(dnp, tmp_dncore, sizeof (tmp_dncore)); + adnp = (dnode_phys_t *)tmp_dncore; + if (le_bswap) { + adnp->dn_datablkszsec = BSWAP_16(adnp->dn_datablkszsec); + adnp->dn_bonuslen = BSWAP_16(adnp->dn_bonuslen); + adnp->dn_maxblkid = BSWAP_64(adnp->dn_maxblkid); + adnp->dn_used = BSWAP_64(adnp->dn_used); + } + adnp->dn_flags &= DNODE_CRYPT_PORTABLE_FLAGS_MASK; + adnp->dn_used = 0; + + cd.cd_length = sizeof (tmp_dncore); + cd.cd_raw.iov_base = (char *)adnp; + cd.cd_raw.iov_len = cd.cd_length; + + ret = crypto_mac_update(ctx, &cd, NULL); + if (ret != CRYPTO_SUCCESS) { + ret = SET_ERROR(EIO); + goto error; + } + + for (i = 0; i < dnp->dn_nblkptr; i++) { + ret = zio_crypt_bp_do_hmac_updates(ctx, version, + should_bswap, &dnp->dn_blkptr[i]); + if (ret != 0) + goto error; + } + + if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { + ret = zio_crypt_bp_do_hmac_updates(ctx, version, + should_bswap, DN_SPILL_BLKPTR(dnp)); + if (ret != 0) + goto error; + } + + return (0); + +error: + return (ret); +} + +/* + * objset_phys_t blocks introduce a number of exceptions to the normal + * authentication process. objset_phys_t's contain 2 separate HMACS for + * protecting the integrity of their data. The portable_mac protects the + * metadnode. This MAC can be sent with a raw send and protects against + * reordering of data within the metadnode. The local_mac protects the user + * accounting objects which are not sent from one system to another. + * + * In addition, objset blocks are the only blocks that can be modified and + * written to disk without the key loaded under certain circumstances. 
During + * zil_claim() we need to be able to update the zil_header_t to complete + * claiming log blocks and during raw receives we need to write out the + * portable_mac from the send file. Both of these actions are possible + * because these fields are not protected by either MAC so neither one will + * need to modify the MACs without the key. However, when the modified blocks + * are written out they will be byteswapped into the host machine's native + * endianness which will modify fields protected by the MAC. As a result, MAC + * calculation for objset blocks works slightly differently from other block + * types. Where other block types MAC the data in whatever endianness is + * written to disk, objset blocks always MAC little endian version of their + * values. In the code, should_bswap is the value from BP_SHOULD_BYTESWAP() + * and le_bswap indicates whether a byteswap is needed to get this block + * into little endian format. + */ +int +zio_crypt_do_objset_hmacs(zio_crypt_key_t *key, void *data, uint_t datalen, + boolean_t should_bswap, uint8_t *portable_mac, uint8_t *local_mac) +{ + int ret; + crypto_mechanism_t mech; + crypto_context_t ctx; + crypto_data_t cd; + objset_phys_t *osp = data; + uint64_t intval; + boolean_t le_bswap = (should_bswap == ZFS_HOST_BYTEORDER); + uint8_t raw_portable_mac[SHA512_DIGEST_LENGTH]; + uint8_t raw_local_mac[SHA512_DIGEST_LENGTH]; + + /* initialize HMAC mechanism */ + mech.cm_type = crypto_mech2id(SUN_CKM_SHA512_HMAC); + mech.cm_param = NULL; + mech.cm_param_len = 0; + + cd.cd_format = CRYPTO_DATA_RAW; + cd.cd_offset = 0; + + /* calculate the portable MAC from the portable fields and metadnode */ + ret = crypto_mac_init(&mech, &key->zk_hmac_key, NULL, &ctx, NULL); + if (ret != CRYPTO_SUCCESS) { + ret = SET_ERROR(EIO); + goto error; + } + + /* add in the os_type */ + intval = (le_bswap) ? osp->os_type : BSWAP_64(osp->os_type); + cd.cd_length = sizeof (uint64_t); + cd.cd_raw.iov_base = (char *)&intval; + cd.cd_raw.iov_len = cd.cd_length; + + ret = crypto_mac_update(ctx, &cd, NULL); + if (ret != CRYPTO_SUCCESS) { + ret = SET_ERROR(EIO); + goto error; + } + + /* add in the portable os_flags */ + intval = osp->os_flags; + if (should_bswap) + intval = BSWAP_64(intval); + intval &= OBJSET_CRYPT_PORTABLE_FLAGS_MASK; + if (!ZFS_HOST_BYTEORDER) + intval = BSWAP_64(intval); + + cd.cd_length = sizeof (uint64_t); + cd.cd_raw.iov_base = (char *)&intval; + cd.cd_raw.iov_len = cd.cd_length; + + ret = crypto_mac_update(ctx, &cd, NULL); + if (ret != CRYPTO_SUCCESS) { + ret = SET_ERROR(EIO); + goto error; + } + + /* add in fields from the metadnode */ + ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version, + should_bswap, &osp->os_meta_dnode); + if (ret) + goto error; + + /* store the final digest in a temporary buffer and copy what we need */ + cd.cd_length = SHA512_DIGEST_LENGTH; + cd.cd_raw.iov_base = (char *)raw_portable_mac; + cd.cd_raw.iov_len = cd.cd_length; + + ret = crypto_mac_final(ctx, &cd, NULL); + if (ret != CRYPTO_SUCCESS) { + ret = SET_ERROR(EIO); + goto error; + } + + bcopy(raw_portable_mac, portable_mac, ZIO_OBJSET_MAC_LEN); + + /* + * The local MAC protects the user, group and project accounting. + * If these objects are not present, the local MAC is zeroed out. 
+ */ + if ((datalen >= OBJSET_PHYS_SIZE_V3 && + osp->os_userused_dnode.dn_type == DMU_OT_NONE && + osp->os_groupused_dnode.dn_type == DMU_OT_NONE && + osp->os_projectused_dnode.dn_type == DMU_OT_NONE) || + (datalen >= OBJSET_PHYS_SIZE_V2 && + osp->os_userused_dnode.dn_type == DMU_OT_NONE && + osp->os_groupused_dnode.dn_type == DMU_OT_NONE) || + (datalen <= OBJSET_PHYS_SIZE_V1)) { + bzero(local_mac, ZIO_OBJSET_MAC_LEN); + return (0); + } + + /* calculate the local MAC from the userused and groupused dnodes */ + ret = crypto_mac_init(&mech, &key->zk_hmac_key, NULL, &ctx, NULL); + if (ret != CRYPTO_SUCCESS) { + ret = SET_ERROR(EIO); + goto error; + } + + /* add in the non-portable os_flags */ + intval = osp->os_flags; + if (should_bswap) + intval = BSWAP_64(intval); + intval &= ~OBJSET_CRYPT_PORTABLE_FLAGS_MASK; + if (!ZFS_HOST_BYTEORDER) + intval = BSWAP_64(intval); + + cd.cd_length = sizeof (uint64_t); + cd.cd_raw.iov_base = (char *)&intval; + cd.cd_raw.iov_len = cd.cd_length; + + ret = crypto_mac_update(ctx, &cd, NULL); + if (ret != CRYPTO_SUCCESS) { + ret = SET_ERROR(EIO); + goto error; + } + + /* add in fields from the user accounting dnodes */ + if (osp->os_userused_dnode.dn_type != DMU_OT_NONE) { + ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version, + should_bswap, &osp->os_userused_dnode); + if (ret) + goto error; + } + + if (osp->os_groupused_dnode.dn_type != DMU_OT_NONE) { + ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version, + should_bswap, &osp->os_groupused_dnode); + if (ret) + goto error; + } + + if (osp->os_projectused_dnode.dn_type != DMU_OT_NONE && + datalen >= OBJSET_PHYS_SIZE_V3) { + ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version, + should_bswap, &osp->os_projectused_dnode); + if (ret) + goto error; + } + + /* store the final digest in a temporary buffer and copy what we need */ + cd.cd_length = SHA512_DIGEST_LENGTH; + cd.cd_raw.iov_base = (char *)raw_local_mac; + cd.cd_raw.iov_len = cd.cd_length; + + ret = crypto_mac_final(ctx, &cd, NULL); + if (ret != CRYPTO_SUCCESS) { + ret = SET_ERROR(EIO); + goto error; + } + + bcopy(raw_local_mac, local_mac, ZIO_OBJSET_MAC_LEN); + + return (0); + +error: + bzero(portable_mac, ZIO_OBJSET_MAC_LEN); + bzero(local_mac, ZIO_OBJSET_MAC_LEN); + return (ret); +} + +static void +zio_crypt_destroy_uio(uio_t *uio) +{ + if (uio->uio_iov) + kmem_free(uio->uio_iov, uio->uio_iovcnt * sizeof (iovec_t)); +} + +/* + * This function parses an uncompressed indirect block and returns a checksum + * of all the portable fields from all of the contained bps. The portable + * fields are the MAC and all of the fields from blk_prop except for the dedup, + * checksum, and psize bits. For an explanation of the purpose of this, see + * the comment block on object set authentication. 
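An uncompressed indirect block is a dense array of 128-byte block pointers, so the checksum routine that follows reduces to one pass over that array plus a generate-or-verify step at the end. The sketch below captures that shape in plain C; the toy XOR digest only stands in for the SHA-512 used by the real code, which also feeds each entry's portable fields (not the raw bytes) into the digest, and a verification mismatch maps to a checksum error just as it does below.

#include <stdint.h>
#include <stddef.h>
#include <string.h>

#define BP_SIZE		128	/* on-disk size of one blkptr_t */
#define MAC_LEN		16	/* ZIO_DATA_MAC_LEN in the real code */

/* Toy digest standing in for SHA-512; for illustration only. */
static void
toy_digest(const uint8_t *data, size_t len, uint8_t out[MAC_LEN])
{
	memset(out, 0, MAC_LEN);
	for (size_t i = 0; i < len; i++)
		out[i % MAC_LEN] ^= data[i];
}

/* Generate (generate != 0) or verify (generate == 0) an indirect-block MAC. */
static int
indirect_mac_sketch(int generate, const uint8_t *buf, size_t datalen,
    uint8_t cksum[MAC_LEN])
{
	uint8_t digest[MAC_LEN];
	size_t epb = datalen / BP_SIZE;		/* block pointers per block */

	toy_digest(buf, epb * BP_SIZE, digest);
	if (generate) {
		memcpy(cksum, digest, MAC_LEN);
		return (0);
	}
	return (memcmp(digest, cksum, MAC_LEN) == 0 ? 0 : -1 /* ECKSUM */);
}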
+ */ +static int +zio_crypt_do_indirect_mac_checksum_impl(boolean_t generate, void *buf, + uint_t datalen, uint64_t version, boolean_t byteswap, uint8_t *cksum) +{ + blkptr_t *bp; + int i, epb = datalen >> SPA_BLKPTRSHIFT; + SHA2_CTX ctx; + uint8_t digestbuf[SHA512_DIGEST_LENGTH]; + + /* checksum all of the MACs from the layer below */ + SHA2Init(SHA512, &ctx); + for (i = 0, bp = buf; i < epb; i++, bp++) { + zio_crypt_bp_do_indrect_checksum_updates(&ctx, version, + byteswap, bp); + } + SHA2Final(digestbuf, &ctx); + + if (generate) { + bcopy(digestbuf, cksum, ZIO_DATA_MAC_LEN); + return (0); + } + + if (bcmp(digestbuf, cksum, ZIO_DATA_MAC_LEN) != 0) + return (SET_ERROR(ECKSUM)); + + return (0); +} + +int +zio_crypt_do_indirect_mac_checksum(boolean_t generate, void *buf, + uint_t datalen, boolean_t byteswap, uint8_t *cksum) +{ + int ret; + + /* + * Unfortunately, callers of this function will not always have + * easy access to the on-disk format version. This info is + * normally found in the DSL Crypto Key, but the checksum-of-MACs + * is expected to be verifiable even when the key isn't loaded. + * Here, instead of doing a ZAP lookup for the version for each + * zio, we simply try both existing formats. + */ + ret = zio_crypt_do_indirect_mac_checksum_impl(generate, buf, + datalen, ZIO_CRYPT_KEY_CURRENT_VERSION, byteswap, cksum); + if (ret == ECKSUM) { + ASSERT(!generate); + ret = zio_crypt_do_indirect_mac_checksum_impl(generate, + buf, datalen, 0, byteswap, cksum); + } + + return (ret); +} + +int +zio_crypt_do_indirect_mac_checksum_abd(boolean_t generate, abd_t *abd, + uint_t datalen, boolean_t byteswap, uint8_t *cksum) +{ + int ret; + void *buf; + + buf = abd_borrow_buf_copy(abd, datalen); + ret = zio_crypt_do_indirect_mac_checksum(generate, buf, datalen, + byteswap, cksum); + abd_return_buf(abd, buf, datalen); + + return (ret); +} + +/* + * Special case handling routine for encrypting / decrypting ZIL blocks. + * We do not check for the older ZIL chain because the encryption feature + * was not available before the newer ZIL chain was introduced. The goal + * here is to encrypt everything except the blkptr_t of a lr_write_t and + * the zil_chain_t header. Everything that is not encrypted is authenticated. + */ +static int +zio_crypt_init_uios_zil(boolean_t encrypt, uint8_t *plainbuf, + uint8_t *cipherbuf, uint_t datalen, boolean_t byteswap, uio_t *puio, + uio_t *cuio, uint_t *enc_len, uint8_t **authbuf, uint_t *auth_len, + boolean_t *no_crypt) +{ + int ret; + uint64_t txtype, lr_len; + uint_t nr_src, nr_dst, crypt_len; + uint_t aad_len = 0, nr_iovecs = 0, total_len = 0; + iovec_t *src_iovecs = NULL, *dst_iovecs = NULL; + uint8_t *src, *dst, *slrp, *dlrp, *blkend, *aadp; + zil_chain_t *zilc; + lr_t *lr; + uint8_t *aadbuf = zio_buf_alloc(datalen); + + /* cipherbuf always needs an extra iovec for the MAC */ + if (encrypt) { + src = plainbuf; + dst = cipherbuf; + nr_src = 0; + nr_dst = 1; + } else { + src = cipherbuf; + dst = plainbuf; + nr_src = 1; + nr_dst = 0; + } + + /* find the start and end record of the log block */ + zilc = (zil_chain_t *)src; + slrp = src + sizeof (zil_chain_t); + aadp = aadbuf; + blkend = src + ((byteswap) ? 
BSWAP_64(zilc->zc_nused) : zilc->zc_nused); + + /* calculate the number of encrypted iovecs we will need */ + for (; slrp < blkend; slrp += lr_len) { + lr = (lr_t *)slrp; + + if (!byteswap) { + txtype = lr->lrc_txtype; + lr_len = lr->lrc_reclen; + } else { + txtype = BSWAP_64(lr->lrc_txtype); + lr_len = BSWAP_64(lr->lrc_reclen); + } + + nr_iovecs++; + if (txtype == TX_WRITE && lr_len != sizeof (lr_write_t)) + nr_iovecs++; + } + + nr_src += nr_iovecs; + nr_dst += nr_iovecs; + + /* allocate the iovec arrays */ + if (nr_src != 0) { + src_iovecs = kmem_alloc(nr_src * sizeof (iovec_t), KM_SLEEP); + if (src_iovecs == NULL) { + ret = SET_ERROR(ENOMEM); + goto error; + } + } + + if (nr_dst != 0) { + dst_iovecs = kmem_alloc(nr_dst * sizeof (iovec_t), KM_SLEEP); + if (dst_iovecs == NULL) { + ret = SET_ERROR(ENOMEM); + goto error; + } + } + + /* + * Copy the plain zil header over and authenticate everything except + * the checksum that will store our MAC. If we are writing the data + * the embedded checksum will not have been calculated yet, so we don't + * authenticate that. + */ + bcopy(src, dst, sizeof (zil_chain_t)); + bcopy(src, aadp, sizeof (zil_chain_t) - sizeof (zio_eck_t)); + aadp += sizeof (zil_chain_t) - sizeof (zio_eck_t); + aad_len += sizeof (zil_chain_t) - sizeof (zio_eck_t); + + /* loop over records again, filling in iovecs */ + nr_iovecs = 0; + slrp = src + sizeof (zil_chain_t); + dlrp = dst + sizeof (zil_chain_t); + + for (; slrp < blkend; slrp += lr_len, dlrp += lr_len) { + lr = (lr_t *)slrp; + + if (!byteswap) { + txtype = lr->lrc_txtype; + lr_len = lr->lrc_reclen; + } else { + txtype = BSWAP_64(lr->lrc_txtype); + lr_len = BSWAP_64(lr->lrc_reclen); + } + + /* copy the common lr_t */ + bcopy(slrp, dlrp, sizeof (lr_t)); + bcopy(slrp, aadp, sizeof (lr_t)); + aadp += sizeof (lr_t); + aad_len += sizeof (lr_t); + + ASSERT3P(src_iovecs, !=, NULL); + ASSERT3P(dst_iovecs, !=, NULL); + + /* + * If this is a TX_WRITE record we want to encrypt everything + * except the bp if exists. If the bp does exist we want to + * authenticate it. 
+ */ + if (txtype == TX_WRITE) { + crypt_len = sizeof (lr_write_t) - + sizeof (lr_t) - sizeof (blkptr_t); + src_iovecs[nr_iovecs].iov_base = slrp + sizeof (lr_t); + src_iovecs[nr_iovecs].iov_len = crypt_len; + dst_iovecs[nr_iovecs].iov_base = dlrp + sizeof (lr_t); + dst_iovecs[nr_iovecs].iov_len = crypt_len; + + /* copy the bp now since it will not be encrypted */ + bcopy(slrp + sizeof (lr_write_t) - sizeof (blkptr_t), + dlrp + sizeof (lr_write_t) - sizeof (blkptr_t), + sizeof (blkptr_t)); + bcopy(slrp + sizeof (lr_write_t) - sizeof (blkptr_t), + aadp, sizeof (blkptr_t)); + aadp += sizeof (blkptr_t); + aad_len += sizeof (blkptr_t); + nr_iovecs++; + total_len += crypt_len; + + if (lr_len != sizeof (lr_write_t)) { + crypt_len = lr_len - sizeof (lr_write_t); + src_iovecs[nr_iovecs].iov_base = + slrp + sizeof (lr_write_t); + src_iovecs[nr_iovecs].iov_len = crypt_len; + dst_iovecs[nr_iovecs].iov_base = + dlrp + sizeof (lr_write_t); + dst_iovecs[nr_iovecs].iov_len = crypt_len; + nr_iovecs++; + total_len += crypt_len; + } + } else { + crypt_len = lr_len - sizeof (lr_t); + src_iovecs[nr_iovecs].iov_base = slrp + sizeof (lr_t); + src_iovecs[nr_iovecs].iov_len = crypt_len; + dst_iovecs[nr_iovecs].iov_base = dlrp + sizeof (lr_t); + dst_iovecs[nr_iovecs].iov_len = crypt_len; + nr_iovecs++; + total_len += crypt_len; + } + } + + *no_crypt = (nr_iovecs == 0); + *enc_len = total_len; + *authbuf = aadbuf; + *auth_len = aad_len; + + if (encrypt) { + puio->uio_iov = src_iovecs; + puio->uio_iovcnt = nr_src; + cuio->uio_iov = dst_iovecs; + cuio->uio_iovcnt = nr_dst; + } else { + puio->uio_iov = dst_iovecs; + puio->uio_iovcnt = nr_dst; + cuio->uio_iov = src_iovecs; + cuio->uio_iovcnt = nr_src; + } + + return (0); + +error: + zio_buf_free(aadbuf, datalen); + if (src_iovecs != NULL) + kmem_free(src_iovecs, nr_src * sizeof (iovec_t)); + if (dst_iovecs != NULL) + kmem_free(dst_iovecs, nr_dst * sizeof (iovec_t)); + + *enc_len = 0; + *authbuf = NULL; + *auth_len = 0; + *no_crypt = B_FALSE; + puio->uio_iov = NULL; + puio->uio_iovcnt = 0; + cuio->uio_iov = NULL; + cuio->uio_iovcnt = 0; + return (ret); +} + +/* + * Special case handling routine for encrypting / decrypting dnode blocks. + */ +static int +zio_crypt_init_uios_dnode(boolean_t encrypt, uint64_t version, + uint8_t *plainbuf, uint8_t *cipherbuf, uint_t datalen, boolean_t byteswap, + uio_t *puio, uio_t *cuio, uint_t *enc_len, uint8_t **authbuf, + uint_t *auth_len, boolean_t *no_crypt) +{ + int ret; + uint_t nr_src, nr_dst, crypt_len; + uint_t aad_len = 0, nr_iovecs = 0, total_len = 0; + uint_t i, j, max_dnp = datalen >> DNODE_SHIFT; + iovec_t *src_iovecs = NULL, *dst_iovecs = NULL; + uint8_t *src, *dst, *aadp; + dnode_phys_t *dnp, *adnp, *sdnp, *ddnp; + uint8_t *aadbuf = zio_buf_alloc(datalen); + + if (encrypt) { + src = plainbuf; + dst = cipherbuf; + nr_src = 0; + nr_dst = 1; + } else { + src = cipherbuf; + dst = plainbuf; + nr_src = 1; + nr_dst = 0; + } + + sdnp = (dnode_phys_t *)src; + ddnp = (dnode_phys_t *)dst; + aadp = aadbuf; + + /* + * Count the number of iovecs we will need to do the encryption by + * counting the number of bonus buffers that need to be encrypted. + */ + for (i = 0; i < max_dnp; i += sdnp[i].dn_extra_slots + 1) { + /* + * This block may still be byteswapped. However, all of the + * values we use are either uint8_t's (for which byteswapping + * is a noop) or a * != 0 check, which will work regardless + * of whether or not we byteswap. 
+ */ + if (sdnp[i].dn_type != DMU_OT_NONE && + DMU_OT_IS_ENCRYPTED(sdnp[i].dn_bonustype) && + sdnp[i].dn_bonuslen != 0) { + nr_iovecs++; + } + } + + nr_src += nr_iovecs; + nr_dst += nr_iovecs; + + if (nr_src != 0) { + src_iovecs = kmem_alloc(nr_src * sizeof (iovec_t), KM_SLEEP); + if (src_iovecs == NULL) { + ret = SET_ERROR(ENOMEM); + goto error; + } + } + + if (nr_dst != 0) { + dst_iovecs = kmem_alloc(nr_dst * sizeof (iovec_t), KM_SLEEP); + if (dst_iovecs == NULL) { + ret = SET_ERROR(ENOMEM); + goto error; + } + } + + nr_iovecs = 0; + + /* + * Iterate through the dnodes again, this time filling in the uios + * we allocated earlier. We also concatenate any data we want to + * authenticate onto aadbuf. + */ + for (i = 0; i < max_dnp; i += sdnp[i].dn_extra_slots + 1) { + dnp = &sdnp[i]; + + /* copy over the core fields and blkptrs (kept as plaintext) */ + bcopy(dnp, &ddnp[i], (uint8_t *)DN_BONUS(dnp) - (uint8_t *)dnp); + + if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { + bcopy(DN_SPILL_BLKPTR(dnp), DN_SPILL_BLKPTR(&ddnp[i]), + sizeof (blkptr_t)); + } + + /* + * Handle authenticated data. We authenticate everything in + * the dnode that can be brought over when we do a raw send. + * This includes all of the core fields as well as the MACs + * stored in the bp checksums and all of the portable bits + * from blk_prop. We include the dnode padding here in case it + * ever gets used in the future. Some dn_flags and dn_used are + * not portable so we mask those out values out of the + * authenticated data. + */ + crypt_len = offsetof(dnode_phys_t, dn_blkptr); + bcopy(dnp, aadp, crypt_len); + adnp = (dnode_phys_t *)aadp; + adnp->dn_flags &= DNODE_CRYPT_PORTABLE_FLAGS_MASK; + adnp->dn_used = 0; + aadp += crypt_len; + aad_len += crypt_len; + + for (j = 0; j < dnp->dn_nblkptr; j++) { + zio_crypt_bp_do_aad_updates(&aadp, &aad_len, + version, byteswap, &dnp->dn_blkptr[j]); + } + + if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { + zio_crypt_bp_do_aad_updates(&aadp, &aad_len, + version, byteswap, DN_SPILL_BLKPTR(dnp)); + } + + /* + * If this bonus buffer needs to be encrypted, we prepare an + * iovec_t. The encryption / decryption functions will fill + * this in for us with the encrypted or decrypted data. + * Otherwise we add the bonus buffer to the authenticated + * data buffer and copy it over to the destination. The + * encrypted iovec extends to DN_MAX_BONUS_LEN(dnp) so that + * we can guarantee alignment with the AES block size + * (128 bits). 
+ */ + crypt_len = DN_MAX_BONUS_LEN(dnp); + if (dnp->dn_type != DMU_OT_NONE && + DMU_OT_IS_ENCRYPTED(dnp->dn_bonustype) && + dnp->dn_bonuslen != 0) { + ASSERT3U(nr_iovecs, <, nr_src); + ASSERT3U(nr_iovecs, <, nr_dst); + ASSERT3P(src_iovecs, !=, NULL); + ASSERT3P(dst_iovecs, !=, NULL); + src_iovecs[nr_iovecs].iov_base = DN_BONUS(dnp); + src_iovecs[nr_iovecs].iov_len = crypt_len; + dst_iovecs[nr_iovecs].iov_base = DN_BONUS(&ddnp[i]); + dst_iovecs[nr_iovecs].iov_len = crypt_len; + + nr_iovecs++; + total_len += crypt_len; + } else { + bcopy(DN_BONUS(dnp), DN_BONUS(&ddnp[i]), crypt_len); + bcopy(DN_BONUS(dnp), aadp, crypt_len); + aadp += crypt_len; + aad_len += crypt_len; + } + } + + *no_crypt = (nr_iovecs == 0); + *enc_len = total_len; + *authbuf = aadbuf; + *auth_len = aad_len; + + if (encrypt) { + puio->uio_iov = src_iovecs; + puio->uio_iovcnt = nr_src; + cuio->uio_iov = dst_iovecs; + cuio->uio_iovcnt = nr_dst; + } else { + puio->uio_iov = dst_iovecs; + puio->uio_iovcnt = nr_dst; + cuio->uio_iov = src_iovecs; + cuio->uio_iovcnt = nr_src; + } + + return (0); + +error: + zio_buf_free(aadbuf, datalen); + if (src_iovecs != NULL) + kmem_free(src_iovecs, nr_src * sizeof (iovec_t)); + if (dst_iovecs != NULL) + kmem_free(dst_iovecs, nr_dst * sizeof (iovec_t)); + + *enc_len = 0; + *authbuf = NULL; + *auth_len = 0; + *no_crypt = B_FALSE; + puio->uio_iov = NULL; + puio->uio_iovcnt = 0; + cuio->uio_iov = NULL; + cuio->uio_iovcnt = 0; + return (ret); +} + +static int +zio_crypt_init_uios_normal(boolean_t encrypt, uint8_t *plainbuf, + uint8_t *cipherbuf, uint_t datalen, uio_t *puio, uio_t *cuio, + uint_t *enc_len) +{ + int ret; + uint_t nr_plain = 1, nr_cipher = 2; + iovec_t *plain_iovecs = NULL, *cipher_iovecs = NULL; + + /* allocate the iovecs for the plain and cipher data */ + plain_iovecs = kmem_alloc(nr_plain * sizeof (iovec_t), + KM_SLEEP); + if (!plain_iovecs) { + ret = SET_ERROR(ENOMEM); + goto error; + } + + cipher_iovecs = kmem_alloc(nr_cipher * sizeof (iovec_t), + KM_SLEEP); + if (!cipher_iovecs) { + ret = SET_ERROR(ENOMEM); + goto error; + } + + plain_iovecs[0].iov_base = plainbuf; + plain_iovecs[0].iov_len = datalen; + cipher_iovecs[0].iov_base = cipherbuf; + cipher_iovecs[0].iov_len = datalen; + + *enc_len = datalen; + puio->uio_iov = plain_iovecs; + puio->uio_iovcnt = nr_plain; + cuio->uio_iov = cipher_iovecs; + cuio->uio_iovcnt = nr_cipher; + + return (0); + +error: + if (plain_iovecs != NULL) + kmem_free(plain_iovecs, nr_plain * sizeof (iovec_t)); + if (cipher_iovecs != NULL) + kmem_free(cipher_iovecs, nr_cipher * sizeof (iovec_t)); + + *enc_len = 0; + puio->uio_iov = NULL; + puio->uio_iovcnt = 0; + cuio->uio_iov = NULL; + cuio->uio_iovcnt = 0; + return (ret); +} + +/* + * This function builds up the plaintext (puio) and ciphertext (cuio) uios so + * that they can be used for encryption and decryption by zio_do_crypt_uio(). + * Most blocks will use zio_crypt_init_uios_normal(), with ZIL and dnode blocks + * requiring special handling to parse out pieces that are to be encrypted. The + * authbuf is used by these special cases to store additional authenticated + * data (AAD) for the encryption modes. 
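One convention the dispatcher below depends on: whichever helper built the uios, the ciphertext uio was allocated with one spare iovec at the end (the nr_dst = 1 and nr_cipher = 2 initializers above), and zio_crypt_init_uios() points that last slot at the 16-byte MAC so encryption writes the tag and decryption reads it from the same place. A minimal userspace sketch of the arrangement:

#include <sys/uio.h>
#include <stdint.h>

#define MAC_LEN	16	/* ZIO_DATA_MAC_LEN in the real code */

/*
 * Point the spare final entry of a ciphertext iovec array at the MAC
 * buffer; iovcnt already counts that reserved slot.
 */
static void
attach_mac_iovec(struct iovec *cipher_iov, int iovcnt, uint8_t mac[MAC_LEN])
{
	cipher_iov[iovcnt - 1].iov_base = mac;
	cipher_iov[iovcnt - 1].iov_len = MAC_LEN;
}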
+ */ +static int +zio_crypt_init_uios(boolean_t encrypt, uint64_t version, dmu_object_type_t ot, + uint8_t *plainbuf, uint8_t *cipherbuf, uint_t datalen, boolean_t byteswap, + uint8_t *mac, uio_t *puio, uio_t *cuio, uint_t *enc_len, uint8_t **authbuf, + uint_t *auth_len, boolean_t *no_crypt) +{ + int ret; + iovec_t *mac_iov; + + ASSERT(DMU_OT_IS_ENCRYPTED(ot) || ot == DMU_OT_NONE); + + /* route to handler */ + switch (ot) { + case DMU_OT_INTENT_LOG: + ret = zio_crypt_init_uios_zil(encrypt, plainbuf, cipherbuf, + datalen, byteswap, puio, cuio, enc_len, authbuf, auth_len, + no_crypt); + break; + case DMU_OT_DNODE: + ret = zio_crypt_init_uios_dnode(encrypt, version, plainbuf, + cipherbuf, datalen, byteswap, puio, cuio, enc_len, authbuf, + auth_len, no_crypt); + break; + default: + ret = zio_crypt_init_uios_normal(encrypt, plainbuf, cipherbuf, + datalen, puio, cuio, enc_len); + *authbuf = NULL; + *auth_len = 0; + *no_crypt = B_FALSE; + break; + } + + if (ret != 0) + goto error; + + /* populate the uios */ + puio->uio_segflg = UIO_SYSSPACE; + cuio->uio_segflg = UIO_SYSSPACE; + + mac_iov = ((iovec_t *)&cuio->uio_iov[cuio->uio_iovcnt - 1]); + mac_iov->iov_base = mac; + mac_iov->iov_len = ZIO_DATA_MAC_LEN; + + return (0); + +error: + return (ret); +} + +/* + * Primary encryption / decryption entrypoint for zio data. + */ +int +zio_do_crypt_data(boolean_t encrypt, zio_crypt_key_t *key, + dmu_object_type_t ot, boolean_t byteswap, uint8_t *salt, uint8_t *iv, + uint8_t *mac, uint_t datalen, uint8_t *plainbuf, uint8_t *cipherbuf, + boolean_t *no_crypt) +{ + int ret; + boolean_t locked = B_FALSE; + uint64_t crypt = key->zk_crypt; + uint_t keydata_len = zio_crypt_table[crypt].ci_keylen; + uint_t enc_len, auth_len; + uio_t puio, cuio; + uint8_t enc_keydata[MASTER_KEY_MAX_LEN]; + crypto_key_t tmp_ckey, *ckey = NULL; + crypto_ctx_template_t tmpl; + uint8_t *authbuf = NULL; + + /* + * If the needed key is the current one, just use it. Otherwise we + * need to generate a temporary one from the given salt + master key. + * If we are encrypting, we must return a copy of the current salt + * so that it can be stored in the blkptr_t. + */ + rw_enter(&key->zk_salt_lock, RW_READER); + locked = B_TRUE; + + if (bcmp(salt, key->zk_salt, ZIO_DATA_SALT_LEN) == 0) { + ckey = &key->zk_current_key; + tmpl = key->zk_current_tmpl; + } else { + rw_exit(&key->zk_salt_lock); + locked = B_FALSE; + + ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0, + salt, ZIO_DATA_SALT_LEN, enc_keydata, keydata_len); + if (ret != 0) + goto error; + + tmp_ckey.ck_format = CRYPTO_KEY_RAW; + tmp_ckey.ck_data = enc_keydata; + tmp_ckey.ck_length = CRYPTO_BYTES2BITS(keydata_len); + + ckey = &tmp_ckey; + tmpl = NULL; + } + + /* + * Attempt to use QAT acceleration if we can. We currently don't + * do this for metadnode and ZIL blocks, since they have a much + * more involved buffer layout and the qat_crypt() function only + * works in-place. + */ + if (qat_crypt_use_accel(datalen) && + ot != DMU_OT_INTENT_LOG && ot != DMU_OT_DNODE) { + uint8_t *srcbuf, *dstbuf; + + if (encrypt) { + srcbuf = plainbuf; + dstbuf = cipherbuf; + } else { + srcbuf = cipherbuf; + dstbuf = plainbuf; + } + + ret = qat_crypt((encrypt) ? 
QAT_ENCRYPT : QAT_DECRYPT, srcbuf, + dstbuf, NULL, 0, iv, mac, ckey, key->zk_crypt, datalen); + if (ret == CPA_STATUS_SUCCESS) { + if (locked) { + rw_exit(&key->zk_salt_lock); + locked = B_FALSE; + } + + return (0); + } + /* If the hardware implementation fails fall back to software */ + } + + bzero(&puio, sizeof (uio_t)); + bzero(&cuio, sizeof (uio_t)); + + /* create uios for encryption */ + ret = zio_crypt_init_uios(encrypt, key->zk_version, ot, plainbuf, + cipherbuf, datalen, byteswap, mac, &puio, &cuio, &enc_len, + &authbuf, &auth_len, no_crypt); + if (ret != 0) + goto error; + + /* perform the encryption / decryption in software */ + ret = zio_do_crypt_uio(encrypt, key->zk_crypt, ckey, tmpl, iv, enc_len, + &puio, &cuio, authbuf, auth_len); + if (ret != 0) + goto error; + + if (locked) { + rw_exit(&key->zk_salt_lock); + locked = B_FALSE; + } + + if (authbuf != NULL) + zio_buf_free(authbuf, datalen); + if (ckey == &tmp_ckey) + bzero(enc_keydata, keydata_len); + zio_crypt_destroy_uio(&puio); + zio_crypt_destroy_uio(&cuio); + + return (0); + +error: + if (locked) + rw_exit(&key->zk_salt_lock); + if (authbuf != NULL) + zio_buf_free(authbuf, datalen); + if (ckey == &tmp_ckey) + bzero(enc_keydata, keydata_len); + zio_crypt_destroy_uio(&puio); + zio_crypt_destroy_uio(&cuio); + + return (ret); +} + +/* + * Simple wrapper around zio_do_crypt_data() to work with abd's instead of + * linear buffers. + */ +int +zio_do_crypt_abd(boolean_t encrypt, zio_crypt_key_t *key, dmu_object_type_t ot, + boolean_t byteswap, uint8_t *salt, uint8_t *iv, uint8_t *mac, + uint_t datalen, abd_t *pabd, abd_t *cabd, boolean_t *no_crypt) +{ + int ret; + void *ptmp, *ctmp; + + if (encrypt) { + ptmp = abd_borrow_buf_copy(pabd, datalen); + ctmp = abd_borrow_buf(cabd, datalen); + } else { + ptmp = abd_borrow_buf(pabd, datalen); + ctmp = abd_borrow_buf_copy(cabd, datalen); + } + + ret = zio_do_crypt_data(encrypt, key, ot, byteswap, salt, iv, mac, + datalen, ptmp, ctmp, no_crypt); + if (ret != 0) + goto error; + + if (encrypt) { + abd_return_buf(pabd, ptmp, datalen); + abd_return_buf_copy(cabd, ctmp, datalen); + } else { + abd_return_buf_copy(pabd, ptmp, datalen); + abd_return_buf(cabd, ctmp, datalen); + } + + return (0); + +error: + if (encrypt) { + abd_return_buf(pabd, ptmp, datalen); + abd_return_buf_copy(cabd, ctmp, datalen); + } else { + abd_return_buf_copy(pabd, ptmp, datalen); + abd_return_buf(cabd, ctmp, datalen); + } + + return (ret); +} + +#if defined(_KERNEL) +/* BEGIN CSTYLED */ +module_param(zfs_key_max_salt_uses, ulong, 0644); +MODULE_PARM_DESC(zfs_key_max_salt_uses, "Max number of times a salt value " + "can be used for generating encryption keys before it is rotated"); +/* END CSTYLED */ +#endif diff --git a/module/os/linux/zfs/zpl_ctldir.c b/module/os/linux/zfs/zpl_ctldir.c new file mode 100644 index 000000000..6df367b81 --- /dev/null +++ b/module/os/linux/zfs/zpl_ctldir.c @@ -0,0 +1,572 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (C) 2011 Lawrence Livermore National Security, LLC. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * LLNL-CODE-403049. + * Rewritten for Linux by: + * Rohan Puri <[email protected]> + * Brian Behlendorf <[email protected]> + */ + +#include <sys/zfs_vfsops.h> +#include <sys/zfs_vnops.h> +#include <sys/zfs_znode.h> +#include <sys/zfs_ctldir.h> +#include <sys/zpl.h> + +/* + * Common open routine. Disallow any write access. + */ +/* ARGSUSED */ +static int +zpl_common_open(struct inode *ip, struct file *filp) +{ + if (filp->f_mode & FMODE_WRITE) + return (-EACCES); + + return (generic_file_open(ip, filp)); +} + +/* + * Get root directory contents. + */ +static int +zpl_root_iterate(struct file *filp, zpl_dir_context_t *ctx) +{ + zfsvfs_t *zfsvfs = ITOZSB(file_inode(filp)); + int error = 0; + + ZFS_ENTER(zfsvfs); + + if (!zpl_dir_emit_dots(filp, ctx)) + goto out; + + if (ctx->pos == 2) { + if (!zpl_dir_emit(ctx, ZFS_SNAPDIR_NAME, + strlen(ZFS_SNAPDIR_NAME), ZFSCTL_INO_SNAPDIR, DT_DIR)) + goto out; + + ctx->pos++; + } + + if (ctx->pos == 3) { + if (!zpl_dir_emit(ctx, ZFS_SHAREDIR_NAME, + strlen(ZFS_SHAREDIR_NAME), ZFSCTL_INO_SHARES, DT_DIR)) + goto out; + + ctx->pos++; + } +out: + ZFS_EXIT(zfsvfs); + + return (error); +} + +#if !defined(HAVE_VFS_ITERATE) && !defined(HAVE_VFS_ITERATE_SHARED) +static int +zpl_root_readdir(struct file *filp, void *dirent, filldir_t filldir) +{ + zpl_dir_context_t ctx = + ZPL_DIR_CONTEXT_INIT(dirent, filldir, filp->f_pos); + int error; + + error = zpl_root_iterate(filp, &ctx); + filp->f_pos = ctx.pos; + + return (error); +} +#endif /* !HAVE_VFS_ITERATE && !HAVE_VFS_ITERATE_SHARED */ + +/* + * Get root directory attributes. + */ +/* ARGSUSED */ +static int +zpl_root_getattr_impl(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int query_flags) +{ + struct inode *ip = path->dentry->d_inode; + + generic_fillattr(ip, stat); + stat->atime = current_time(ip); + + return (0); +} +ZPL_GETATTR_WRAPPER(zpl_root_getattr); + +static struct dentry * +#ifdef HAVE_LOOKUP_NAMEIDATA +zpl_root_lookup(struct inode *dip, struct dentry *dentry, struct nameidata *nd) +#else +zpl_root_lookup(struct inode *dip, struct dentry *dentry, unsigned int flags) +#endif +{ + cred_t *cr = CRED(); + struct inode *ip; + int error; + + crhold(cr); + error = -zfsctl_root_lookup(dip, dname(dentry), &ip, 0, cr, NULL, NULL); + ASSERT3S(error, <=, 0); + crfree(cr); + + if (error) { + if (error == -ENOENT) + return (d_splice_alias(NULL, dentry)); + else + return (ERR_PTR(error)); + } + + return (d_splice_alias(ip, dentry)); +} + +/* + * The '.zfs' control directory file and inode operations. 
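The .zfs root iterator above illustrates the directory-context contract these ops follow: ctx->pos is the resume cursor, each successful emit advances it, and a refused emit means the caller's buffer is full and the same position is retried on the next call. Reduced to plain C with a hypothetical emit callback and made-up inode numbers:

#include <stdint.h>

/* Hypothetical emit callback: returns 0 when the caller's buffer is full. */
typedef int (*emit_fn)(void *arg, const char *name, uint64_t ino);

/* Fixed-content directory walk, resumable via *pos (entries 0 and 1 are . and ..). */
static void
iterate_fixed_dir(uint64_t *pos, emit_fn emit, void *arg)
{
	static const char *names[] = { ".", "..", "snapshot", "shares" };

	while (*pos < 4) {
		if (!emit(arg, names[*pos], *pos + 1))
			return;		/* buffer full; resume here next time */
		(*pos)++;
	}
}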
+ */ +const struct file_operations zpl_fops_root = { + .open = zpl_common_open, + .llseek = generic_file_llseek, + .read = generic_read_dir, +#ifdef HAVE_VFS_ITERATE_SHARED + .iterate_shared = zpl_root_iterate, +#elif defined(HAVE_VFS_ITERATE) + .iterate = zpl_root_iterate, +#else + .readdir = zpl_root_readdir, +#endif +}; + +const struct inode_operations zpl_ops_root = { + .lookup = zpl_root_lookup, + .getattr = zpl_root_getattr, +}; + +#ifdef HAVE_AUTOMOUNT +static struct vfsmount * +zpl_snapdir_automount(struct path *path) +{ + int error; + + error = -zfsctl_snapshot_mount(path, 0); + if (error) + return (ERR_PTR(error)); + + /* + * Rather than returning the new vfsmount for the snapshot we must + * return NULL to indicate a mount collision. This is done because + * the user space mount calls do_add_mount() which adds the vfsmount + * to the name space. If we returned the new mount here it would be + * added again to the vfsmount list resulting in list corruption. + */ + return (NULL); +} +#endif /* HAVE_AUTOMOUNT */ + +/* + * Negative dentries must always be revalidated so newly created snapshots + * can be detected and automounted. Normal dentries should be kept because + * as of the 3.18 kernel revaliding the mountpoint dentry will result in + * the snapshot being immediately unmounted. + */ +static int +#ifdef HAVE_D_REVALIDATE_NAMEIDATA +zpl_snapdir_revalidate(struct dentry *dentry, struct nameidata *i) +#else +zpl_snapdir_revalidate(struct dentry *dentry, unsigned int flags) +#endif +{ + return (!!dentry->d_inode); +} + +dentry_operations_t zpl_dops_snapdirs = { +/* + * Auto mounting of snapshots is only supported for 2.6.37 and + * newer kernels. Prior to this kernel the ops->follow_link() + * callback was used as a hack to trigger the mount. The + * resulting vfsmount was then explicitly grafted in to the + * name space. While it might be possible to add compatibility + * code to accomplish this it would require considerable care. 
+ */ +#ifdef HAVE_AUTOMOUNT + .d_automount = zpl_snapdir_automount, +#endif /* HAVE_AUTOMOUNT */ + .d_revalidate = zpl_snapdir_revalidate, +}; + +static struct dentry * +#ifdef HAVE_LOOKUP_NAMEIDATA +zpl_snapdir_lookup(struct inode *dip, struct dentry *dentry, + struct nameidata *nd) +#else +zpl_snapdir_lookup(struct inode *dip, struct dentry *dentry, + unsigned int flags) +#endif + +{ + fstrans_cookie_t cookie; + cred_t *cr = CRED(); + struct inode *ip = NULL; + int error; + + crhold(cr); + cookie = spl_fstrans_mark(); + error = -zfsctl_snapdir_lookup(dip, dname(dentry), &ip, + 0, cr, NULL, NULL); + ASSERT3S(error, <=, 0); + spl_fstrans_unmark(cookie); + crfree(cr); + + if (error && error != -ENOENT) + return (ERR_PTR(error)); + + ASSERT(error == 0 || ip == NULL); + d_clear_d_op(dentry); + d_set_d_op(dentry, &zpl_dops_snapdirs); +#ifdef HAVE_AUTOMOUNT + dentry->d_flags |= DCACHE_NEED_AUTOMOUNT; +#endif + + return (d_splice_alias(ip, dentry)); +} + +static int +zpl_snapdir_iterate(struct file *filp, zpl_dir_context_t *ctx) +{ + zfsvfs_t *zfsvfs = ITOZSB(file_inode(filp)); + fstrans_cookie_t cookie; + char snapname[MAXNAMELEN]; + boolean_t case_conflict; + uint64_t id, pos; + int error = 0; + + ZFS_ENTER(zfsvfs); + cookie = spl_fstrans_mark(); + + if (!zpl_dir_emit_dots(filp, ctx)) + goto out; + + pos = ctx->pos; + while (error == 0) { + dsl_pool_config_enter(dmu_objset_pool(zfsvfs->z_os), FTAG); + error = -dmu_snapshot_list_next(zfsvfs->z_os, MAXNAMELEN, + snapname, &id, &pos, &case_conflict); + dsl_pool_config_exit(dmu_objset_pool(zfsvfs->z_os), FTAG); + if (error) + goto out; + + if (!zpl_dir_emit(ctx, snapname, strlen(snapname), + ZFSCTL_INO_SHARES - id, DT_DIR)) + goto out; + + ctx->pos = pos; + } +out: + spl_fstrans_unmark(cookie); + ZFS_EXIT(zfsvfs); + + if (error == -ENOENT) + return (0); + + return (error); +} + +#if !defined(HAVE_VFS_ITERATE) && !defined(HAVE_VFS_ITERATE_SHARED) +static int +zpl_snapdir_readdir(struct file *filp, void *dirent, filldir_t filldir) +{ + zpl_dir_context_t ctx = + ZPL_DIR_CONTEXT_INIT(dirent, filldir, filp->f_pos); + int error; + + error = zpl_snapdir_iterate(filp, &ctx); + filp->f_pos = ctx.pos; + + return (error); +} +#endif /* !HAVE_VFS_ITERATE && !HAVE_VFS_ITERATE_SHARED */ + +static int +zpl_snapdir_rename2(struct inode *sdip, struct dentry *sdentry, + struct inode *tdip, struct dentry *tdentry, unsigned int flags) +{ + cred_t *cr = CRED(); + int error; + + /* We probably don't want to support renameat2(2) in ctldir */ + if (flags) + return (-EINVAL); + + crhold(cr); + error = -zfsctl_snapdir_rename(sdip, dname(sdentry), + tdip, dname(tdentry), cr, 0); + ASSERT3S(error, <=, 0); + crfree(cr); + + return (error); +} + +#ifndef HAVE_RENAME_WANTS_FLAGS +static int +zpl_snapdir_rename(struct inode *sdip, struct dentry *sdentry, + struct inode *tdip, struct dentry *tdentry) +{ + return (zpl_snapdir_rename2(sdip, sdentry, tdip, tdentry, 0)); +} +#endif + +static int +zpl_snapdir_rmdir(struct inode *dip, struct dentry *dentry) +{ + cred_t *cr = CRED(); + int error; + + crhold(cr); + error = -zfsctl_snapdir_remove(dip, dname(dentry), cr, 0); + ASSERT3S(error, <=, 0); + crfree(cr); + + return (error); +} + +static int +zpl_snapdir_mkdir(struct inode *dip, struct dentry *dentry, zpl_umode_t mode) +{ + cred_t *cr = CRED(); + vattr_t *vap; + struct inode *ip; + int error; + + crhold(cr); + vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP); + zpl_vap_init(vap, dip, mode | S_IFDIR, cr); + + error = -zfsctl_snapdir_mkdir(dip, dname(dentry), vap, &ip, cr, 0); + if 
(error == 0) { + d_clear_d_op(dentry); + d_set_d_op(dentry, &zpl_dops_snapdirs); + d_instantiate(dentry, ip); + } + + kmem_free(vap, sizeof (vattr_t)); + ASSERT3S(error, <=, 0); + crfree(cr); + + return (error); +} + +/* + * Get snapshot directory attributes. + */ +/* ARGSUSED */ +static int +zpl_snapdir_getattr_impl(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int query_flags) +{ + struct inode *ip = path->dentry->d_inode; + zfsvfs_t *zfsvfs = ITOZSB(ip); + + ZFS_ENTER(zfsvfs); + generic_fillattr(ip, stat); + + stat->nlink = stat->size = 2; + stat->ctime = stat->mtime = dmu_objset_snap_cmtime(zfsvfs->z_os); + stat->atime = current_time(ip); + ZFS_EXIT(zfsvfs); + + return (0); +} +ZPL_GETATTR_WRAPPER(zpl_snapdir_getattr); + +/* + * The '.zfs/snapshot' directory file operations. These mainly control + * generating the list of available snapshots when doing an 'ls' in the + * directory. See zpl_snapdir_readdir(). + */ +const struct file_operations zpl_fops_snapdir = { + .open = zpl_common_open, + .llseek = generic_file_llseek, + .read = generic_read_dir, +#ifdef HAVE_VFS_ITERATE_SHARED + .iterate_shared = zpl_snapdir_iterate, +#elif defined(HAVE_VFS_ITERATE) + .iterate = zpl_snapdir_iterate, +#else + .readdir = zpl_snapdir_readdir, +#endif + +}; + +/* + * The '.zfs/snapshot' directory inode operations. These mainly control + * creating an inode for a snapshot directory and initializing the needed + * infrastructure to automount the snapshot. See zpl_snapdir_lookup(). + */ +const struct inode_operations zpl_ops_snapdir = { + .lookup = zpl_snapdir_lookup, + .getattr = zpl_snapdir_getattr, +#ifdef HAVE_RENAME_WANTS_FLAGS + .rename = zpl_snapdir_rename2, +#else + .rename = zpl_snapdir_rename, +#endif + .rmdir = zpl_snapdir_rmdir, + .mkdir = zpl_snapdir_mkdir, +}; + +static struct dentry * +#ifdef HAVE_LOOKUP_NAMEIDATA +zpl_shares_lookup(struct inode *dip, struct dentry *dentry, + struct nameidata *nd) +#else +zpl_shares_lookup(struct inode *dip, struct dentry *dentry, + unsigned int flags) +#endif +{ + fstrans_cookie_t cookie; + cred_t *cr = CRED(); + struct inode *ip = NULL; + int error; + + crhold(cr); + cookie = spl_fstrans_mark(); + error = -zfsctl_shares_lookup(dip, dname(dentry), &ip, + 0, cr, NULL, NULL); + ASSERT3S(error, <=, 0); + spl_fstrans_unmark(cookie); + crfree(cr); + + if (error) { + if (error == -ENOENT) + return (d_splice_alias(NULL, dentry)); + else + return (ERR_PTR(error)); + } + + return (d_splice_alias(ip, dentry)); +} + +static int +zpl_shares_iterate(struct file *filp, zpl_dir_context_t *ctx) +{ + fstrans_cookie_t cookie; + cred_t *cr = CRED(); + zfsvfs_t *zfsvfs = ITOZSB(file_inode(filp)); + znode_t *dzp; + int error = 0; + + ZFS_ENTER(zfsvfs); + cookie = spl_fstrans_mark(); + + if (zfsvfs->z_shares_dir == 0) { + zpl_dir_emit_dots(filp, ctx); + goto out; + } + + error = -zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp); + if (error) + goto out; + + crhold(cr); + error = -zfs_readdir(ZTOI(dzp), ctx, cr); + crfree(cr); + + iput(ZTOI(dzp)); +out: + spl_fstrans_unmark(cookie); + ZFS_EXIT(zfsvfs); + ASSERT3S(error, <=, 0); + + return (error); +} + +#if !defined(HAVE_VFS_ITERATE) && !defined(HAVE_VFS_ITERATE_SHARED) +static int +zpl_shares_readdir(struct file *filp, void *dirent, filldir_t filldir) +{ + zpl_dir_context_t ctx = + ZPL_DIR_CONTEXT_INIT(dirent, filldir, filp->f_pos); + int error; + + error = zpl_shares_iterate(filp, &ctx); + filp->f_pos = ctx.pos; + + return (error); +} +#endif /* !HAVE_VFS_ITERATE && !HAVE_VFS_ITERATE_SHARED */ + +/* 
ARGSUSED */ +static int +zpl_shares_getattr_impl(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int query_flags) +{ + struct inode *ip = path->dentry->d_inode; + zfsvfs_t *zfsvfs = ITOZSB(ip); + znode_t *dzp; + int error; + + ZFS_ENTER(zfsvfs); + + if (zfsvfs->z_shares_dir == 0) { + generic_fillattr(path->dentry->d_inode, stat); + stat->nlink = stat->size = 2; + stat->atime = current_time(ip); + ZFS_EXIT(zfsvfs); + return (0); + } + + error = -zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp); + if (error == 0) { + error = -zfs_getattr_fast(ZTOI(dzp), stat); + iput(ZTOI(dzp)); + } + + ZFS_EXIT(zfsvfs); + ASSERT3S(error, <=, 0); + + return (error); +} +ZPL_GETATTR_WRAPPER(zpl_shares_getattr); + +/* + * The '.zfs/shares' directory file operations. + */ +const struct file_operations zpl_fops_shares = { + .open = zpl_common_open, + .llseek = generic_file_llseek, + .read = generic_read_dir, +#ifdef HAVE_VFS_ITERATE_SHARED + .iterate_shared = zpl_shares_iterate, +#elif defined(HAVE_VFS_ITERATE) + .iterate = zpl_shares_iterate, +#else + .readdir = zpl_shares_readdir, +#endif + +}; + +/* + * The '.zfs/shares' directory inode operations. + */ +const struct inode_operations zpl_ops_shares = { + .lookup = zpl_shares_lookup, + .getattr = zpl_shares_getattr, +}; diff --git a/module/os/linux/zfs/zpl_export.c b/module/os/linux/zfs/zpl_export.c new file mode 100644 index 000000000..a264d664c --- /dev/null +++ b/module/os/linux/zfs/zpl_export.c @@ -0,0 +1,177 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2011 Gunnar Beutner + * Copyright (c) 2012 Cyril Plisko. All rights reserved. + */ + + +#include <sys/zfs_vnops.h> +#include <sys/zfs_znode.h> +#include <sys/zfs_ctldir.h> +#include <sys/zpl.h> + + +static int +#ifdef HAVE_ENCODE_FH_WITH_INODE +zpl_encode_fh(struct inode *ip, __u32 *fh, int *max_len, struct inode *parent) +{ +#else +zpl_encode_fh(struct dentry *dentry, __u32 *fh, int *max_len, int connectable) +{ + /* CSTYLED */ + struct inode *ip = dentry->d_inode; +#endif /* HAVE_ENCODE_FH_WITH_INODE */ + fstrans_cookie_t cookie; + fid_t *fid = (fid_t *)fh; + int len_bytes, rc; + + len_bytes = *max_len * sizeof (__u32); + + if (len_bytes < offsetof(fid_t, fid_data)) + return (255); + + fid->fid_len = len_bytes - offsetof(fid_t, fid_data); + cookie = spl_fstrans_mark(); + + if (zfsctl_is_node(ip)) + rc = zfsctl_fid(ip, fid); + else + rc = zfs_fid(ip, fid); + + spl_fstrans_unmark(cookie); + len_bytes = offsetof(fid_t, fid_data) + fid->fid_len; + *max_len = roundup(len_bytes, sizeof (__u32)) / sizeof (__u32); + + return (rc == 0 ? 
FILEID_INO32_GEN : 255); +} + +static struct dentry * +zpl_dentry_obtain_alias(struct inode *ip) +{ + struct dentry *result; + +#ifdef HAVE_D_OBTAIN_ALIAS + result = d_obtain_alias(ip); +#else + result = d_alloc_anon(ip); + + if (result == NULL) { + iput(ip); + result = ERR_PTR(-ENOMEM); + } +#endif /* HAVE_D_OBTAIN_ALIAS */ + + return (result); +} + +static struct dentry * +zpl_fh_to_dentry(struct super_block *sb, struct fid *fh, + int fh_len, int fh_type) +{ + fid_t *fid = (fid_t *)fh; + fstrans_cookie_t cookie; + struct inode *ip; + int len_bytes, rc; + + len_bytes = fh_len * sizeof (__u32); + + if (fh_type != FILEID_INO32_GEN || + len_bytes < offsetof(fid_t, fid_data) || + len_bytes < offsetof(fid_t, fid_data) + fid->fid_len) + return (ERR_PTR(-EINVAL)); + + cookie = spl_fstrans_mark(); + rc = zfs_vget(sb, &ip, fid); + spl_fstrans_unmark(cookie); + + if (rc) { + /* + * If we see ENOENT it might mean that an NFSv4 * client + * is using a cached inode value in a file handle and + * that the sought after file has had its inode changed + * by a third party. So change the error to ESTALE + * which will trigger a full lookup by the client and + * will find the new filename/inode pair if it still + * exists. + */ + if (rc == ENOENT) + rc = ESTALE; + + return (ERR_PTR(-rc)); + } + + ASSERT((ip != NULL) && !IS_ERR(ip)); + + return (zpl_dentry_obtain_alias(ip)); +} + +static struct dentry * +zpl_get_parent(struct dentry *child) +{ + cred_t *cr = CRED(); + fstrans_cookie_t cookie; + struct inode *ip; + int error; + + crhold(cr); + cookie = spl_fstrans_mark(); + error = -zfs_lookup(child->d_inode, "..", &ip, 0, cr, NULL, NULL); + spl_fstrans_unmark(cookie); + crfree(cr); + ASSERT3S(error, <=, 0); + + if (error) + return (ERR_PTR(error)); + + return (zpl_dentry_obtain_alias(ip)); +} + +#ifdef HAVE_COMMIT_METADATA +static int +zpl_commit_metadata(struct inode *inode) +{ + cred_t *cr = CRED(); + fstrans_cookie_t cookie; + int error; + + if (zfsctl_is_node(inode)) + return (0); + + crhold(cr); + cookie = spl_fstrans_mark(); + error = -zfs_fsync(inode, 0, cr); + spl_fstrans_unmark(cookie); + crfree(cr); + ASSERT3S(error, <=, 0); + + return (error); +} +#endif /* HAVE_COMMIT_METADATA */ + +const struct export_operations zpl_export_operations = { + .encode_fh = zpl_encode_fh, + .fh_to_dentry = zpl_fh_to_dentry, + .get_parent = zpl_get_parent, +#ifdef HAVE_COMMIT_METADATA + .commit_metadata = zpl_commit_metadata, +#endif /* HAVE_COMMIT_METADATA */ +}; diff --git a/module/os/linux/zfs/zpl_file.c b/module/os/linux/zfs/zpl_file.c new file mode 100644 index 000000000..acad4670d --- /dev/null +++ b/module/os/linux/zfs/zpl_file.c @@ -0,0 +1,1075 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2011, Lawrence Livermore National Security, LLC. + * Copyright (c) 2015 by Chunwei Chen. All rights reserved. + */ + + +#ifdef CONFIG_COMPAT +#include <linux/compat.h> +#endif +#include <sys/file.h> +#include <sys/dmu_objset.h> +#include <sys/zfs_vfsops.h> +#include <sys/zfs_vnops.h> +#include <sys/zfs_znode.h> +#include <sys/zfs_project.h> + + +static int +zpl_open(struct inode *ip, struct file *filp) +{ + cred_t *cr = CRED(); + int error; + fstrans_cookie_t cookie; + + error = generic_file_open(ip, filp); + if (error) + return (error); + + crhold(cr); + cookie = spl_fstrans_mark(); + error = -zfs_open(ip, filp->f_mode, filp->f_flags, cr); + spl_fstrans_unmark(cookie); + crfree(cr); + ASSERT3S(error, <=, 0); + + return (error); +} + +static int +zpl_release(struct inode *ip, struct file *filp) +{ + cred_t *cr = CRED(); + int error; + fstrans_cookie_t cookie; + + cookie = spl_fstrans_mark(); + if (ITOZ(ip)->z_atime_dirty) + zfs_mark_inode_dirty(ip); + + crhold(cr); + error = -zfs_close(ip, filp->f_flags, cr); + spl_fstrans_unmark(cookie); + crfree(cr); + ASSERT3S(error, <=, 0); + + return (error); +} + +static int +zpl_iterate(struct file *filp, zpl_dir_context_t *ctx) +{ + cred_t *cr = CRED(); + int error; + fstrans_cookie_t cookie; + + crhold(cr); + cookie = spl_fstrans_mark(); + error = -zfs_readdir(file_inode(filp), ctx, cr); + spl_fstrans_unmark(cookie); + crfree(cr); + ASSERT3S(error, <=, 0); + + return (error); +} + +#if !defined(HAVE_VFS_ITERATE) && !defined(HAVE_VFS_ITERATE_SHARED) +static int +zpl_readdir(struct file *filp, void *dirent, filldir_t filldir) +{ + zpl_dir_context_t ctx = + ZPL_DIR_CONTEXT_INIT(dirent, filldir, filp->f_pos); + int error; + + error = zpl_iterate(filp, &ctx); + filp->f_pos = ctx.pos; + + return (error); +} +#endif /* !HAVE_VFS_ITERATE && !HAVE_VFS_ITERATE_SHARED */ + +#if defined(HAVE_FSYNC_WITH_DENTRY) +/* + * Linux 2.6.x - 2.6.34 API, + * Through 2.6.34 the nfsd kernel server would pass a NULL 'file struct *' + * to the fops->fsync() hook. For this reason, we must be careful not to + * use filp unconditionally. + */ +static int +zpl_fsync(struct file *filp, struct dentry *dentry, int datasync) +{ + cred_t *cr = CRED(); + int error; + fstrans_cookie_t cookie; + + crhold(cr); + cookie = spl_fstrans_mark(); + error = -zfs_fsync(dentry->d_inode, datasync, cr); + spl_fstrans_unmark(cookie); + crfree(cr); + ASSERT3S(error, <=, 0); + + return (error); +} + +#ifdef HAVE_FILE_AIO_FSYNC +static int +zpl_aio_fsync(struct kiocb *kiocb, int datasync) +{ + struct file *filp = kiocb->ki_filp; + return (zpl_fsync(filp, file_dentry(filp), datasync)); +} +#endif + +#elif defined(HAVE_FSYNC_WITHOUT_DENTRY) +/* + * Linux 2.6.35 - 3.0 API, + * As of 2.6.35 the dentry argument to the fops->fsync() hook was deemed + * redundant. The dentry is still accessible via filp->f_path.dentry, + * and we are guaranteed that filp will never be NULL. 
+ */ +static int +zpl_fsync(struct file *filp, int datasync) +{ + struct inode *inode = filp->f_mapping->host; + cred_t *cr = CRED(); + int error; + fstrans_cookie_t cookie; + + crhold(cr); + cookie = spl_fstrans_mark(); + error = -zfs_fsync(inode, datasync, cr); + spl_fstrans_unmark(cookie); + crfree(cr); + ASSERT3S(error, <=, 0); + + return (error); +} + +#ifdef HAVE_FILE_AIO_FSYNC +static int +zpl_aio_fsync(struct kiocb *kiocb, int datasync) +{ + return (zpl_fsync(kiocb->ki_filp, datasync)); +} +#endif + +#elif defined(HAVE_FSYNC_RANGE) +/* + * Linux 3.1 - 3.x API, + * As of 3.1 the responsibility to call filemap_write_and_wait_range() has + * been pushed down in to the .fsync() vfs hook. Additionally, the i_mutex + * lock is no longer held by the caller, for zfs we don't require the lock + * to be held so we don't acquire it. + */ +static int +zpl_fsync(struct file *filp, loff_t start, loff_t end, int datasync) +{ + struct inode *inode = filp->f_mapping->host; + cred_t *cr = CRED(); + int error; + fstrans_cookie_t cookie; + + error = filemap_write_and_wait_range(inode->i_mapping, start, end); + if (error) + return (error); + + crhold(cr); + cookie = spl_fstrans_mark(); + error = -zfs_fsync(inode, datasync, cr); + spl_fstrans_unmark(cookie); + crfree(cr); + ASSERT3S(error, <=, 0); + + return (error); +} + +#ifdef HAVE_FILE_AIO_FSYNC +static int +zpl_aio_fsync(struct kiocb *kiocb, int datasync) +{ + return (zpl_fsync(kiocb->ki_filp, kiocb->ki_pos, -1, datasync)); +} +#endif + +#else +#error "Unsupported fops->fsync() implementation" +#endif + +static inline int +zfs_io_flags(struct kiocb *kiocb) +{ + int flags = 0; + +#if defined(IOCB_DSYNC) + if (kiocb->ki_flags & IOCB_DSYNC) + flags |= FDSYNC; +#endif +#if defined(IOCB_SYNC) + if (kiocb->ki_flags & IOCB_SYNC) + flags |= FSYNC; +#endif +#if defined(IOCB_APPEND) + if (kiocb->ki_flags & IOCB_APPEND) + flags |= FAPPEND; +#endif +#if defined(IOCB_DIRECT) + if (kiocb->ki_flags & IOCB_DIRECT) + flags |= FDIRECT; +#endif + return (flags); +} + +static ssize_t +zpl_read_common_iovec(struct inode *ip, const struct iovec *iovp, size_t count, + unsigned long nr_segs, loff_t *ppos, uio_seg_t segment, int flags, + cred_t *cr, size_t skip) +{ + ssize_t read; + uio_t uio = { { 0 }, 0 }; + int error; + fstrans_cookie_t cookie; + + uio.uio_iov = iovp; + uio.uio_iovcnt = nr_segs; + uio.uio_loffset = *ppos; + uio.uio_segflg = segment; + uio.uio_limit = MAXOFFSET_T; + uio.uio_resid = count; + uio.uio_skip = skip; + + cookie = spl_fstrans_mark(); + error = -zfs_read(ip, &uio, flags, cr); + spl_fstrans_unmark(cookie); + if (error < 0) + return (error); + + read = count - uio.uio_resid; + *ppos += read; + + return (read); +} + +inline ssize_t +zpl_read_common(struct inode *ip, const char *buf, size_t len, loff_t *ppos, + uio_seg_t segment, int flags, cred_t *cr) +{ + struct iovec iov; + + iov.iov_base = (void *)buf; + iov.iov_len = len; + + return (zpl_read_common_iovec(ip, &iov, len, 1, ppos, segment, + flags, cr, 0)); +} + +static ssize_t +zpl_iter_read_common(struct kiocb *kiocb, const struct iovec *iovp, + unsigned long nr_segs, size_t count, uio_seg_t seg, size_t skip) +{ + cred_t *cr = CRED(); + struct file *filp = kiocb->ki_filp; + struct inode *ip = filp->f_mapping->host; + zfsvfs_t *zfsvfs = ZTOZSB(ITOZ(ip)); + ssize_t read; + unsigned int f_flags = filp->f_flags; + + f_flags |= zfs_io_flags(kiocb); + crhold(cr); + read = zpl_read_common_iovec(filp->f_mapping->host, iovp, count, + nr_segs, &kiocb->ki_pos, seg, f_flags, cr, skip); + crfree(cr); + + /* 
+ * If relatime is enabled, call file_accessed() only if + * zfs_relatime_need_update() is true. This is needed since datasets + * with inherited "relatime" property aren't necessarily mounted with + * MNT_RELATIME flag (e.g. after `zfs set relatime=...`), which is what + * relatime test in VFS by relatime_need_update() is based on. + */ + if (!IS_NOATIME(ip) && zfsvfs->z_relatime) { + if (zfs_relatime_need_update(ip)) + file_accessed(filp); + } else { + file_accessed(filp); + } + + return (read); +} + +#if defined(HAVE_VFS_RW_ITERATE) +static ssize_t +zpl_iter_read(struct kiocb *kiocb, struct iov_iter *to) +{ + ssize_t ret; + uio_seg_t seg = UIO_USERSPACE; + if (to->type & ITER_KVEC) + seg = UIO_SYSSPACE; + if (to->type & ITER_BVEC) + seg = UIO_BVEC; + ret = zpl_iter_read_common(kiocb, to->iov, to->nr_segs, + iov_iter_count(to), seg, to->iov_offset); + if (ret > 0) + iov_iter_advance(to, ret); + return (ret); +} +#else +static ssize_t +zpl_aio_read(struct kiocb *kiocb, const struct iovec *iovp, + unsigned long nr_segs, loff_t pos) +{ + ssize_t ret; + size_t count; + + ret = generic_segment_checks(iovp, &nr_segs, &count, VERIFY_WRITE); + if (ret) + return (ret); + + return (zpl_iter_read_common(kiocb, iovp, nr_segs, count, + UIO_USERSPACE, 0)); +} +#endif /* HAVE_VFS_RW_ITERATE */ + +static ssize_t +zpl_write_common_iovec(struct inode *ip, const struct iovec *iovp, size_t count, + unsigned long nr_segs, loff_t *ppos, uio_seg_t segment, int flags, + cred_t *cr, size_t skip) +{ + ssize_t wrote; + uio_t uio = { { 0 }, 0 }; + int error; + fstrans_cookie_t cookie; + + if (flags & O_APPEND) + *ppos = i_size_read(ip); + + uio.uio_iov = iovp; + uio.uio_iovcnt = nr_segs; + uio.uio_loffset = *ppos; + uio.uio_segflg = segment; + uio.uio_limit = MAXOFFSET_T; + uio.uio_resid = count; + uio.uio_skip = skip; + + cookie = spl_fstrans_mark(); + error = -zfs_write(ip, &uio, flags, cr); + spl_fstrans_unmark(cookie); + if (error < 0) + return (error); + + wrote = count - uio.uio_resid; + *ppos += wrote; + + return (wrote); +} + +inline ssize_t +zpl_write_common(struct inode *ip, const char *buf, size_t len, loff_t *ppos, + uio_seg_t segment, int flags, cred_t *cr) +{ + struct iovec iov; + + iov.iov_base = (void *)buf; + iov.iov_len = len; + + return (zpl_write_common_iovec(ip, &iov, len, 1, ppos, segment, + flags, cr, 0)); +} + +static ssize_t +zpl_iter_write_common(struct kiocb *kiocb, const struct iovec *iovp, + unsigned long nr_segs, size_t count, uio_seg_t seg, size_t skip) +{ + cred_t *cr = CRED(); + struct file *filp = kiocb->ki_filp; + ssize_t wrote; + unsigned int f_flags = filp->f_flags; + + f_flags |= zfs_io_flags(kiocb); + crhold(cr); + wrote = zpl_write_common_iovec(filp->f_mapping->host, iovp, count, + nr_segs, &kiocb->ki_pos, seg, f_flags, cr, skip); + crfree(cr); + + return (wrote); +} + +#if defined(HAVE_VFS_RW_ITERATE) +static ssize_t +zpl_iter_write(struct kiocb *kiocb, struct iov_iter *from) +{ + size_t count; + ssize_t ret; + uio_seg_t seg = UIO_USERSPACE; + +#ifndef HAVE_GENERIC_WRITE_CHECKS_KIOCB + struct file *file = kiocb->ki_filp; + struct address_space *mapping = file->f_mapping; + struct inode *ip = mapping->host; + int isblk = S_ISBLK(ip->i_mode); + + count = iov_iter_count(from); + ret = generic_write_checks(file, &kiocb->ki_pos, &count, isblk); + if (ret) + return (ret); +#else + /* + * XXX - ideally this check should be in the same lock region with + * write operations, so that there's no TOCTTOU race when doing + * append and someone else grow the file. 
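A note on the common helpers above: zpl_read_common()/zpl_write_common() wrap a single buffer in a one-element iovec, build a Solaris-style uio_t, and report the bytes actually moved as the requested count minus the residual (uio_resid), or a negative errno. A minimal sketch of an in-kernel caller, assuming only the helper signatures shown above (the caller itself is hypothetical):

/* Hypothetical caller, for illustration only. */
static ssize_t
example_read_into_kernel_buf(struct inode *ip, char *buf, size_t len,
    cred_t *cr)
{
	loff_t pos = 0;

	/*
	 * UIO_SYSSPACE marks 'buf' as a kernel address; UIO_USERSPACE would
	 * be used for a __user pointer.  The return value is the number of
	 * bytes read (count - uio_resid) or a negative errno.
	 */
	return (zpl_read_common(ip, buf, len, &pos, UIO_SYSSPACE, 0, cr));
}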
+ */ + ret = generic_write_checks(kiocb, from); + if (ret <= 0) + return (ret); + count = ret; +#endif + + if (from->type & ITER_KVEC) + seg = UIO_SYSSPACE; + if (from->type & ITER_BVEC) + seg = UIO_BVEC; + + ret = zpl_iter_write_common(kiocb, from->iov, from->nr_segs, + count, seg, from->iov_offset); + if (ret > 0) + iov_iter_advance(from, ret); + + return (ret); +} +#else +static ssize_t +zpl_aio_write(struct kiocb *kiocb, const struct iovec *iovp, + unsigned long nr_segs, loff_t pos) +{ + struct file *file = kiocb->ki_filp; + struct address_space *mapping = file->f_mapping; + struct inode *ip = mapping->host; + int isblk = S_ISBLK(ip->i_mode); + size_t count; + ssize_t ret; + + ret = generic_segment_checks(iovp, &nr_segs, &count, VERIFY_READ); + if (ret) + return (ret); + + ret = generic_write_checks(file, &pos, &count, isblk); + if (ret) + return (ret); + + return (zpl_iter_write_common(kiocb, iovp, nr_segs, count, + UIO_USERSPACE, 0)); +} +#endif /* HAVE_VFS_RW_ITERATE */ + +#if defined(HAVE_VFS_RW_ITERATE) +static ssize_t +zpl_direct_IO_impl(int rw, struct kiocb *kiocb, struct iov_iter *iter) +{ + if (rw == WRITE) + return (zpl_iter_write(kiocb, iter)); + else + return (zpl_iter_read(kiocb, iter)); +} +#if defined(HAVE_VFS_DIRECT_IO_ITER) +static ssize_t +zpl_direct_IO(struct kiocb *kiocb, struct iov_iter *iter) +{ + return (zpl_direct_IO_impl(iov_iter_rw(iter), kiocb, iter)); +} +#elif defined(HAVE_VFS_DIRECT_IO_ITER_OFFSET) +static ssize_t +zpl_direct_IO(struct kiocb *kiocb, struct iov_iter *iter, loff_t pos) +{ + ASSERT3S(pos, ==, kiocb->ki_pos); + return (zpl_direct_IO_impl(iov_iter_rw(iter), kiocb, iter)); +} +#elif defined(HAVE_VFS_DIRECT_IO_ITER_RW_OFFSET) +static ssize_t +zpl_direct_IO(int rw, struct kiocb *kiocb, struct iov_iter *iter, loff_t pos) +{ + ASSERT3S(pos, ==, kiocb->ki_pos); + return (zpl_direct_IO_impl(rw, kiocb, iter)); +} +#else +#error "Unknown direct IO interface" +#endif + +#else + +#if defined(HAVE_VFS_DIRECT_IO_IOVEC) +static ssize_t +zpl_direct_IO(int rw, struct kiocb *kiocb, const struct iovec *iovp, + loff_t pos, unsigned long nr_segs) +{ + if (rw == WRITE) + return (zpl_aio_write(kiocb, iovp, nr_segs, pos)); + else + return (zpl_aio_read(kiocb, iovp, nr_segs, pos)); +} +#else +#error "Unknown direct IO interface" +#endif + +#endif /* HAVE_VFS_RW_ITERATE */ + +static loff_t +zpl_llseek(struct file *filp, loff_t offset, int whence) +{ +#if defined(SEEK_HOLE) && defined(SEEK_DATA) + fstrans_cookie_t cookie; + + if (whence == SEEK_DATA || whence == SEEK_HOLE) { + struct inode *ip = filp->f_mapping->host; + loff_t maxbytes = ip->i_sb->s_maxbytes; + loff_t error; + + spl_inode_lock_shared(ip); + cookie = spl_fstrans_mark(); + error = -zfs_holey(ip, whence, &offset); + spl_fstrans_unmark(cookie); + if (error == 0) + error = lseek_execute(filp, ip, offset, maxbytes); + spl_inode_unlock_shared(ip); + + return (error); + } +#endif /* SEEK_HOLE && SEEK_DATA */ + + return (generic_file_llseek(filp, offset, whence)); +} + +/* + * It's worth taking a moment to describe how mmap is implemented + * for zfs because it differs considerably from other Linux filesystems. + * However, this issue is handled the same way under OpenSolaris. + * + * The issue is that by design zfs bypasses the Linux page cache and + * leaves all caching up to the ARC. This has been shown to work + * well for the common read(2)/write(2) case. However, mmap(2) + * is problem because it relies on being tightly integrated with the + * page cache. 
To handle this we cache mmap'ed files twice, once in + * the ARC and a second time in the page cache. The code is careful + * to keep both copies synchronized. + * + * When a file with an mmap'ed region is written to using write(2) + * both the data in the ARC and existing pages in the page cache + * are updated. For a read(2) data will be read first from the page + * cache then the ARC if needed. Neither a write(2) or read(2) will + * will ever result in new pages being added to the page cache. + * + * New pages are added to the page cache only via .readpage() which + * is called when the vfs needs to read a page off disk to back the + * virtual memory region. These pages may be modified without + * notifying the ARC and will be written out periodically via + * .writepage(). This will occur due to either a sync or the usual + * page aging behavior. Note because a read(2) of a mmap'ed file + * will always check the page cache first even when the ARC is out + * of date correct data will still be returned. + * + * While this implementation ensures correct behavior it does have + * have some drawbacks. The most obvious of which is that it + * increases the required memory footprint when access mmap'ed + * files. It also adds additional complexity to the code keeping + * both caches synchronized. + * + * Longer term it may be possible to cleanly resolve this wart by + * mapping page cache pages directly on to the ARC buffers. The + * Linux address space operations are flexible enough to allow + * selection of which pages back a particular index. The trick + * would be working out the details of which subsystem is in + * charge, the ARC, the page cache, or both. It may also prove + * helpful to move the ARC buffers to a scatter-gather lists + * rather than a vmalloc'ed region. + */ +static int +zpl_mmap(struct file *filp, struct vm_area_struct *vma) +{ + struct inode *ip = filp->f_mapping->host; + znode_t *zp = ITOZ(ip); + int error; + fstrans_cookie_t cookie; + + cookie = spl_fstrans_mark(); + error = -zfs_map(ip, vma->vm_pgoff, (caddr_t *)vma->vm_start, + (size_t)(vma->vm_end - vma->vm_start), vma->vm_flags); + spl_fstrans_unmark(cookie); + if (error) + return (error); + + error = generic_file_mmap(filp, vma); + if (error) + return (error); + + mutex_enter(&zp->z_lock); + zp->z_is_mapped = B_TRUE; + mutex_exit(&zp->z_lock); + + return (error); +} + +/* + * Populate a page with data for the Linux page cache. This function is + * only used to support mmap(2). There will be an identical copy of the + * data in the ARC which is kept up to date via .write() and .writepage(). + * + * Current this function relies on zpl_read_common() and the O_DIRECT + * flag to read in a page. This works but the more correct way is to + * update zfs_fillpage() to be Linux friendly and use that interface. + */ +static int +zpl_readpage(struct file *filp, struct page *pp) +{ + struct inode *ip; + struct page *pl[1]; + int error = 0; + fstrans_cookie_t cookie; + + ASSERT(PageLocked(pp)); + ip = pp->mapping->host; + pl[0] = pp; + + cookie = spl_fstrans_mark(); + error = -zfs_getpage(ip, pl, 1); + spl_fstrans_unmark(cookie); + + if (error) { + SetPageError(pp); + ClearPageUptodate(pp); + } else { + ClearPageError(pp); + SetPageUptodate(pp); + flush_dcache_page(pp); + } + + unlock_page(pp); + return (error); +} + +/* + * Populate a set of pages with data for the Linux page cache. This + * function will only be called for read ahead and never for demand + * paging. 
For simplicity, the code relies on read_cache_pages() to + * correctly lock each page for IO and call zpl_readpage(). + */ +static int +zpl_readpages(struct file *filp, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + return (read_cache_pages(mapping, pages, + (filler_t *)zpl_readpage, filp)); +} + +int +zpl_putpage(struct page *pp, struct writeback_control *wbc, void *data) +{ + struct address_space *mapping = data; + fstrans_cookie_t cookie; + + ASSERT(PageLocked(pp)); + ASSERT(!PageWriteback(pp)); + + cookie = spl_fstrans_mark(); + (void) zfs_putpage(mapping->host, pp, wbc); + spl_fstrans_unmark(cookie); + + return (0); +} + +static int +zpl_writepages(struct address_space *mapping, struct writeback_control *wbc) +{ + znode_t *zp = ITOZ(mapping->host); + zfsvfs_t *zfsvfs = ITOZSB(mapping->host); + enum writeback_sync_modes sync_mode; + int result; + + ZFS_ENTER(zfsvfs); + if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + wbc->sync_mode = WB_SYNC_ALL; + ZFS_EXIT(zfsvfs); + sync_mode = wbc->sync_mode; + + /* + * We don't want to run write_cache_pages() in SYNC mode here, because + * that would make putpage() wait for a single page to be committed to + * disk every single time, resulting in atrocious performance. Instead + * we run it once in non-SYNC mode so that the ZIL gets all the data, + * and then we commit it all in one go. + */ + wbc->sync_mode = WB_SYNC_NONE; + result = write_cache_pages(mapping, wbc, zpl_putpage, mapping); + if (sync_mode != wbc->sync_mode) { + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + if (zfsvfs->z_log != NULL) + zil_commit(zfsvfs->z_log, zp->z_id); + ZFS_EXIT(zfsvfs); + + /* + * We need to call write_cache_pages() again (we can't just + * return after the commit) because the previous call in + * non-SYNC mode does not guarantee that we got all the dirty + * pages (see the implementation of write_cache_pages() for + * details). That being said, this is a no-op in most cases. + */ + wbc->sync_mode = sync_mode; + result = write_cache_pages(mapping, wbc, zpl_putpage, mapping); + } + return (result); +} + +/* + * Write out dirty pages to the ARC, this function is only required to + * support mmap(2). Mapped pages may be dirtied by memory operations + * which never call .write(). These dirty pages are kept in sync with + * the ARC buffers via this hook. + */ +static int +zpl_writepage(struct page *pp, struct writeback_control *wbc) +{ + if (ITOZSB(pp->mapping->host)->z_os->os_sync == ZFS_SYNC_ALWAYS) + wbc->sync_mode = WB_SYNC_ALL; + + return (zpl_putpage(pp, wbc, pp->mapping)); +} + +/* + * The only flag combination which matches the behavior of zfs_space() + * is FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE. The FALLOC_FL_PUNCH_HOLE + * flag was introduced in the 2.6.38 kernel. 
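Before the handler itself, a minimal userspace illustration of the single request it accepts (hypothetical path, not part of this change): punching a hole while keeping the file size. Any other mode combination is answered with EOPNOTSUPP, a hole starting past EOF is a successful no-op, and the length is clamped to the current end of file.

#define _GNU_SOURCE
#include <fcntl.h>
#include <unistd.h>

static int
punch_hole_example(void)
{
	int fd = open("/tank/fs/data", O_RDWR);
	int error;

	if (fd == -1)
		return (-1);

	/* Free the 1 MiB region at offset 4 MiB without changing i_size. */
	error = fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
	    4 << 20, 1 << 20);
	(void) close(fd);

	return (error);
}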
+ */ +#if defined(HAVE_FILE_FALLOCATE) || defined(HAVE_INODE_FALLOCATE) +long +zpl_fallocate_common(struct inode *ip, int mode, loff_t offset, loff_t len) +{ + int error = -EOPNOTSUPP; + +#if defined(FALLOC_FL_PUNCH_HOLE) && defined(FALLOC_FL_KEEP_SIZE) + cred_t *cr = CRED(); + flock64_t bf; + loff_t olen; + fstrans_cookie_t cookie; + + if (mode != (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) + return (error); + + if (offset < 0 || len <= 0) + return (-EINVAL); + + spl_inode_lock(ip); + olen = i_size_read(ip); + + if (offset > olen) { + spl_inode_unlock(ip); + return (0); + } + if (offset + len > olen) + len = olen - offset; + bf.l_type = F_WRLCK; + bf.l_whence = SEEK_SET; + bf.l_start = offset; + bf.l_len = len; + bf.l_pid = 0; + + crhold(cr); + cookie = spl_fstrans_mark(); + error = -zfs_space(ip, F_FREESP, &bf, FWRITE, offset, cr); + spl_fstrans_unmark(cookie); + spl_inode_unlock(ip); + + crfree(cr); +#endif /* defined(FALLOC_FL_PUNCH_HOLE) && defined(FALLOC_FL_KEEP_SIZE) */ + + ASSERT3S(error, <=, 0); + return (error); +} +#endif /* defined(HAVE_FILE_FALLOCATE) || defined(HAVE_INODE_FALLOCATE) */ + +#ifdef HAVE_FILE_FALLOCATE +static long +zpl_fallocate(struct file *filp, int mode, loff_t offset, loff_t len) +{ + return zpl_fallocate_common(file_inode(filp), + mode, offset, len); +} +#endif /* HAVE_FILE_FALLOCATE */ + +#define ZFS_FL_USER_VISIBLE (FS_FL_USER_VISIBLE | ZFS_PROJINHERIT_FL) +#define ZFS_FL_USER_MODIFIABLE (FS_FL_USER_MODIFIABLE | ZFS_PROJINHERIT_FL) + +static uint32_t +__zpl_ioctl_getflags(struct inode *ip) +{ + uint64_t zfs_flags = ITOZ(ip)->z_pflags; + uint32_t ioctl_flags = 0; + + if (zfs_flags & ZFS_IMMUTABLE) + ioctl_flags |= FS_IMMUTABLE_FL; + + if (zfs_flags & ZFS_APPENDONLY) + ioctl_flags |= FS_APPEND_FL; + + if (zfs_flags & ZFS_NODUMP) + ioctl_flags |= FS_NODUMP_FL; + + if (zfs_flags & ZFS_PROJINHERIT) + ioctl_flags |= ZFS_PROJINHERIT_FL; + + return (ioctl_flags & ZFS_FL_USER_VISIBLE); +} + +/* + * Map zfs file z_pflags (xvattr_t) to linux file attributes. Only file + * attributes common to both Linux and Solaris are mapped. + */ +static int +zpl_ioctl_getflags(struct file *filp, void __user *arg) +{ + uint32_t flags; + int err; + + flags = __zpl_ioctl_getflags(file_inode(filp)); + err = copy_to_user(arg, &flags, sizeof (flags)); + + return (err); +} + +/* + * fchange() is a helper macro to detect if we have been asked to change a + * flag. This is ugly, but the requirement that we do this is a consequence of + * how the Linux file attribute interface was designed. Another consequence is + * that concurrent modification of files suffers from a TOCTOU race. Neither + * are things we can fix without modifying the kernel-userland interface, which + * is outside of our jurisdiction. 
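Because the expression is dense, a worked example of the test the fchange() macro below performs, with hypothetical values: it answers "is the caller toggling this particular bit?" by comparing the requested FS_* bit against the currently stored ZFS_* bit.

/*
 * Illustration only (values hypothetical).  The current flags have
 * ZFS_IMMUTABLE set and the ioctl request has FS_IMMUTABLE_FL clear,
 * i.e. the caller is asking to clear the immutable bit:
 *
 *	fchange(0, ZFS_IMMUTABLE, FS_IMMUTABLE_FL, ZFS_IMMUTABLE)
 *	  = (!(0 & FS_IMMUTABLE_FL) != !(ZFS_IMMUTABLE & ZFS_IMMUTABLE))
 *	  = (1 != 0)
 *	  = 1	-> the bit is being toggled, so CAP_LINUX_IMMUTABLE is required
 *
 * When both sides agree (bit set in both, or clear in both) the result is 0
 * and the capability check is skipped.
 */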
+ */ + +#define fchange(f0, f1, b0, b1) (!((f0) & (b0)) != !((f1) & (b1))) + +static int +__zpl_ioctl_setflags(struct inode *ip, uint32_t ioctl_flags, xvattr_t *xva) +{ + uint64_t zfs_flags = ITOZ(ip)->z_pflags; + xoptattr_t *xoap; + + if (ioctl_flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | FS_NODUMP_FL | + ZFS_PROJINHERIT_FL)) + return (-EOPNOTSUPP); + + if (ioctl_flags & ~ZFS_FL_USER_MODIFIABLE) + return (-EACCES); + + if ((fchange(ioctl_flags, zfs_flags, FS_IMMUTABLE_FL, ZFS_IMMUTABLE) || + fchange(ioctl_flags, zfs_flags, FS_APPEND_FL, ZFS_APPENDONLY)) && + !capable(CAP_LINUX_IMMUTABLE)) + return (-EACCES); + + if (!zpl_inode_owner_or_capable(ip)) + return (-EACCES); + + xva_init(xva); + xoap = xva_getxoptattr(xva); + + XVA_SET_REQ(xva, XAT_IMMUTABLE); + if (ioctl_flags & FS_IMMUTABLE_FL) + xoap->xoa_immutable = B_TRUE; + + XVA_SET_REQ(xva, XAT_APPENDONLY); + if (ioctl_flags & FS_APPEND_FL) + xoap->xoa_appendonly = B_TRUE; + + XVA_SET_REQ(xva, XAT_NODUMP); + if (ioctl_flags & FS_NODUMP_FL) + xoap->xoa_nodump = B_TRUE; + + XVA_SET_REQ(xva, XAT_PROJINHERIT); + if (ioctl_flags & ZFS_PROJINHERIT_FL) + xoap->xoa_projinherit = B_TRUE; + + return (0); +} + +static int +zpl_ioctl_setflags(struct file *filp, void __user *arg) +{ + struct inode *ip = file_inode(filp); + uint32_t flags; + cred_t *cr = CRED(); + xvattr_t xva; + int err; + fstrans_cookie_t cookie; + + if (copy_from_user(&flags, arg, sizeof (flags))) + return (-EFAULT); + + err = __zpl_ioctl_setflags(ip, flags, &xva); + if (err) + return (err); + + crhold(cr); + cookie = spl_fstrans_mark(); + err = -zfs_setattr(ip, (vattr_t *)&xva, 0, cr); + spl_fstrans_unmark(cookie); + crfree(cr); + + return (err); +} + +static int +zpl_ioctl_getxattr(struct file *filp, void __user *arg) +{ + zfsxattr_t fsx = { 0 }; + struct inode *ip = file_inode(filp); + int err; + + fsx.fsx_xflags = __zpl_ioctl_getflags(ip); + fsx.fsx_projid = ITOZ(ip)->z_projid; + err = copy_to_user(arg, &fsx, sizeof (fsx)); + + return (err); +} + +static int +zpl_ioctl_setxattr(struct file *filp, void __user *arg) +{ + struct inode *ip = file_inode(filp); + zfsxattr_t fsx; + cred_t *cr = CRED(); + xvattr_t xva; + xoptattr_t *xoap; + int err; + fstrans_cookie_t cookie; + + if (copy_from_user(&fsx, arg, sizeof (fsx))) + return (-EFAULT); + + if (!zpl_is_valid_projid(fsx.fsx_projid)) + return (-EINVAL); + + err = __zpl_ioctl_setflags(ip, fsx.fsx_xflags, &xva); + if (err) + return (err); + + xoap = xva_getxoptattr(&xva); + XVA_SET_REQ(&xva, XAT_PROJID); + xoap->xoa_projid = fsx.fsx_projid; + + crhold(cr); + cookie = spl_fstrans_mark(); + err = -zfs_setattr(ip, (vattr_t *)&xva, 0, cr); + spl_fstrans_unmark(cookie); + crfree(cr); + + return (err); +} + +static long +zpl_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + switch (cmd) { + case FS_IOC_GETFLAGS: + return (zpl_ioctl_getflags(filp, (void *)arg)); + case FS_IOC_SETFLAGS: + return (zpl_ioctl_setflags(filp, (void *)arg)); + case ZFS_IOC_FSGETXATTR: + return (zpl_ioctl_getxattr(filp, (void *)arg)); + case ZFS_IOC_FSSETXATTR: + return (zpl_ioctl_setxattr(filp, (void *)arg)); + default: + return (-ENOTTY); + } +} + +#ifdef CONFIG_COMPAT +static long +zpl_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + switch (cmd) { + case FS_IOC32_GETFLAGS: + cmd = FS_IOC_GETFLAGS; + break; + case FS_IOC32_SETFLAGS: + cmd = FS_IOC_SETFLAGS; + break; + default: + return (-ENOTTY); + } + return (zpl_ioctl(filp, cmd, (unsigned long)compat_ptr(arg))); +} +#endif /* CONFIG_COMPAT */ + + +const struct 
address_space_operations zpl_address_space_operations = { + .readpages = zpl_readpages, + .readpage = zpl_readpage, + .writepage = zpl_writepage, + .writepages = zpl_writepages, + .direct_IO = zpl_direct_IO, +}; + +const struct file_operations zpl_file_operations = { + .open = zpl_open, + .release = zpl_release, + .llseek = zpl_llseek, +#ifdef HAVE_VFS_RW_ITERATE +#ifdef HAVE_NEW_SYNC_READ + .read = new_sync_read, + .write = new_sync_write, +#endif + .read_iter = zpl_iter_read, + .write_iter = zpl_iter_write, +#else + .read = do_sync_read, + .write = do_sync_write, + .aio_read = zpl_aio_read, + .aio_write = zpl_aio_write, +#endif + .mmap = zpl_mmap, + .fsync = zpl_fsync, +#ifdef HAVE_FILE_AIO_FSYNC + .aio_fsync = zpl_aio_fsync, +#endif +#ifdef HAVE_FILE_FALLOCATE + .fallocate = zpl_fallocate, +#endif /* HAVE_FILE_FALLOCATE */ + .unlocked_ioctl = zpl_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = zpl_compat_ioctl, +#endif +}; + +const struct file_operations zpl_dir_file_operations = { + .llseek = generic_file_llseek, + .read = generic_read_dir, +#if defined(HAVE_VFS_ITERATE_SHARED) + .iterate_shared = zpl_iterate, +#elif defined(HAVE_VFS_ITERATE) + .iterate = zpl_iterate, +#else + .readdir = zpl_readdir, +#endif + .fsync = zpl_fsync, + .unlocked_ioctl = zpl_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = zpl_compat_ioctl, +#endif +}; diff --git a/module/os/linux/zfs/zpl_inode.c b/module/os/linux/zfs/zpl_inode.c new file mode 100644 index 000000000..3f3b2e2dc --- /dev/null +++ b/module/os/linux/zfs/zpl_inode.c @@ -0,0 +1,826 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2011, Lawrence Livermore National Security, LLC. + * Copyright (c) 2015 by Chunwei Chen. All rights reserved. 
+ */ + + +#include <sys/zfs_ctldir.h> +#include <sys/zfs_vfsops.h> +#include <sys/zfs_vnops.h> +#include <sys/zfs_znode.h> +#include <sys/dmu_objset.h> +#include <sys/vfs.h> +#include <sys/zpl.h> +#include <sys/file.h> + + +static struct dentry * +#ifdef HAVE_LOOKUP_NAMEIDATA +zpl_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) +#else +zpl_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) +#endif +{ + cred_t *cr = CRED(); + struct inode *ip; + int error; + fstrans_cookie_t cookie; + pathname_t *ppn = NULL; + pathname_t pn; + int zfs_flags = 0; + zfsvfs_t *zfsvfs = dentry->d_sb->s_fs_info; + + if (dlen(dentry) >= ZAP_MAXNAMELEN) + return (ERR_PTR(-ENAMETOOLONG)); + + crhold(cr); + cookie = spl_fstrans_mark(); + + /* If we are a case insensitive fs, we need the real name */ + if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) { + zfs_flags = FIGNORECASE; + pn_alloc(&pn); + ppn = &pn; + } + + error = -zfs_lookup(dir, dname(dentry), &ip, zfs_flags, cr, NULL, ppn); + spl_fstrans_unmark(cookie); + ASSERT3S(error, <=, 0); + crfree(cr); + + spin_lock(&dentry->d_lock); + dentry->d_time = jiffies; +#ifndef HAVE_S_D_OP + d_set_d_op(dentry, &zpl_dentry_operations); +#endif /* HAVE_S_D_OP */ + spin_unlock(&dentry->d_lock); + + if (error) { + /* + * If we have a case sensitive fs, we do not want to + * insert negative entries, so return NULL for ENOENT. + * Fall through if the error is not ENOENT. Also free memory. + */ + if (ppn) { + pn_free(ppn); + if (error == -ENOENT) + return (NULL); + } + + if (error == -ENOENT) + return (d_splice_alias(NULL, dentry)); + else + return (ERR_PTR(error)); + } + + /* + * If we are case insensitive, call the correct function + * to install the name. + */ + if (ppn) { + struct dentry *new_dentry; + struct qstr ci_name; + + if (strcmp(dname(dentry), pn.pn_buf) == 0) { + new_dentry = d_splice_alias(ip, dentry); + } else { + ci_name.name = pn.pn_buf; + ci_name.len = strlen(pn.pn_buf); + new_dentry = d_add_ci(dentry, ip, &ci_name); + } + pn_free(ppn); + return (new_dentry); + } else { + return (d_splice_alias(ip, dentry)); + } +} + +void +zpl_vap_init(vattr_t *vap, struct inode *dir, zpl_umode_t mode, cred_t *cr) +{ + vap->va_mask = ATTR_MODE; + vap->va_mode = mode; + vap->va_uid = crgetfsuid(cr); + + if (dir && dir->i_mode & S_ISGID) { + vap->va_gid = KGID_TO_SGID(dir->i_gid); + if (S_ISDIR(mode)) + vap->va_mode |= S_ISGID; + } else { + vap->va_gid = crgetfsgid(cr); + } +} + +static int +#ifdef HAVE_CREATE_NAMEIDATA +zpl_create(struct inode *dir, struct dentry *dentry, zpl_umode_t mode, + struct nameidata *nd) +#else +zpl_create(struct inode *dir, struct dentry *dentry, zpl_umode_t mode, + bool flag) +#endif +{ + cred_t *cr = CRED(); + struct inode *ip; + vattr_t *vap; + int error; + fstrans_cookie_t cookie; + + crhold(cr); + vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP); + zpl_vap_init(vap, dir, mode, cr); + + cookie = spl_fstrans_mark(); + error = -zfs_create(dir, dname(dentry), vap, 0, mode, &ip, cr, 0, NULL); + if (error == 0) { + d_instantiate(dentry, ip); + + error = zpl_xattr_security_init(ip, dir, &dentry->d_name); + if (error == 0) + error = zpl_init_acl(ip, dir); + + if (error) + (void) zfs_remove(dir, dname(dentry), cr, 0); + } + + spl_fstrans_unmark(cookie); + kmem_free(vap, sizeof (vattr_t)); + crfree(cr); + ASSERT3S(error, <=, 0); + + return (error); +} + +static int +zpl_mknod(struct inode *dir, struct dentry *dentry, zpl_umode_t mode, + dev_t rdev) +{ + cred_t *cr = CRED(); + struct inode *ip; + vattr_t *vap; + int error; 
+ fstrans_cookie_t cookie; + + /* + * We currently expect Linux to supply rdev=0 for all sockets + * and fifos, but we want to know if this behavior ever changes. + */ + if (S_ISSOCK(mode) || S_ISFIFO(mode)) + ASSERT(rdev == 0); + + crhold(cr); + vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP); + zpl_vap_init(vap, dir, mode, cr); + vap->va_rdev = rdev; + + cookie = spl_fstrans_mark(); + error = -zfs_create(dir, dname(dentry), vap, 0, mode, &ip, cr, 0, NULL); + if (error == 0) { + d_instantiate(dentry, ip); + + error = zpl_xattr_security_init(ip, dir, &dentry->d_name); + if (error == 0) + error = zpl_init_acl(ip, dir); + + if (error) + (void) zfs_remove(dir, dname(dentry), cr, 0); + } + + spl_fstrans_unmark(cookie); + kmem_free(vap, sizeof (vattr_t)); + crfree(cr); + ASSERT3S(error, <=, 0); + + return (error); +} + +#ifdef HAVE_TMPFILE +static int +zpl_tmpfile(struct inode *dir, struct dentry *dentry, zpl_umode_t mode) +{ + cred_t *cr = CRED(); + struct inode *ip; + vattr_t *vap; + int error; + fstrans_cookie_t cookie; + + crhold(cr); + vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP); + zpl_vap_init(vap, dir, mode, cr); + + cookie = spl_fstrans_mark(); + error = -zfs_tmpfile(dir, vap, 0, mode, &ip, cr, 0, NULL); + if (error == 0) { + /* d_tmpfile will do drop_nlink, so we should set it first */ + set_nlink(ip, 1); + d_tmpfile(dentry, ip); + + error = zpl_xattr_security_init(ip, dir, &dentry->d_name); + if (error == 0) + error = zpl_init_acl(ip, dir); + /* + * don't need to handle error here, file is already in + * unlinked set. + */ + } + + spl_fstrans_unmark(cookie); + kmem_free(vap, sizeof (vattr_t)); + crfree(cr); + ASSERT3S(error, <=, 0); + + return (error); +} +#endif + +static int +zpl_unlink(struct inode *dir, struct dentry *dentry) +{ + cred_t *cr = CRED(); + int error; + fstrans_cookie_t cookie; + zfsvfs_t *zfsvfs = dentry->d_sb->s_fs_info; + + crhold(cr); + cookie = spl_fstrans_mark(); + error = -zfs_remove(dir, dname(dentry), cr, 0); + + /* + * For a CI FS we must invalidate the dentry to prevent the + * creation of negative entries. + */ + if (error == 0 && zfsvfs->z_case == ZFS_CASE_INSENSITIVE) + d_invalidate(dentry); + + spl_fstrans_unmark(cookie); + crfree(cr); + ASSERT3S(error, <=, 0); + + return (error); +} + +static int +zpl_mkdir(struct inode *dir, struct dentry *dentry, zpl_umode_t mode) +{ + cred_t *cr = CRED(); + vattr_t *vap; + struct inode *ip; + int error; + fstrans_cookie_t cookie; + + crhold(cr); + vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP); + zpl_vap_init(vap, dir, mode | S_IFDIR, cr); + + cookie = spl_fstrans_mark(); + error = -zfs_mkdir(dir, dname(dentry), vap, &ip, cr, 0, NULL); + if (error == 0) { + d_instantiate(dentry, ip); + + error = zpl_xattr_security_init(ip, dir, &dentry->d_name); + if (error == 0) + error = zpl_init_acl(ip, dir); + + if (error) + (void) zfs_rmdir(dir, dname(dentry), NULL, cr, 0); + } + + spl_fstrans_unmark(cookie); + kmem_free(vap, sizeof (vattr_t)); + crfree(cr); + ASSERT3S(error, <=, 0); + + return (error); +} + +static int +zpl_rmdir(struct inode *dir, struct dentry *dentry) +{ + cred_t *cr = CRED(); + int error; + fstrans_cookie_t cookie; + zfsvfs_t *zfsvfs = dentry->d_sb->s_fs_info; + + crhold(cr); + cookie = spl_fstrans_mark(); + error = -zfs_rmdir(dir, dname(dentry), NULL, cr, 0); + + /* + * For a CI FS we must invalidate the dentry to prevent the + * creation of negative entries. 
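To make the comment above concrete, this is the situation the d_invalidate() call guards against on a case-insensitive dataset: a stale negative dentry left behind by the remove would shadow a later name that only matches under case folding. A short sketch, with hypothetical paths (illustration only, not part of this change):

#include <fcntl.h>
#include <sys/stat.h>
#include <unistd.h>

static int
ci_lookup_example(void)
{
	/* Dataset assumed to be created with casesensitivity=insensitive. */
	(void) rmdir("/tank/ci/Docs");		/* could leave negative "Docs" */
	(void) mkdir("/tank/ci/docs", 0755);	/* same name under case folding */

	/*
	 * This open must resolve to "docs"; a cached negative "Docs" entry
	 * would wrongly report ENOENT, which is why the entry is dropped.
	 */
	return (open("/tank/ci/Docs", O_RDONLY | O_DIRECTORY));
}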
+ */ + if (error == 0 && zfsvfs->z_case == ZFS_CASE_INSENSITIVE) + d_invalidate(dentry); + + spl_fstrans_unmark(cookie); + crfree(cr); + ASSERT3S(error, <=, 0); + + return (error); +} + +static int +zpl_getattr_impl(const struct path *path, struct kstat *stat, u32 request_mask, + unsigned int query_flags) +{ + int error; + fstrans_cookie_t cookie; + + cookie = spl_fstrans_mark(); + + /* + * XXX request_mask and query_flags currently ignored. + */ + + error = -zfs_getattr_fast(path->dentry->d_inode, stat); + spl_fstrans_unmark(cookie); + ASSERT3S(error, <=, 0); + + return (error); +} +ZPL_GETATTR_WRAPPER(zpl_getattr); + +static int +zpl_setattr(struct dentry *dentry, struct iattr *ia) +{ + struct inode *ip = dentry->d_inode; + cred_t *cr = CRED(); + vattr_t *vap; + int error; + fstrans_cookie_t cookie; + + error = setattr_prepare(dentry, ia); + if (error) + return (error); + + crhold(cr); + vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP); + vap->va_mask = ia->ia_valid & ATTR_IATTR_MASK; + vap->va_mode = ia->ia_mode; + vap->va_uid = KUID_TO_SUID(ia->ia_uid); + vap->va_gid = KGID_TO_SGID(ia->ia_gid); + vap->va_size = ia->ia_size; + vap->va_atime = ia->ia_atime; + vap->va_mtime = ia->ia_mtime; + vap->va_ctime = ia->ia_ctime; + + if (vap->va_mask & ATTR_ATIME) { + ip->i_atime = zpl_inode_timespec_trunc(ia->ia_atime, + ip->i_sb->s_time_gran); + } + + cookie = spl_fstrans_mark(); + error = -zfs_setattr(ip, vap, 0, cr); + if (!error && (ia->ia_valid & ATTR_MODE)) + error = zpl_chmod_acl(ip); + + spl_fstrans_unmark(cookie); + kmem_free(vap, sizeof (vattr_t)); + crfree(cr); + ASSERT3S(error, <=, 0); + + return (error); +} + +static int +zpl_rename2(struct inode *sdip, struct dentry *sdentry, + struct inode *tdip, struct dentry *tdentry, unsigned int flags) +{ + cred_t *cr = CRED(); + int error; + fstrans_cookie_t cookie; + + /* We don't have renameat2(2) support */ + if (flags) + return (-EINVAL); + + crhold(cr); + cookie = spl_fstrans_mark(); + error = -zfs_rename(sdip, dname(sdentry), tdip, dname(tdentry), cr, 0); + spl_fstrans_unmark(cookie); + crfree(cr); + ASSERT3S(error, <=, 0); + + return (error); +} + +#ifndef HAVE_RENAME_WANTS_FLAGS +static int +zpl_rename(struct inode *sdip, struct dentry *sdentry, + struct inode *tdip, struct dentry *tdentry) +{ + return (zpl_rename2(sdip, sdentry, tdip, tdentry, 0)); +} +#endif + +static int +zpl_symlink(struct inode *dir, struct dentry *dentry, const char *name) +{ + cred_t *cr = CRED(); + vattr_t *vap; + struct inode *ip; + int error; + fstrans_cookie_t cookie; + + crhold(cr); + vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP); + zpl_vap_init(vap, dir, S_IFLNK | S_IRWXUGO, cr); + + cookie = spl_fstrans_mark(); + error = -zfs_symlink(dir, dname(dentry), vap, (char *)name, &ip, cr, 0); + if (error == 0) { + d_instantiate(dentry, ip); + + error = zpl_xattr_security_init(ip, dir, &dentry->d_name); + if (error) + (void) zfs_remove(dir, dname(dentry), cr, 0); + } + + spl_fstrans_unmark(cookie); + kmem_free(vap, sizeof (vattr_t)); + crfree(cr); + ASSERT3S(error, <=, 0); + + return (error); +} + +#if defined(HAVE_PUT_LINK_COOKIE) +static void +zpl_put_link(struct inode *unused, void *cookie) +{ + kmem_free(cookie, MAXPATHLEN); +} +#elif defined(HAVE_PUT_LINK_NAMEIDATA) +static void +zpl_put_link(struct dentry *dentry, struct nameidata *nd, void *ptr) +{ + const char *link = nd_get_link(nd); + + if (!IS_ERR(link)) + kmem_free(link, MAXPATHLEN); +} +#elif defined(HAVE_PUT_LINK_DELAYED) +static void +zpl_put_link(void *ptr) +{ + kmem_free(ptr, MAXPATHLEN); +} +#endif 
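One user-visible consequence of zpl_rename2() above rejecting every flag: the flag-bearing renameat2(2) variants fail with EINVAL on ZFS, while a plain rename works. A minimal userspace sketch (hypothetical paths, not part of this change):

#define _GNU_SOURCE
#include <stdio.h>
#include <fcntl.h>

static void
rename_example(void)
{
	/* flags == 0: reaches zfs_rename() via zpl_rename()/zpl_rename2(). */
	(void) renameat2(AT_FDCWD, "/tank/fs/a", AT_FDCWD, "/tank/fs/b", 0);

	/* Any flag, e.g. RENAME_NOREPLACE, is refused with EINVAL. */
	if (renameat2(AT_FDCWD, "/tank/fs/b", AT_FDCWD, "/tank/fs/c",
	    RENAME_NOREPLACE) == -1)
		perror("renameat2");
}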
+ +static int +zpl_get_link_common(struct dentry *dentry, struct inode *ip, char **link) +{ + fstrans_cookie_t cookie; + cred_t *cr = CRED(); + struct iovec iov; + uio_t uio = { { 0 }, 0 }; + int error; + + crhold(cr); + *link = NULL; + iov.iov_len = MAXPATHLEN; + iov.iov_base = kmem_zalloc(MAXPATHLEN, KM_SLEEP); + + uio.uio_iov = &iov; + uio.uio_iovcnt = 1; + uio.uio_segflg = UIO_SYSSPACE; + uio.uio_resid = (MAXPATHLEN - 1); + + cookie = spl_fstrans_mark(); + error = -zfs_readlink(ip, &uio, cr); + spl_fstrans_unmark(cookie); + crfree(cr); + + if (error) + kmem_free(iov.iov_base, MAXPATHLEN); + else + *link = iov.iov_base; + + return (error); +} + +#if defined(HAVE_GET_LINK_DELAYED) +const char * +zpl_get_link(struct dentry *dentry, struct inode *inode, + struct delayed_call *done) +{ + char *link = NULL; + int error; + + if (!dentry) + return (ERR_PTR(-ECHILD)); + + error = zpl_get_link_common(dentry, inode, &link); + if (error) + return (ERR_PTR(error)); + + set_delayed_call(done, zpl_put_link, link); + + return (link); +} +#elif defined(HAVE_GET_LINK_COOKIE) +const char * +zpl_get_link(struct dentry *dentry, struct inode *inode, void **cookie) +{ + char *link = NULL; + int error; + + if (!dentry) + return (ERR_PTR(-ECHILD)); + + error = zpl_get_link_common(dentry, inode, &link); + if (error) + return (ERR_PTR(error)); + + return (*cookie = link); +} +#elif defined(HAVE_FOLLOW_LINK_COOKIE) +const char * +zpl_follow_link(struct dentry *dentry, void **cookie) +{ + char *link = NULL; + int error; + + error = zpl_get_link_common(dentry, dentry->d_inode, &link); + if (error) + return (ERR_PTR(error)); + + return (*cookie = link); +} +#elif defined(HAVE_FOLLOW_LINK_NAMEIDATA) +static void * +zpl_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + char *link = NULL; + int error; + + error = zpl_get_link_common(dentry, dentry->d_inode, &link); + if (error) + nd_set_link(nd, ERR_PTR(error)); + else + nd_set_link(nd, link); + + return (NULL); +} +#endif + +static int +zpl_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) +{ + cred_t *cr = CRED(); + struct inode *ip = old_dentry->d_inode; + int error; + fstrans_cookie_t cookie; + + if (ip->i_nlink >= ZFS_LINK_MAX) + return (-EMLINK); + + crhold(cr); + ip->i_ctime = current_time(ip); + igrab(ip); /* Use ihold() if available */ + + cookie = spl_fstrans_mark(); + error = -zfs_link(dir, ip, dname(dentry), cr, 0); + if (error) { + iput(ip); + goto out; + } + + d_instantiate(dentry, ip); +out: + spl_fstrans_unmark(cookie); + crfree(cr); + ASSERT3S(error, <=, 0); + + return (error); +} + +#ifdef HAVE_INODE_TRUNCATE_RANGE +static void +zpl_truncate_range(struct inode *ip, loff_t start, loff_t end) +{ + cred_t *cr = CRED(); + flock64_t bf; + fstrans_cookie_t cookie; + + ASSERT3S(start, <=, end); + + /* + * zfs_freesp() will interpret (len == 0) as meaning "truncate until + * the end of the file". We don't want that. 
+ */ + if (start == end) + return; + + crhold(cr); + + bf.l_type = F_WRLCK; + bf.l_whence = SEEK_SET; + bf.l_start = start; + bf.l_len = end - start; + bf.l_pid = 0; + cookie = spl_fstrans_mark(); + zfs_space(ip, F_FREESP, &bf, FWRITE, start, cr); + spl_fstrans_unmark(cookie); + + crfree(cr); +} +#endif /* HAVE_INODE_TRUNCATE_RANGE */ + +#ifdef HAVE_INODE_FALLOCATE +static long +zpl_fallocate(struct inode *ip, int mode, loff_t offset, loff_t len) +{ + return (zpl_fallocate_common(ip, mode, offset, len)); +} +#endif /* HAVE_INODE_FALLOCATE */ + +static int +#ifdef HAVE_D_REVALIDATE_NAMEIDATA +zpl_revalidate(struct dentry *dentry, struct nameidata *nd) +{ + unsigned int flags = (nd ? nd->flags : 0); +#else +zpl_revalidate(struct dentry *dentry, unsigned int flags) +{ +#endif /* HAVE_D_REVALIDATE_NAMEIDATA */ + /* CSTYLED */ + zfsvfs_t *zfsvfs = dentry->d_sb->s_fs_info; + int error; + + if (flags & LOOKUP_RCU) + return (-ECHILD); + + /* + * Automounted snapshots rely on periodic dentry revalidation + * to defer snapshots from being automatically unmounted. + */ + if (zfsvfs->z_issnap) { + if (time_after(jiffies, zfsvfs->z_snap_defer_time + + MAX(zfs_expire_snapshot * HZ / 2, HZ))) { + zfsvfs->z_snap_defer_time = jiffies; + zfsctl_snapshot_unmount_delay(zfsvfs->z_os->os_spa, + dmu_objset_id(zfsvfs->z_os), zfs_expire_snapshot); + } + } + + /* + * After a rollback negative dentries created before the rollback + * time must be invalidated. Otherwise they can obscure files which + * are only present in the rolled back dataset. + */ + if (dentry->d_inode == NULL) { + spin_lock(&dentry->d_lock); + error = time_before(dentry->d_time, zfsvfs->z_rollback_time); + spin_unlock(&dentry->d_lock); + + if (error) + return (0); + } + + /* + * The dentry may reference a stale inode if a mounted file system + * was rolled back to a point in time where the object didn't exist. 
+ */ + if (dentry->d_inode && ITOZ(dentry->d_inode)->z_is_stale) + return (0); + + return (1); +} + +const struct inode_operations zpl_inode_operations = { + .setattr = zpl_setattr, + .getattr = zpl_getattr, +#ifdef HAVE_GENERIC_SETXATTR + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .removexattr = generic_removexattr, +#endif + .listxattr = zpl_xattr_list, +#ifdef HAVE_INODE_TRUNCATE_RANGE + .truncate_range = zpl_truncate_range, +#endif /* HAVE_INODE_TRUNCATE_RANGE */ +#ifdef HAVE_INODE_FALLOCATE + .fallocate = zpl_fallocate, +#endif /* HAVE_INODE_FALLOCATE */ +#if defined(CONFIG_FS_POSIX_ACL) +#if defined(HAVE_SET_ACL) + .set_acl = zpl_set_acl, +#endif +#if defined(HAVE_GET_ACL) + .get_acl = zpl_get_acl, +#elif defined(HAVE_CHECK_ACL) + .check_acl = zpl_check_acl, +#elif defined(HAVE_PERMISSION) + .permission = zpl_permission, +#endif /* HAVE_GET_ACL | HAVE_CHECK_ACL | HAVE_PERMISSION */ +#endif /* CONFIG_FS_POSIX_ACL */ +}; + +const struct inode_operations zpl_dir_inode_operations = { + .create = zpl_create, + .lookup = zpl_lookup, + .link = zpl_link, + .unlink = zpl_unlink, + .symlink = zpl_symlink, + .mkdir = zpl_mkdir, + .rmdir = zpl_rmdir, + .mknod = zpl_mknod, +#ifdef HAVE_RENAME_WANTS_FLAGS + .rename = zpl_rename2, +#else + .rename = zpl_rename, +#endif +#ifdef HAVE_TMPFILE + .tmpfile = zpl_tmpfile, +#endif + .setattr = zpl_setattr, + .getattr = zpl_getattr, +#ifdef HAVE_GENERIC_SETXATTR + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .removexattr = generic_removexattr, +#endif + .listxattr = zpl_xattr_list, +#if defined(CONFIG_FS_POSIX_ACL) +#if defined(HAVE_SET_ACL) + .set_acl = zpl_set_acl, +#endif +#if defined(HAVE_GET_ACL) + .get_acl = zpl_get_acl, +#elif defined(HAVE_CHECK_ACL) + .check_acl = zpl_check_acl, +#elif defined(HAVE_PERMISSION) + .permission = zpl_permission, +#endif /* HAVE_GET_ACL | HAVE_CHECK_ACL | HAVE_PERMISSION */ +#endif /* CONFIG_FS_POSIX_ACL */ +}; + +const struct inode_operations zpl_symlink_inode_operations = { +#ifdef HAVE_GENERIC_READLINK + .readlink = generic_readlink, +#endif +#if defined(HAVE_GET_LINK_DELAYED) || defined(HAVE_GET_LINK_COOKIE) + .get_link = zpl_get_link, +#elif defined(HAVE_FOLLOW_LINK_COOKIE) || defined(HAVE_FOLLOW_LINK_NAMEIDATA) + .follow_link = zpl_follow_link, +#endif +#if defined(HAVE_PUT_LINK_COOKIE) || defined(HAVE_PUT_LINK_NAMEIDATA) + .put_link = zpl_put_link, +#endif + .setattr = zpl_setattr, + .getattr = zpl_getattr, +#ifdef HAVE_GENERIC_SETXATTR + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .removexattr = generic_removexattr, +#endif + .listxattr = zpl_xattr_list, +}; + +const struct inode_operations zpl_special_inode_operations = { + .setattr = zpl_setattr, + .getattr = zpl_getattr, +#ifdef HAVE_GENERIC_SETXATTR + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .removexattr = generic_removexattr, +#endif + .listxattr = zpl_xattr_list, +#if defined(CONFIG_FS_POSIX_ACL) +#if defined(HAVE_SET_ACL) + .set_acl = zpl_set_acl, +#endif +#if defined(HAVE_GET_ACL) + .get_acl = zpl_get_acl, +#elif defined(HAVE_CHECK_ACL) + .check_acl = zpl_check_acl, +#elif defined(HAVE_PERMISSION) + .permission = zpl_permission, +#endif /* HAVE_GET_ACL | HAVE_CHECK_ACL | HAVE_PERMISSION */ +#endif /* CONFIG_FS_POSIX_ACL */ +}; + +dentry_operations_t zpl_dentry_operations = { + .d_revalidate = zpl_revalidate, +}; diff --git a/module/os/linux/zfs/zpl_super.c b/module/os/linux/zfs/zpl_super.c new file mode 100644 index 000000000..810ab2898 --- /dev/null +++ 
b/module/os/linux/zfs/zpl_super.c @@ -0,0 +1,426 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2011, Lawrence Livermore National Security, LLC. + */ + + +#include <sys/zfs_vfsops.h> +#include <sys/zfs_vnops.h> +#include <sys/zfs_znode.h> +#include <sys/zfs_ctldir.h> +#include <sys/zpl.h> + + +static struct inode * +zpl_inode_alloc(struct super_block *sb) +{ + struct inode *ip; + + VERIFY3S(zfs_inode_alloc(sb, &ip), ==, 0); + inode_set_iversion(ip, 1); + + return (ip); +} + +static void +zpl_inode_destroy(struct inode *ip) +{ + ASSERT(atomic_read(&ip->i_count) == 0); + zfs_inode_destroy(ip); +} + +/* + * Called from __mark_inode_dirty() to reflect that something in the + * inode has changed. We use it to ensure the znode system attributes + * are always strictly update to date with respect to the inode. + */ +#ifdef HAVE_DIRTY_INODE_WITH_FLAGS +static void +zpl_dirty_inode(struct inode *ip, int flags) +{ + fstrans_cookie_t cookie; + + cookie = spl_fstrans_mark(); + zfs_dirty_inode(ip, flags); + spl_fstrans_unmark(cookie); +} +#else +static void +zpl_dirty_inode(struct inode *ip) +{ + fstrans_cookie_t cookie; + + cookie = spl_fstrans_mark(); + zfs_dirty_inode(ip, 0); + spl_fstrans_unmark(cookie); +} +#endif /* HAVE_DIRTY_INODE_WITH_FLAGS */ + +/* + * When ->drop_inode() is called its return value indicates if the + * inode should be evicted from the inode cache. If the inode is + * unhashed and has no links the default policy is to evict it + * immediately. + * + * Prior to 2.6.36 this eviction was accomplished by the vfs calling + * ->delete_inode(). It was ->delete_inode()'s responsibility to + * truncate the inode pages and call clear_inode(). The call to + * clear_inode() synchronously invalidates all the buffers and + * calls ->clear_inode(). It was ->clear_inode()'s responsibility + * to cleanup and filesystem specific data before freeing the inode. + * + * This elaborate mechanism was replaced by ->evict_inode() which + * does the job of both ->delete_inode() and ->clear_inode(). It + * will be called exactly once, and when it returns the inode must + * be in a state where it can simply be freed.i + * + * The ->evict_inode() callback must minimally truncate the inode pages, + * and call clear_inode(). For 2.6.35 and later kernels this will + * simply update the inode state, with the sync occurring before the + * truncate in evict(). For earlier kernels clear_inode() maps to + * end_writeback() which is responsible for completing all outstanding + * write back. In either case, once this is done it is safe to cleanup + * any remaining inode specific data via zfs_inactive(). + * remaining filesystem specific data. 
+ */ +#ifdef HAVE_EVICT_INODE +static void +zpl_evict_inode(struct inode *ip) +{ + fstrans_cookie_t cookie; + + cookie = spl_fstrans_mark(); + truncate_setsize(ip, 0); + clear_inode(ip); + zfs_inactive(ip); + spl_fstrans_unmark(cookie); +} + +#else + +static void +zpl_drop_inode(struct inode *ip) +{ + generic_delete_inode(ip); +} + +static void +zpl_clear_inode(struct inode *ip) +{ + fstrans_cookie_t cookie; + + cookie = spl_fstrans_mark(); + zfs_inactive(ip); + spl_fstrans_unmark(cookie); +} + +static void +zpl_inode_delete(struct inode *ip) +{ + truncate_setsize(ip, 0); + clear_inode(ip); +} +#endif /* HAVE_EVICT_INODE */ + +static void +zpl_put_super(struct super_block *sb) +{ + fstrans_cookie_t cookie; + int error; + + cookie = spl_fstrans_mark(); + error = -zfs_umount(sb); + spl_fstrans_unmark(cookie); + ASSERT3S(error, <=, 0); +} + +static int +zpl_sync_fs(struct super_block *sb, int wait) +{ + fstrans_cookie_t cookie; + cred_t *cr = CRED(); + int error; + + crhold(cr); + cookie = spl_fstrans_mark(); + error = -zfs_sync(sb, wait, cr); + spl_fstrans_unmark(cookie); + crfree(cr); + ASSERT3S(error, <=, 0); + + return (error); +} + +static int +zpl_statfs(struct dentry *dentry, struct kstatfs *statp) +{ + fstrans_cookie_t cookie; + int error; + + cookie = spl_fstrans_mark(); + error = -zfs_statvfs(dentry, statp); + spl_fstrans_unmark(cookie); + ASSERT3S(error, <=, 0); + + /* + * If required by a 32-bit system call, dynamically scale the + * block size up to 16MiB and decrease the block counts. This + * allows for a maximum size of 64EiB to be reported. The file + * counts must be artificially capped at 2^32-1. + */ + if (unlikely(zpl_is_32bit_api())) { + while (statp->f_blocks > UINT32_MAX && + statp->f_bsize < SPA_MAXBLOCKSIZE) { + statp->f_frsize <<= 1; + statp->f_bsize <<= 1; + + statp->f_blocks >>= 1; + statp->f_bfree >>= 1; + statp->f_bavail >>= 1; + } + + uint64_t usedobjs = statp->f_files - statp->f_ffree; + statp->f_ffree = MIN(statp->f_ffree, UINT32_MAX - usedobjs); + statp->f_files = statp->f_ffree + usedobjs; + } + + return (error); +} + +static int +zpl_remount_fs(struct super_block *sb, int *flags, char *data) +{ + zfs_mnt_t zm = { .mnt_osname = NULL, .mnt_data = data }; + fstrans_cookie_t cookie; + int error; + + cookie = spl_fstrans_mark(); + error = -zfs_remount(sb, flags, &zm); + spl_fstrans_unmark(cookie); + ASSERT3S(error, <=, 0); + + return (error); +} + +static int +__zpl_show_options(struct seq_file *seq, zfsvfs_t *zfsvfs) +{ + seq_printf(seq, ",%s", + zfsvfs->z_flags & ZSB_XATTR ? 
"xattr" : "noxattr"); + +#ifdef CONFIG_FS_POSIX_ACL + switch (zfsvfs->z_acl_type) { + case ZFS_ACLTYPE_POSIXACL: + seq_puts(seq, ",posixacl"); + break; + default: + seq_puts(seq, ",noacl"); + break; + } +#endif /* CONFIG_FS_POSIX_ACL */ + + return (0); +} + +#ifdef HAVE_SHOW_OPTIONS_WITH_DENTRY +static int +zpl_show_options(struct seq_file *seq, struct dentry *root) +{ + return (__zpl_show_options(seq, root->d_sb->s_fs_info)); +} +#else +static int +zpl_show_options(struct seq_file *seq, struct vfsmount *vfsp) +{ + return (__zpl_show_options(seq, vfsp->mnt_sb->s_fs_info)); +} +#endif /* HAVE_SHOW_OPTIONS_WITH_DENTRY */ + +static int +zpl_fill_super(struct super_block *sb, void *data, int silent) +{ + zfs_mnt_t *zm = (zfs_mnt_t *)data; + fstrans_cookie_t cookie; + int error; + + cookie = spl_fstrans_mark(); + error = -zfs_domount(sb, zm, silent); + spl_fstrans_unmark(cookie); + ASSERT3S(error, <=, 0); + + return (error); +} + +static int +zpl_test_super(struct super_block *s, void *data) +{ + zfsvfs_t *zfsvfs = s->s_fs_info; + objset_t *os = data; + + if (zfsvfs == NULL) + return (0); + + return (os == zfsvfs->z_os); +} + +static struct super_block * +zpl_mount_impl(struct file_system_type *fs_type, int flags, zfs_mnt_t *zm) +{ + struct super_block *s; + objset_t *os; + int err; + + err = dmu_objset_hold(zm->mnt_osname, FTAG, &os); + if (err) + return (ERR_PTR(-err)); + + /* + * The dsl pool lock must be released prior to calling sget(). + * It is possible sget() may block on the lock in grab_super() + * while deactivate_super() holds that same lock and waits for + * a txg sync. If the dsl_pool lock is held over sget() + * this can prevent the pool sync and cause a deadlock. + */ + dsl_pool_rele(dmu_objset_pool(os), FTAG); + s = zpl_sget(fs_type, zpl_test_super, set_anon_super, flags, os); + dsl_dataset_rele(dmu_objset_ds(os), FTAG); + + if (IS_ERR(s)) + return (ERR_CAST(s)); + + if (s->s_root == NULL) { + err = zpl_fill_super(s, zm, flags & SB_SILENT ? 
1 : 0); + if (err) { + deactivate_locked_super(s); + return (ERR_PTR(err)); + } + s->s_flags |= SB_ACTIVE; + } else if ((flags ^ s->s_flags) & SB_RDONLY) { + deactivate_locked_super(s); + return (ERR_PTR(-EBUSY)); + } + + return (s); +} + +#ifdef HAVE_FST_MOUNT +static struct dentry * +zpl_mount(struct file_system_type *fs_type, int flags, + const char *osname, void *data) +{ + zfs_mnt_t zm = { .mnt_osname = osname, .mnt_data = data }; + + struct super_block *sb = zpl_mount_impl(fs_type, flags, &zm); + if (IS_ERR(sb)) + return (ERR_CAST(sb)); + + return (dget(sb->s_root)); +} +#else +static int +zpl_get_sb(struct file_system_type *fs_type, int flags, + const char *osname, void *data, struct vfsmount *mnt) +{ + zfs_mnt_t zm = { .mnt_osname = osname, .mnt_data = data }; + + struct super_block *sb = zpl_mount_impl(fs_type, flags, &zm); + if (IS_ERR(sb)) + return (PTR_ERR(sb)); + + (void) simple_set_mnt(mnt, sb); + + return (0); +} +#endif /* HAVE_FST_MOUNT */ + +static void +zpl_kill_sb(struct super_block *sb) +{ + zfs_preumount(sb); + kill_anon_super(sb); + +#ifdef HAVE_S_INSTANCES_LIST_HEAD + sb->s_instances.next = &(zpl_fs_type.fs_supers); +#endif /* HAVE_S_INSTANCES_LIST_HEAD */ +} + +void +zpl_prune_sb(int64_t nr_to_scan, void *arg) +{ + struct super_block *sb = (struct super_block *)arg; + int objects = 0; + + (void) -zfs_prune(sb, nr_to_scan, &objects); +} + +#ifdef HAVE_NR_CACHED_OBJECTS +static int +zpl_nr_cached_objects(struct super_block *sb) +{ + return (0); +} +#endif /* HAVE_NR_CACHED_OBJECTS */ + +#ifdef HAVE_FREE_CACHED_OBJECTS +static void +zpl_free_cached_objects(struct super_block *sb, int nr_to_scan) +{ + /* noop */ +} +#endif /* HAVE_FREE_CACHED_OBJECTS */ + +const struct super_operations zpl_super_operations = { + .alloc_inode = zpl_inode_alloc, + .destroy_inode = zpl_inode_destroy, + .dirty_inode = zpl_dirty_inode, + .write_inode = NULL, +#ifdef HAVE_EVICT_INODE + .evict_inode = zpl_evict_inode, +#else + .drop_inode = zpl_drop_inode, + .clear_inode = zpl_clear_inode, + .delete_inode = zpl_inode_delete, +#endif /* HAVE_EVICT_INODE */ + .put_super = zpl_put_super, + .sync_fs = zpl_sync_fs, + .statfs = zpl_statfs, + .remount_fs = zpl_remount_fs, + .show_options = zpl_show_options, + .show_stats = NULL, +#ifdef HAVE_NR_CACHED_OBJECTS + .nr_cached_objects = zpl_nr_cached_objects, +#endif /* HAVE_NR_CACHED_OBJECTS */ +#ifdef HAVE_FREE_CACHED_OBJECTS + .free_cached_objects = zpl_free_cached_objects, +#endif /* HAVE_FREE_CACHED_OBJECTS */ +}; + +struct file_system_type zpl_fs_type = { + .owner = THIS_MODULE, + .name = ZFS_DRIVER, +#ifdef HAVE_FST_MOUNT + .mount = zpl_mount, +#else + .get_sb = zpl_get_sb, +#endif /* HAVE_FST_MOUNT */ + .kill_sb = zpl_kill_sb, +}; diff --git a/module/os/linux/zfs/zpl_xattr.c b/module/os/linux/zfs/zpl_xattr.c new file mode 100644 index 000000000..95523f28e --- /dev/null +++ b/module/os/linux/zfs/zpl_xattr.c @@ -0,0 +1,1548 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2011, Lawrence Livermore National Security, LLC. + * + * Extended attributes (xattr) on Solaris are implemented as files + * which exist in a hidden xattr directory. These extended attributes + * can be accessed using the attropen() system call, which opens + * the extended attribute. It can then be manipulated just like + * a standard file descriptor. This has a couple of advantages, such + * as practically no size limit on the file, and the extended + * attribute's permissions may differ from those of the parent file. + * This interface is really quite clever, but it's also completely + * different from what is supported on Linux. It also comes with a + * steep performance penalty when accessing small xattrs because they + * are not stored with the parent file. + * + * Under Linux extended attributes are manipulated by the system + * calls getxattr(2), setxattr(2), and listxattr(2). They consider + * extended attributes to be name/value pairs where the name is a + * NULL-terminated string. The name must also include one of the + * following namespace prefixes: + * + * user - No restrictions and is available to user applications. + * trusted - Restricted to kernel and root (CAP_SYS_ADMIN) use. + * system - Used for access control lists (system.nfs4_acl, etc.). + * security - Used by SELinux to store a file's security context. + * + * The value under Linux is limited to 65536 bytes of binary data. + * In practice, individual xattrs tend to be much smaller than this + * and are typically less than 100 bytes. A good example of this + * is the security.selinux xattr, which is less than 100 bytes and + * exists for every file when xattr labeling is enabled. + * + * The Linux xattr implementation has been written to take advantage of + * this typical usage. When the dataset property 'xattr=sa' is set, + * then xattrs will be preferentially stored as System Attributes (SA). + * This allows tiny xattrs (~100 bytes) to be stored with the dnode and + * up to 64k of xattrs to be stored in the spill block. If additional + * xattr space is required, which is unlikely under Linux, they will + * be stored using the traditional directory approach. + * + * This optimization results in roughly a 3x performance improvement + * when accessing xattrs because it avoids the need to perform a seek + * for every xattr value. When multiple xattrs are stored per file, + * the performance improvements are even greater because all of the + * xattrs stored in the spill block will be cached. + * + * However, by default SA-based xattrs are disabled in the Linux port + * to maximize compatibility with other implementations. If you do + * enable SA-based xattrs then they will not be visible on platforms + * which do not support this feature. + * + * NOTE: One additional consequence of the xattr directory implementation + * is that when an extended attribute is manipulated an inode is created. + * This inode will exist in the Linux inode cache but there will be no + * associated entry in the dentry cache which references it. This is + * safe but it may result in some confusion. Enabling SA-based xattrs + * largely avoids the issue except in the overflow case. 
+ */ + +#include <sys/zfs_vfsops.h> +#include <sys/zfs_vnops.h> +#include <sys/zfs_znode.h> +#include <sys/zap.h> +#include <sys/vfs.h> +#include <sys/zpl.h> + +typedef struct xattr_filldir { + size_t size; + size_t offset; + char *buf; + struct dentry *dentry; +} xattr_filldir_t; + +static const struct xattr_handler *zpl_xattr_handler(const char *); + +static int +zpl_xattr_permission(xattr_filldir_t *xf, const char *name, int name_len) +{ + static const struct xattr_handler *handler; + struct dentry *d = xf->dentry; + + handler = zpl_xattr_handler(name); + if (!handler) + return (0); + + if (handler->list) { +#if defined(HAVE_XATTR_LIST_SIMPLE) + if (!handler->list(d)) + return (0); +#elif defined(HAVE_XATTR_LIST_DENTRY) + if (!handler->list(d, NULL, 0, name, name_len, 0)) + return (0); +#elif defined(HAVE_XATTR_LIST_HANDLER) + if (!handler->list(handler, d, NULL, 0, name, name_len)) + return (0); +#elif defined(HAVE_XATTR_LIST_INODE) + if (!handler->list(d->d_inode, NULL, 0, name, name_len)) + return (0); +#endif + } + + return (1); +} + +/* + * Determine if a given xattr name should be visible and, if so, copy it + * into the provided buffer (xf->buf). + */ +static int +zpl_xattr_filldir(xattr_filldir_t *xf, const char *name, int name_len) +{ + /* Check permissions using the per-namespace list xattr handler. */ + if (!zpl_xattr_permission(xf, name, name_len)) + return (0); + + /* When xf->buf is NULL only calculate the required size. */ + if (xf->buf) { + if (xf->offset + name_len + 1 > xf->size) + return (-ERANGE); + + memcpy(xf->buf + xf->offset, name, name_len); + xf->buf[xf->offset + name_len] = '\0'; + } + + xf->offset += (name_len + 1); + + return (0); +} + +/* + * Read as many directory entry names as will fit into the provided buffer, + * or when no buffer is provided, calculate the required buffer size. 
+ */ +int +zpl_xattr_readdir(struct inode *dxip, xattr_filldir_t *xf) +{ + zap_cursor_t zc; + zap_attribute_t zap; + int error; + + zap_cursor_init(&zc, ITOZSB(dxip)->z_os, ITOZ(dxip)->z_id); + + while ((error = -zap_cursor_retrieve(&zc, &zap)) == 0) { + + if (zap.za_integer_length != 8 || zap.za_num_integers != 1) { + error = -ENXIO; + break; + } + + error = zpl_xattr_filldir(xf, zap.za_name, strlen(zap.za_name)); + if (error) + break; + + zap_cursor_advance(&zc); + } + + zap_cursor_fini(&zc); + + if (error == -ENOENT) + error = 0; + + return (error); +} + +static ssize_t +zpl_xattr_list_dir(xattr_filldir_t *xf, cred_t *cr) +{ + struct inode *ip = xf->dentry->d_inode; + struct inode *dxip = NULL; + int error; + + /* Lookup the xattr directory */ + error = -zfs_lookup(ip, NULL, &dxip, LOOKUP_XATTR, cr, NULL, NULL); + if (error) { + if (error == -ENOENT) + error = 0; + + return (error); + } + + error = zpl_xattr_readdir(dxip, xf); + iput(dxip); + + return (error); +} + +static ssize_t +zpl_xattr_list_sa(xattr_filldir_t *xf) +{ + znode_t *zp = ITOZ(xf->dentry->d_inode); + nvpair_t *nvp = NULL; + int error = 0; + + mutex_enter(&zp->z_lock); + if (zp->z_xattr_cached == NULL) + error = -zfs_sa_get_xattr(zp); + mutex_exit(&zp->z_lock); + + if (error) + return (error); + + ASSERT(zp->z_xattr_cached); + + while ((nvp = nvlist_next_nvpair(zp->z_xattr_cached, nvp)) != NULL) { + ASSERT3U(nvpair_type(nvp), ==, DATA_TYPE_BYTE_ARRAY); + + error = zpl_xattr_filldir(xf, nvpair_name(nvp), + strlen(nvpair_name(nvp))); + if (error) + return (error); + } + + return (0); +} + +ssize_t +zpl_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) +{ + znode_t *zp = ITOZ(dentry->d_inode); + zfsvfs_t *zfsvfs = ZTOZSB(zp); + xattr_filldir_t xf = { buffer_size, 0, buffer, dentry }; + cred_t *cr = CRED(); + fstrans_cookie_t cookie; + int error = 0; + + crhold(cr); + cookie = spl_fstrans_mark(); + ZPL_ENTER(zfsvfs); + ZPL_VERIFY_ZP(zp); + rw_enter(&zp->z_xattr_lock, RW_READER); + + if (zfsvfs->z_use_sa && zp->z_is_sa) { + error = zpl_xattr_list_sa(&xf); + if (error) + goto out; + } + + error = zpl_xattr_list_dir(&xf, cr); + if (error) + goto out; + + error = xf.offset; +out: + + rw_exit(&zp->z_xattr_lock); + ZPL_EXIT(zfsvfs); + spl_fstrans_unmark(cookie); + crfree(cr); + + return (error); +} + +static int +zpl_xattr_get_dir(struct inode *ip, const char *name, void *value, + size_t size, cred_t *cr) +{ + struct inode *dxip = NULL; + struct inode *xip = NULL; + loff_t pos = 0; + int error; + + /* Lookup the xattr directory */ + error = -zfs_lookup(ip, NULL, &dxip, LOOKUP_XATTR, cr, NULL, NULL); + if (error) + goto out; + + /* Lookup a specific xattr name in the directory */ + error = -zfs_lookup(dxip, (char *)name, &xip, 0, cr, NULL, NULL); + if (error) + goto out; + + if (!size) { + error = i_size_read(xip); + goto out; + } + + if (size < i_size_read(xip)) { + error = -ERANGE; + goto out; + } + + error = zpl_read_common(xip, value, size, &pos, UIO_SYSSPACE, 0, cr); +out: + if (xip) + iput(xip); + + if (dxip) + iput(dxip); + + return (error); +} + +static int +zpl_xattr_get_sa(struct inode *ip, const char *name, void *value, size_t size) +{ + znode_t *zp = ITOZ(ip); + uchar_t *nv_value; + uint_t nv_size; + int error = 0; + + ASSERT(RW_LOCK_HELD(&zp->z_xattr_lock)); + + mutex_enter(&zp->z_lock); + if (zp->z_xattr_cached == NULL) + error = -zfs_sa_get_xattr(zp); + mutex_exit(&zp->z_lock); + + if (error) + return (error); + + ASSERT(zp->z_xattr_cached); + error = -nvlist_lookup_byte_array(zp->z_xattr_cached, 
name, + &nv_value, &nv_size); + if (error) + return (error); + + if (size == 0 || value == NULL) + return (nv_size); + + if (size < nv_size) + return (-ERANGE); + + memcpy(value, nv_value, nv_size); + + return (nv_size); +} + +static int +__zpl_xattr_get(struct inode *ip, const char *name, void *value, size_t size, + cred_t *cr) +{ + znode_t *zp = ITOZ(ip); + zfsvfs_t *zfsvfs = ZTOZSB(zp); + int error; + + ASSERT(RW_LOCK_HELD(&zp->z_xattr_lock)); + + if (zfsvfs->z_use_sa && zp->z_is_sa) { + error = zpl_xattr_get_sa(ip, name, value, size); + if (error != -ENOENT) + goto out; + } + + error = zpl_xattr_get_dir(ip, name, value, size, cr); +out: + if (error == -ENOENT) + error = -ENODATA; + + return (error); +} + +#define XATTR_NOENT 0x0 +#define XATTR_IN_SA 0x1 +#define XATTR_IN_DIR 0x2 +/* check where the xattr resides */ +static int +__zpl_xattr_where(struct inode *ip, const char *name, int *where, cred_t *cr) +{ + znode_t *zp = ITOZ(ip); + zfsvfs_t *zfsvfs = ZTOZSB(zp); + int error; + + ASSERT(where); + ASSERT(RW_LOCK_HELD(&zp->z_xattr_lock)); + + *where = XATTR_NOENT; + if (zfsvfs->z_use_sa && zp->z_is_sa) { + error = zpl_xattr_get_sa(ip, name, NULL, 0); + if (error >= 0) + *where |= XATTR_IN_SA; + else if (error != -ENOENT) + return (error); + } + + error = zpl_xattr_get_dir(ip, name, NULL, 0, cr); + if (error >= 0) + *where |= XATTR_IN_DIR; + else if (error != -ENOENT) + return (error); + + if (*where == (XATTR_IN_SA|XATTR_IN_DIR)) + cmn_err(CE_WARN, "ZFS: inode %p has xattr \"%s\"" + " in both SA and dir", ip, name); + if (*where == XATTR_NOENT) + error = -ENODATA; + else + error = 0; + return (error); +} + +static int +zpl_xattr_get(struct inode *ip, const char *name, void *value, size_t size) +{ + znode_t *zp = ITOZ(ip); + zfsvfs_t *zfsvfs = ZTOZSB(zp); + cred_t *cr = CRED(); + fstrans_cookie_t cookie; + int error; + + crhold(cr); + cookie = spl_fstrans_mark(); + ZPL_ENTER(zfsvfs); + ZPL_VERIFY_ZP(zp); + rw_enter(&zp->z_xattr_lock, RW_READER); + error = __zpl_xattr_get(ip, name, value, size, cr); + rw_exit(&zp->z_xattr_lock); + ZPL_EXIT(zfsvfs); + spl_fstrans_unmark(cookie); + crfree(cr); + + return (error); +} + +static int +zpl_xattr_set_dir(struct inode *ip, const char *name, const void *value, + size_t size, int flags, cred_t *cr) +{ + struct inode *dxip = NULL; + struct inode *xip = NULL; + vattr_t *vap = NULL; + ssize_t wrote; + int lookup_flags, error; + const int xattr_mode = S_IFREG | 0644; + loff_t pos = 0; + + /* + * Lookup the xattr directory. When we're adding an entry pass + * CREATE_XATTR_DIR to ensure the xattr directory is created. + * When removing an entry this flag is not passed to avoid + * unnecessarily creating a new xattr directory. + */ + lookup_flags = LOOKUP_XATTR; + if (value != NULL) + lookup_flags |= CREATE_XATTR_DIR; + + error = -zfs_lookup(ip, NULL, &dxip, lookup_flags, cr, NULL, NULL); + if (error) + goto out; + + /* Lookup a specific xattr name in the directory */ + error = -zfs_lookup(dxip, (char *)name, &xip, 0, cr, NULL, NULL); + if (error && (error != -ENOENT)) + goto out; + + error = 0; + + /* Remove a specific name xattr when value is set to NULL. */ + if (value == NULL) { + if (xip) + error = -zfs_remove(dxip, (char *)name, cr, 0); + + goto out; + } + + /* Lookup failed create a new xattr. 
*/ + if (xip == NULL) { + vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP); + vap->va_mode = xattr_mode; + vap->va_mask = ATTR_MODE; + vap->va_uid = crgetfsuid(cr); + vap->va_gid = crgetfsgid(cr); + + error = -zfs_create(dxip, (char *)name, vap, 0, 0644, &xip, + cr, 0, NULL); + if (error) + goto out; + } + + ASSERT(xip != NULL); + + error = -zfs_freesp(ITOZ(xip), 0, 0, xattr_mode, TRUE); + if (error) + goto out; + + wrote = zpl_write_common(xip, value, size, &pos, UIO_SYSSPACE, 0, cr); + if (wrote < 0) + error = wrote; + +out: + + if (error == 0) { + ip->i_ctime = current_time(ip); + zfs_mark_inode_dirty(ip); + } + + if (vap) + kmem_free(vap, sizeof (vattr_t)); + + if (xip) + iput(xip); + + if (dxip) + iput(dxip); + + if (error == -ENOENT) + error = -ENODATA; + + ASSERT3S(error, <=, 0); + + return (error); +} + +static int +zpl_xattr_set_sa(struct inode *ip, const char *name, const void *value, + size_t size, int flags, cred_t *cr) +{ + znode_t *zp = ITOZ(ip); + nvlist_t *nvl; + size_t sa_size; + int error = 0; + + mutex_enter(&zp->z_lock); + if (zp->z_xattr_cached == NULL) + error = -zfs_sa_get_xattr(zp); + mutex_exit(&zp->z_lock); + + if (error) + return (error); + + ASSERT(zp->z_xattr_cached); + nvl = zp->z_xattr_cached; + + if (value == NULL) { + error = -nvlist_remove(nvl, name, DATA_TYPE_BYTE_ARRAY); + if (error == -ENOENT) + error = zpl_xattr_set_dir(ip, name, NULL, 0, flags, cr); + } else { + /* Limited to 32k to keep nvpair memory allocations small */ + if (size > DXATTR_MAX_ENTRY_SIZE) + return (-EFBIG); + + /* Prevent the DXATTR SA from consuming the entire SA region */ + error = -nvlist_size(nvl, &sa_size, NV_ENCODE_XDR); + if (error) + return (error); + + if (sa_size > DXATTR_MAX_SA_SIZE) + return (-EFBIG); + + error = -nvlist_add_byte_array(nvl, name, + (uchar_t *)value, size); + } + + /* + * Update the SA for additions, modifications, and removals. On + * error drop the inconsistent cached version of the nvlist, it + * will be reconstructed from the ARC when next accessed. + */ + if (error == 0) + error = -zfs_sa_set_xattr(zp); + + if (error) { + nvlist_free(nvl); + zp->z_xattr_cached = NULL; + } + + ASSERT3S(error, <=, 0); + + return (error); +} + +static int +zpl_xattr_set(struct inode *ip, const char *name, const void *value, + size_t size, int flags) +{ + znode_t *zp = ITOZ(ip); + zfsvfs_t *zfsvfs = ZTOZSB(zp); + cred_t *cr = CRED(); + fstrans_cookie_t cookie; + int where; + int error; + + crhold(cr); + cookie = spl_fstrans_mark(); + ZPL_ENTER(zfsvfs); + ZPL_VERIFY_ZP(zp); + rw_enter(&ITOZ(ip)->z_xattr_lock, RW_WRITER); + + /* + * Before setting the xattr check to see if it already exists. + * This is done to ensure the following optional flags are honored. + * + * XATTR_CREATE: fail if xattr already exists + * XATTR_REPLACE: fail if xattr does not exist + * + * We also want to know if it resides in sa or dir, so we can make + * sure we don't end up with duplicate in both places. 
+ */ + error = __zpl_xattr_where(ip, name, &where, cr); + if (error < 0) { + if (error != -ENODATA) + goto out; + if (flags & XATTR_REPLACE) + goto out; + + /* The xattr to be removed already doesn't exist */ + error = 0; + if (value == NULL) + goto out; + } else { + error = -EEXIST; + if (flags & XATTR_CREATE) + goto out; + } + + /* Preferentially store the xattr as a SA for better performance */ + if (zfsvfs->z_use_sa && zp->z_is_sa && + (zfsvfs->z_xattr_sa || (value == NULL && where & XATTR_IN_SA))) { + error = zpl_xattr_set_sa(ip, name, value, size, flags, cr); + if (error == 0) { + /* + * Successfully put into SA, we need to clear the one + * in dir. + */ + if (where & XATTR_IN_DIR) + zpl_xattr_set_dir(ip, name, NULL, 0, 0, cr); + goto out; + } + } + + error = zpl_xattr_set_dir(ip, name, value, size, flags, cr); + /* + * Successfully put into dir, we need to clear the one in SA. + */ + if (error == 0 && (where & XATTR_IN_SA)) + zpl_xattr_set_sa(ip, name, NULL, 0, 0, cr); +out: + rw_exit(&ITOZ(ip)->z_xattr_lock); + ZPL_EXIT(zfsvfs); + spl_fstrans_unmark(cookie); + crfree(cr); + ASSERT3S(error, <=, 0); + + return (error); +} + +/* + * Extended user attributes + * + * "Extended user attributes may be assigned to files and directories for + * storing arbitrary additional information such as the mime type, + * character set or encoding of a file. The access permissions for user + * attributes are defined by the file permission bits: read permission + * is required to retrieve the attribute value, and writer permission is + * required to change it. + * + * The file permission bits of regular files and directories are + * interpreted differently from the file permission bits of special + * files and symbolic links. For regular files and directories the file + * permission bits define access to the file's contents, while for + * device special files they define access to the device described by + * the special file. The file permissions of symbolic links are not + * used in access checks. These differences would allow users to + * consume filesystem resources in a way not controllable by disk quotas + * for group or world writable special files and directories. + * + * For this reason, extended user attributes are allowed only for + * regular files and directories, and access to extended user attributes + * is restricted to the owner and to users with appropriate capabilities + * for directories with the sticky bit set (see the chmod(1) manual page + * for an explanation of the sticky bit)." - xattr(7) + * + * ZFS allows extended user attributes to be disabled administratively + * by setting the 'xattr=off' property on the dataset. 
+ */ +static int +__zpl_xattr_user_list(struct inode *ip, char *list, size_t list_size, + const char *name, size_t name_len) +{ + return (ITOZSB(ip)->z_flags & ZSB_XATTR); +} +ZPL_XATTR_LIST_WRAPPER(zpl_xattr_user_list); + +static int +__zpl_xattr_user_get(struct inode *ip, const char *name, + void *value, size_t size) +{ + char *xattr_name; + int error; + /* xattr_resolve_name will do this for us if this is defined */ +#ifndef HAVE_XATTR_HANDLER_NAME + if (strcmp(name, "") == 0) + return (-EINVAL); +#endif + if (!(ITOZSB(ip)->z_flags & ZSB_XATTR)) + return (-EOPNOTSUPP); + + xattr_name = kmem_asprintf("%s%s", XATTR_USER_PREFIX, name); + error = zpl_xattr_get(ip, xattr_name, value, size); + strfree(xattr_name); + + return (error); +} +ZPL_XATTR_GET_WRAPPER(zpl_xattr_user_get); + +static int +__zpl_xattr_user_set(struct inode *ip, const char *name, + const void *value, size_t size, int flags) +{ + char *xattr_name; + int error; + /* xattr_resolve_name will do this for us if this is defined */ +#ifndef HAVE_XATTR_HANDLER_NAME + if (strcmp(name, "") == 0) + return (-EINVAL); +#endif + if (!(ITOZSB(ip)->z_flags & ZSB_XATTR)) + return (-EOPNOTSUPP); + + xattr_name = kmem_asprintf("%s%s", XATTR_USER_PREFIX, name); + error = zpl_xattr_set(ip, xattr_name, value, size, flags); + strfree(xattr_name); + + return (error); +} +ZPL_XATTR_SET_WRAPPER(zpl_xattr_user_set); + +xattr_handler_t zpl_xattr_user_handler = +{ + .prefix = XATTR_USER_PREFIX, + .list = zpl_xattr_user_list, + .get = zpl_xattr_user_get, + .set = zpl_xattr_user_set, +}; + +/* + * Trusted extended attributes + * + * "Trusted extended attributes are visible and accessible only to + * processes that have the CAP_SYS_ADMIN capability. Attributes in this + * class are used to implement mechanisms in user space (i.e., outside + * the kernel) which keep information in extended attributes to which + * ordinary processes should not have access." 
- xattr(7) + */ +static int +__zpl_xattr_trusted_list(struct inode *ip, char *list, size_t list_size, + const char *name, size_t name_len) +{ + return (capable(CAP_SYS_ADMIN)); +} +ZPL_XATTR_LIST_WRAPPER(zpl_xattr_trusted_list); + +static int +__zpl_xattr_trusted_get(struct inode *ip, const char *name, + void *value, size_t size) +{ + char *xattr_name; + int error; + + if (!capable(CAP_SYS_ADMIN)) + return (-EACCES); + /* xattr_resolve_name will do this for us if this is defined */ +#ifndef HAVE_XATTR_HANDLER_NAME + if (strcmp(name, "") == 0) + return (-EINVAL); +#endif + xattr_name = kmem_asprintf("%s%s", XATTR_TRUSTED_PREFIX, name); + error = zpl_xattr_get(ip, xattr_name, value, size); + strfree(xattr_name); + + return (error); +} +ZPL_XATTR_GET_WRAPPER(zpl_xattr_trusted_get); + +static int +__zpl_xattr_trusted_set(struct inode *ip, const char *name, + const void *value, size_t size, int flags) +{ + char *xattr_name; + int error; + + if (!capable(CAP_SYS_ADMIN)) + return (-EACCES); + /* xattr_resolve_name will do this for us if this is defined */ +#ifndef HAVE_XATTR_HANDLER_NAME + if (strcmp(name, "") == 0) + return (-EINVAL); +#endif + xattr_name = kmem_asprintf("%s%s", XATTR_TRUSTED_PREFIX, name); + error = zpl_xattr_set(ip, xattr_name, value, size, flags); + strfree(xattr_name); + + return (error); +} +ZPL_XATTR_SET_WRAPPER(zpl_xattr_trusted_set); + +xattr_handler_t zpl_xattr_trusted_handler = +{ + .prefix = XATTR_TRUSTED_PREFIX, + .list = zpl_xattr_trusted_list, + .get = zpl_xattr_trusted_get, + .set = zpl_xattr_trusted_set, +}; + +/* + * Extended security attributes + * + * "The security attribute namespace is used by kernel security modules, + * such as Security Enhanced Linux, and also to implement file + * capabilities (see capabilities(7)). Read and write access + * permissions to security attributes depend on the policy implemented + * for each security attribute by the security module. When no security + * module is loaded, all processes have read access to extended security + * attributes, and write access is limited to processes that have the + * CAP_SYS_ADMIN capability." 
- xattr(7) + */ +static int +__zpl_xattr_security_list(struct inode *ip, char *list, size_t list_size, + const char *name, size_t name_len) +{ + return (1); +} +ZPL_XATTR_LIST_WRAPPER(zpl_xattr_security_list); + +static int +__zpl_xattr_security_get(struct inode *ip, const char *name, + void *value, size_t size) +{ + char *xattr_name; + int error; + /* xattr_resolve_name will do this for us if this is defined */ +#ifndef HAVE_XATTR_HANDLER_NAME + if (strcmp(name, "") == 0) + return (-EINVAL); +#endif + xattr_name = kmem_asprintf("%s%s", XATTR_SECURITY_PREFIX, name); + error = zpl_xattr_get(ip, xattr_name, value, size); + strfree(xattr_name); + + return (error); +} +ZPL_XATTR_GET_WRAPPER(zpl_xattr_security_get); + +static int +__zpl_xattr_security_set(struct inode *ip, const char *name, + const void *value, size_t size, int flags) +{ + char *xattr_name; + int error; + /* xattr_resolve_name will do this for us if this is defined */ +#ifndef HAVE_XATTR_HANDLER_NAME + if (strcmp(name, "") == 0) + return (-EINVAL); +#endif + xattr_name = kmem_asprintf("%s%s", XATTR_SECURITY_PREFIX, name); + error = zpl_xattr_set(ip, xattr_name, value, size, flags); + strfree(xattr_name); + + return (error); +} +ZPL_XATTR_SET_WRAPPER(zpl_xattr_security_set); + +#ifdef HAVE_CALLBACK_SECURITY_INODE_INIT_SECURITY +static int +__zpl_xattr_security_init(struct inode *ip, const struct xattr *xattrs, + void *fs_info) +{ + const struct xattr *xattr; + int error = 0; + + for (xattr = xattrs; xattr->name != NULL; xattr++) { + error = __zpl_xattr_security_set(ip, + xattr->name, xattr->value, xattr->value_len, 0); + + if (error < 0) + break; + } + + return (error); +} + +int +zpl_xattr_security_init(struct inode *ip, struct inode *dip, + const struct qstr *qstr) +{ + return security_inode_init_security(ip, dip, qstr, + &__zpl_xattr_security_init, NULL); +} + +#else +int +zpl_xattr_security_init(struct inode *ip, struct inode *dip, + const struct qstr *qstr) +{ + int error; + size_t len; + void *value; + char *name; + + error = zpl_security_inode_init_security(ip, dip, qstr, + &name, &value, &len); + if (error) { + if (error == -EOPNOTSUPP) + return (0); + + return (error); + } + + error = __zpl_xattr_security_set(ip, name, value, len, 0); + + kfree(name); + kfree(value); + + return (error); +} +#endif /* HAVE_CALLBACK_SECURITY_INODE_INIT_SECURITY */ + +/* + * Security xattr namespace handlers. + */ +xattr_handler_t zpl_xattr_security_handler = { + .prefix = XATTR_SECURITY_PREFIX, + .list = zpl_xattr_security_list, + .get = zpl_xattr_security_get, + .set = zpl_xattr_security_set, +}; + +/* + * Extended system attributes + * + * "Extended system attributes are used by the kernel to store system + * objects such as Access Control Lists. Read and write access permissions + * to system attributes depend on the policy implemented for each system + * attribute implemented by filesystems in the kernel." - xattr(7) + */ +#ifdef CONFIG_FS_POSIX_ACL +int +zpl_set_acl(struct inode *ip, struct posix_acl *acl, int type) +{ + char *name, *value = NULL; + int error = 0; + size_t size = 0; + + if (S_ISLNK(ip->i_mode)) + return (-EOPNOTSUPP); + + switch (type) { + case ACL_TYPE_ACCESS: + name = XATTR_NAME_POSIX_ACL_ACCESS; + if (acl) { + zpl_equivmode_t mode = ip->i_mode; + error = posix_acl_equiv_mode(acl, &mode); + if (error < 0) { + return (error); + } else { + /* + * The mode bits will have been set by + * ->zfs_setattr()->zfs_acl_chmod_setattr() + * using the ZFS ACL conversion. 
If they + * differ from the Posix ACL conversion dirty + * the inode to write the Posix mode bits. + */ + if (ip->i_mode != mode) { + ip->i_mode = mode; + ip->i_ctime = current_time(ip); + zfs_mark_inode_dirty(ip); + } + + if (error == 0) + acl = NULL; + } + } + break; + + case ACL_TYPE_DEFAULT: + name = XATTR_NAME_POSIX_ACL_DEFAULT; + if (!S_ISDIR(ip->i_mode)) + return (acl ? -EACCES : 0); + break; + + default: + return (-EINVAL); + } + + if (acl) { + size = posix_acl_xattr_size(acl->a_count); + value = kmem_alloc(size, KM_SLEEP); + + error = zpl_acl_to_xattr(acl, value, size); + if (error < 0) { + kmem_free(value, size); + return (error); + } + } + + error = zpl_xattr_set(ip, name, value, size, 0); + if (value) + kmem_free(value, size); + + if (!error) { + if (acl) + zpl_set_cached_acl(ip, type, acl); + else + zpl_forget_cached_acl(ip, type); + } + + return (error); +} + +struct posix_acl * +zpl_get_acl(struct inode *ip, int type) +{ + struct posix_acl *acl; + void *value = NULL; + char *name; + int size; + + /* + * As of Linux 3.14, the kernel get_acl will check this for us. + * Also as of Linux 4.7, comparing against ACL_NOT_CACHED is wrong + * as the kernel get_acl will set it to temporary sentinel value. + */ +#ifndef HAVE_KERNEL_GET_ACL_HANDLE_CACHE + acl = get_cached_acl(ip, type); + if (acl != ACL_NOT_CACHED) + return (acl); +#endif + + switch (type) { + case ACL_TYPE_ACCESS: + name = XATTR_NAME_POSIX_ACL_ACCESS; + break; + case ACL_TYPE_DEFAULT: + name = XATTR_NAME_POSIX_ACL_DEFAULT; + break; + default: + return (ERR_PTR(-EINVAL)); + } + + size = zpl_xattr_get(ip, name, NULL, 0); + if (size > 0) { + value = kmem_alloc(size, KM_SLEEP); + size = zpl_xattr_get(ip, name, value, size); + } + + if (size > 0) { + acl = zpl_acl_from_xattr(value, size); + } else if (size == -ENODATA || size == -ENOSYS) { + acl = NULL; + } else { + acl = ERR_PTR(-EIO); + } + + if (size > 0) + kmem_free(value, size); + + /* As of Linux 4.7, the kernel get_acl will set this for us */ +#ifndef HAVE_KERNEL_GET_ACL_HANDLE_CACHE + if (!IS_ERR(acl)) + zpl_set_cached_acl(ip, type, acl); +#endif + + return (acl); +} + +#if !defined(HAVE_GET_ACL) +static int +__zpl_check_acl(struct inode *ip, int mask) +{ + struct posix_acl *acl; + int error; + + acl = zpl_get_acl(ip, ACL_TYPE_ACCESS); + if (IS_ERR(acl)) + return (PTR_ERR(acl)); + + if (acl) { + error = posix_acl_permission(ip, acl, mask); + zpl_posix_acl_release(acl); + return (error); + } + + return (-EAGAIN); +} + +#if defined(HAVE_CHECK_ACL_WITH_FLAGS) +int +zpl_check_acl(struct inode *ip, int mask, unsigned int flags) +{ + return (__zpl_check_acl(ip, mask)); +} +#elif defined(HAVE_CHECK_ACL) +int +zpl_check_acl(struct inode *ip, int mask) +{ + return (__zpl_check_acl(ip, mask)); +} +#elif defined(HAVE_PERMISSION_WITH_NAMEIDATA) +int +zpl_permission(struct inode *ip, int mask, struct nameidata *nd) +{ + return (generic_permission(ip, mask, __zpl_check_acl)); +} +#elif defined(HAVE_PERMISSION) +int +zpl_permission(struct inode *ip, int mask) +{ + return (generic_permission(ip, mask, __zpl_check_acl)); +} +#endif /* HAVE_CHECK_ACL | HAVE_PERMISSION */ +#endif /* !HAVE_GET_ACL */ + +int +zpl_init_acl(struct inode *ip, struct inode *dir) +{ + struct posix_acl *acl = NULL; + int error = 0; + + if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIXACL) + return (0); + + if (!S_ISLNK(ip->i_mode)) { + acl = zpl_get_acl(dir, ACL_TYPE_DEFAULT); + if (IS_ERR(acl)) + return (PTR_ERR(acl)); + if (!acl) { + ip->i_mode &= ~current_umask(); + ip->i_ctime = current_time(ip); + 
zfs_mark_inode_dirty(ip); + return (0); + } + } + + if (acl) { + umode_t mode; + + if (S_ISDIR(ip->i_mode)) { + error = zpl_set_acl(ip, acl, ACL_TYPE_DEFAULT); + if (error) + goto out; + } + + mode = ip->i_mode; + error = __posix_acl_create(&acl, GFP_KERNEL, &mode); + if (error >= 0) { + ip->i_mode = mode; + zfs_mark_inode_dirty(ip); + if (error > 0) + error = zpl_set_acl(ip, acl, ACL_TYPE_ACCESS); + } + } +out: + zpl_posix_acl_release(acl); + + return (error); +} + +int +zpl_chmod_acl(struct inode *ip) +{ + struct posix_acl *acl; + int error; + + if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIXACL) + return (0); + + if (S_ISLNK(ip->i_mode)) + return (-EOPNOTSUPP); + + acl = zpl_get_acl(ip, ACL_TYPE_ACCESS); + if (IS_ERR(acl) || !acl) + return (PTR_ERR(acl)); + + error = __posix_acl_chmod(&acl, GFP_KERNEL, ip->i_mode); + if (!error) + error = zpl_set_acl(ip, acl, ACL_TYPE_ACCESS); + + zpl_posix_acl_release(acl); + + return (error); +} + +static int +__zpl_xattr_acl_list_access(struct inode *ip, char *list, size_t list_size, + const char *name, size_t name_len) +{ + char *xattr_name = XATTR_NAME_POSIX_ACL_ACCESS; + size_t xattr_size = sizeof (XATTR_NAME_POSIX_ACL_ACCESS); + + if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIXACL) + return (0); + + if (list && xattr_size <= list_size) + memcpy(list, xattr_name, xattr_size); + + return (xattr_size); +} +ZPL_XATTR_LIST_WRAPPER(zpl_xattr_acl_list_access); + +static int +__zpl_xattr_acl_list_default(struct inode *ip, char *list, size_t list_size, + const char *name, size_t name_len) +{ + char *xattr_name = XATTR_NAME_POSIX_ACL_DEFAULT; + size_t xattr_size = sizeof (XATTR_NAME_POSIX_ACL_DEFAULT); + + if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIXACL) + return (0); + + if (list && xattr_size <= list_size) + memcpy(list, xattr_name, xattr_size); + + return (xattr_size); +} +ZPL_XATTR_LIST_WRAPPER(zpl_xattr_acl_list_default); + +static int +__zpl_xattr_acl_get_access(struct inode *ip, const char *name, + void *buffer, size_t size) +{ + struct posix_acl *acl; + int type = ACL_TYPE_ACCESS; + int error; + /* xattr_resolve_name will do this for us if this is defined */ +#ifndef HAVE_XATTR_HANDLER_NAME + if (strcmp(name, "") != 0) + return (-EINVAL); +#endif + if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIXACL) + return (-EOPNOTSUPP); + + acl = zpl_get_acl(ip, type); + if (IS_ERR(acl)) + return (PTR_ERR(acl)); + if (acl == NULL) + return (-ENODATA); + + error = zpl_acl_to_xattr(acl, buffer, size); + zpl_posix_acl_release(acl); + + return (error); +} +ZPL_XATTR_GET_WRAPPER(zpl_xattr_acl_get_access); + +static int +__zpl_xattr_acl_get_default(struct inode *ip, const char *name, + void *buffer, size_t size) +{ + struct posix_acl *acl; + int type = ACL_TYPE_DEFAULT; + int error; + /* xattr_resolve_name will do this for us if this is defined */ +#ifndef HAVE_XATTR_HANDLER_NAME + if (strcmp(name, "") != 0) + return (-EINVAL); +#endif + if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIXACL) + return (-EOPNOTSUPP); + + acl = zpl_get_acl(ip, type); + if (IS_ERR(acl)) + return (PTR_ERR(acl)); + if (acl == NULL) + return (-ENODATA); + + error = zpl_acl_to_xattr(acl, buffer, size); + zpl_posix_acl_release(acl); + + return (error); +} +ZPL_XATTR_GET_WRAPPER(zpl_xattr_acl_get_default); + +static int +__zpl_xattr_acl_set_access(struct inode *ip, const char *name, + const void *value, size_t size, int flags) +{ + struct posix_acl *acl; + int type = ACL_TYPE_ACCESS; + int error = 0; + /* xattr_resolve_name will do this for us if this is defined */ +#ifndef 
HAVE_XATTR_HANDLER_NAME + if (strcmp(name, "") != 0) + return (-EINVAL); +#endif + if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIXACL) + return (-EOPNOTSUPP); + + if (!zpl_inode_owner_or_capable(ip)) + return (-EPERM); + + if (value) { + acl = zpl_acl_from_xattr(value, size); + if (IS_ERR(acl)) + return (PTR_ERR(acl)); + else if (acl) { + error = zpl_posix_acl_valid(ip, acl); + if (error) { + zpl_posix_acl_release(acl); + return (error); + } + } + } else { + acl = NULL; + } + + error = zpl_set_acl(ip, acl, type); + zpl_posix_acl_release(acl); + + return (error); +} +ZPL_XATTR_SET_WRAPPER(zpl_xattr_acl_set_access); + +static int +__zpl_xattr_acl_set_default(struct inode *ip, const char *name, + const void *value, size_t size, int flags) +{ + struct posix_acl *acl; + int type = ACL_TYPE_DEFAULT; + int error = 0; + /* xattr_resolve_name will do this for us if this is defined */ +#ifndef HAVE_XATTR_HANDLER_NAME + if (strcmp(name, "") != 0) + return (-EINVAL); +#endif + if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIXACL) + return (-EOPNOTSUPP); + + if (!zpl_inode_owner_or_capable(ip)) + return (-EPERM); + + if (value) { + acl = zpl_acl_from_xattr(value, size); + if (IS_ERR(acl)) + return (PTR_ERR(acl)); + else if (acl) { + error = zpl_posix_acl_valid(ip, acl); + if (error) { + zpl_posix_acl_release(acl); + return (error); + } + } + } else { + acl = NULL; + } + + error = zpl_set_acl(ip, acl, type); + zpl_posix_acl_release(acl); + + return (error); +} +ZPL_XATTR_SET_WRAPPER(zpl_xattr_acl_set_default); + +/* + * ACL access xattr namespace handlers. + * + * Use .name instead of .prefix when available. xattr_resolve_name will match + * whole name and reject anything that has .name only as prefix. + */ +xattr_handler_t zpl_xattr_acl_access_handler = +{ +#ifdef HAVE_XATTR_HANDLER_NAME + .name = XATTR_NAME_POSIX_ACL_ACCESS, +#else + .prefix = XATTR_NAME_POSIX_ACL_ACCESS, +#endif + .list = zpl_xattr_acl_list_access, + .get = zpl_xattr_acl_get_access, + .set = zpl_xattr_acl_set_access, +#if defined(HAVE_XATTR_LIST_SIMPLE) || \ + defined(HAVE_XATTR_LIST_DENTRY) || \ + defined(HAVE_XATTR_LIST_HANDLER) + .flags = ACL_TYPE_ACCESS, +#endif +}; + +/* + * ACL default xattr namespace handlers. + * + * Use .name instead of .prefix when available. xattr_resolve_name will match + * whole name and reject anything that has .name only as prefix. 
+ */ +xattr_handler_t zpl_xattr_acl_default_handler = +{ +#ifdef HAVE_XATTR_HANDLER_NAME + .name = XATTR_NAME_POSIX_ACL_DEFAULT, +#else + .prefix = XATTR_NAME_POSIX_ACL_DEFAULT, +#endif + .list = zpl_xattr_acl_list_default, + .get = zpl_xattr_acl_get_default, + .set = zpl_xattr_acl_set_default, +#if defined(HAVE_XATTR_LIST_SIMPLE) || \ + defined(HAVE_XATTR_LIST_DENTRY) || \ + defined(HAVE_XATTR_LIST_HANDLER) + .flags = ACL_TYPE_DEFAULT, +#endif +}; + +#endif /* CONFIG_FS_POSIX_ACL */ + +xattr_handler_t *zpl_xattr_handlers[] = { + &zpl_xattr_security_handler, + &zpl_xattr_trusted_handler, + &zpl_xattr_user_handler, +#ifdef CONFIG_FS_POSIX_ACL + &zpl_xattr_acl_access_handler, + &zpl_xattr_acl_default_handler, +#endif /* CONFIG_FS_POSIX_ACL */ + NULL +}; + +static const struct xattr_handler * +zpl_xattr_handler(const char *name) +{ + if (strncmp(name, XATTR_USER_PREFIX, + XATTR_USER_PREFIX_LEN) == 0) + return (&zpl_xattr_user_handler); + + if (strncmp(name, XATTR_TRUSTED_PREFIX, + XATTR_TRUSTED_PREFIX_LEN) == 0) + return (&zpl_xattr_trusted_handler); + + if (strncmp(name, XATTR_SECURITY_PREFIX, + XATTR_SECURITY_PREFIX_LEN) == 0) + return (&zpl_xattr_security_handler); + +#ifdef CONFIG_FS_POSIX_ACL + if (strncmp(name, XATTR_NAME_POSIX_ACL_ACCESS, + sizeof (XATTR_NAME_POSIX_ACL_ACCESS)) == 0) + return (&zpl_xattr_acl_access_handler); + + if (strncmp(name, XATTR_NAME_POSIX_ACL_DEFAULT, + sizeof (XATTR_NAME_POSIX_ACL_DEFAULT)) == 0) + return (&zpl_xattr_acl_default_handler); +#endif /* CONFIG_FS_POSIX_ACL */ + + return (NULL); +} + +#if !defined(HAVE_POSIX_ACL_RELEASE) || defined(HAVE_POSIX_ACL_RELEASE_GPL_ONLY) +struct acl_rel_struct { + struct acl_rel_struct *next; + struct posix_acl *acl; + clock_t time; +}; + +#define ACL_REL_GRACE (60*HZ) +#define ACL_REL_WINDOW (1*HZ) +#define ACL_REL_SCHED (ACL_REL_GRACE+ACL_REL_WINDOW) + +/* + * Lockless multi-producer single-consumer fifo list. + * Nodes are added to tail and removed from head. Tail pointer is our + * synchronization point. It always points to the next pointer of the last + * node, or head if list is empty. + */ +static struct acl_rel_struct *acl_rel_head = NULL; +static struct acl_rel_struct **acl_rel_tail = &acl_rel_head; + +static void +zpl_posix_acl_free(void *arg) +{ + struct acl_rel_struct *freelist = NULL; + struct acl_rel_struct *a; + clock_t new_time; + boolean_t refire = B_FALSE; + + ASSERT3P(acl_rel_head, !=, NULL); + while (acl_rel_head) { + a = acl_rel_head; + if (ddi_get_lbolt() - a->time >= ACL_REL_GRACE) { + /* + * If a is the last node we need to reset tail, but we + * need to use cmpxchg to make sure it is still the + * last node. + */ + if (acl_rel_tail == &a->next) { + acl_rel_head = NULL; + if (cmpxchg(&acl_rel_tail, &a->next, + &acl_rel_head) == &a->next) { + ASSERT3P(a->next, ==, NULL); + a->next = freelist; + freelist = a; + break; + } + } + /* + * a is not last node, make sure next pointer is set + * by the adder and advance the head. + */ + while (READ_ONCE(a->next) == NULL) + cpu_relax(); + acl_rel_head = a->next; + a->next = freelist; + freelist = a; + } else { + /* + * a is still in grace period. We are responsible to + * reschedule the free task, since adder will only do + * so if list is empty. 
+ */ + new_time = a->time + ACL_REL_SCHED; + refire = B_TRUE; + break; + } + } + + if (refire) + taskq_dispatch_delay(system_delay_taskq, zpl_posix_acl_free, + NULL, TQ_SLEEP, new_time); + + while (freelist) { + a = freelist; + freelist = a->next; + kfree(a->acl); + kmem_free(a, sizeof (struct acl_rel_struct)); + } +} + +void +zpl_posix_acl_release_impl(struct posix_acl *acl) +{ + struct acl_rel_struct *a, **prev; + + a = kmem_alloc(sizeof (struct acl_rel_struct), KM_SLEEP); + a->next = NULL; + a->acl = acl; + a->time = ddi_get_lbolt(); + /* atomically point the tail at us and get the previous tail */ + prev = xchg(&acl_rel_tail, &a->next); + ASSERT3P(*prev, ==, NULL); + *prev = a; + /* if the list was empty before, schedule the free task */ + if (prev == &acl_rel_head) + taskq_dispatch_delay(system_delay_taskq, zpl_posix_acl_free, + NULL, TQ_SLEEP, ddi_get_lbolt() + ACL_REL_SCHED); +} +#endif
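As an illustrative aside (not part of the patch itself): the handlers registered in zpl_xattr_handlers above are reached through the ordinary Linux xattr system calls. The minimal userspace sketch below assumes a hypothetical file /tank/fs/file on a mounted ZFS dataset and an arbitrary attribute name user.demo; setxattr(2), getxattr(2), and listxattr(2) are ultimately serviced by zpl_xattr_set(), zpl_xattr_get(), and zpl_xattr_list() respectively.

/* xattr_demo.c -- illustrative only; the path and attribute name are made up. */
#include <sys/xattr.h>
#include <stdio.h>
#include <string.h>

int
main(void)
{
	const char *path = "/tank/fs/file";	/* hypothetical file on a ZFS dataset */
	const char *name = "user.demo";		/* "user" namespace attribute */
	const char *value = "hello";
	char buf[64];
	char names[256];
	ssize_t len, i;

	/* Create or replace the attribute; handled in-kernel by zpl_xattr_set(). */
	if (setxattr(path, name, value, strlen(value), 0) != 0) {
		perror("setxattr");
		return (1);
	}

	/* Read the value back; handled by zpl_xattr_get(). */
	len = getxattr(path, name, buf, sizeof (buf));
	if (len < 0) {
		perror("getxattr");
		return (1);
	}
	printf("%s = %.*s\n", name, (int)len, buf);

	/* Enumerate the visible names; handled by zpl_xattr_list(). */
	len = listxattr(path, names, sizeof (names));
	for (i = 0; i < len; i += strlen(names + i) + 1)
		printf("found: %s\n", names + i);

	return (0);
}

Whether the stored value lands in a System Attribute or in the hidden xattr directory depends on the dataset's xattr property (xattr=sa versus the default xattr=on); the userspace interface above is unchanged either way.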