Diffstat (limited to 'module/os')
-rw-r--r--  module/os/Makefile.in                                   1
-rw-r--r--  module/os/linux/Makefile.in                             1
-rw-r--r--  module/os/linux/spl/Makefile.in                        18
-rw-r--r--  module/os/linux/spl/README.md                          16
-rw-r--r--  module/os/linux/spl/THIRDPARTYLICENSE.gplv2           339
-rw-r--r--  module/os/linux/spl/THIRDPARTYLICENSE.gplv2.descrip     1
-rw-r--r--  module/os/linux/spl/spl-atomic.c                       36
-rw-r--r--  module/os/linux/spl/spl-condvar.c                     461
-rw-r--r--  module/os/linux/spl/spl-cred.c                        200
-rw-r--r--  module/os/linux/spl/spl-err.c                         124
-rw-r--r--  module/os/linux/spl/spl-generic.c                     757
-rw-r--r--  module/os/linux/spl/spl-kmem-cache.c                 1780
-rw-r--r--  module/os/linux/spl/spl-kmem.c                        556
-rw-r--r--  module/os/linux/spl/spl-kobj.c                         86
-rw-r--r--  module/os/linux/spl/spl-kstat.c                       770
-rw-r--r--  module/os/linux/spl/spl-proc.c                        782
-rw-r--r--  module/os/linux/spl/spl-procfs-list.c                 257
-rw-r--r--  module/os/linux/spl/spl-taskq.c                      1292
-rw-r--r--  module/os/linux/spl/spl-thread.c                      163
-rw-r--r--  module/os/linux/spl/spl-tsd.c                         720
-rw-r--r--  module/os/linux/spl/spl-vmem.c                        135
-rw-r--r--  module/os/linux/spl/spl-vnode.c                       719
-rw-r--r--  module/os/linux/spl/spl-xdr.c                         513
-rw-r--r--  module/os/linux/spl/spl-zlib.c                        217
-rw-r--r--  module/os/linux/zfs/Makefile.in                        34
-rw-r--r--  module/os/linux/zfs/abd.c                            1638
-rw-r--r--  module/os/linux/zfs/policy.c                          355
-rw-r--r--  module/os/linux/zfs/qat.c                             105
-rw-r--r--  module/os/linux/zfs/qat_compress.c                    574
-rw-r--r--  module/os/linux/zfs/qat_crypt.c                       631
-rw-r--r--  module/os/linux/zfs/spa_stats.c                      1034
-rw-r--r--  module/os/linux/zfs/vdev_disk.c                       954
-rw-r--r--  module/os/linux/zfs/vdev_file.c                       331
-rw-r--r--  module/os/linux/zfs/zfs_acl.c                        2816
-rw-r--r--  module/os/linux/zfs/zfs_ctldir.c                     1240
-rw-r--r--  module/os/linux/zfs/zfs_debug.c                       253
-rw-r--r--  module/os/linux/zfs/zfs_dir.c                        1205
-rw-r--r--  module/os/linux/zfs/zfs_sysfs.c                       661
-rw-r--r--  module/os/linux/zfs/zfs_vfsops.c                     2562
-rw-r--r--  module/os/linux/zfs/zfs_vnops.c                      5275
-rw-r--r--  module/os/linux/zfs/zfs_znode.c                      2234
-rw-r--r--  module/os/linux/zfs/zio_crypt.c                      2036
-rw-r--r--  module/os/linux/zfs/zpl_ctldir.c                      572
-rw-r--r--  module/os/linux/zfs/zpl_export.c                      177
-rw-r--r--  module/os/linux/zfs/zpl_file.c                       1075
-rw-r--r--  module/os/linux/zfs/zpl_inode.c                       826
-rw-r--r--  module/os/linux/zfs/zpl_super.c                       426
-rw-r--r--  module/os/linux/zfs/zpl_xattr.c                      1548
48 files changed, 38506 insertions(+), 0 deletions(-)
diff --git a/module/os/Makefile.in b/module/os/Makefile.in
new file mode 100644
index 000000000..b9990d1bc
--- /dev/null
+++ b/module/os/Makefile.in
@@ -0,0 +1 @@
+subdirs-m = linux
diff --git a/module/os/linux/Makefile.in b/module/os/linux/Makefile.in
new file mode 100644
index 000000000..ab01708a3
--- /dev/null
+++ b/module/os/linux/Makefile.in
@@ -0,0 +1 @@
+subdirs-m = spl zfs
diff --git a/module/os/linux/spl/Makefile.in b/module/os/linux/spl/Makefile.in
new file mode 100644
index 000000000..a29c36a2a
--- /dev/null
+++ b/module/os/linux/spl/Makefile.in
@@ -0,0 +1,18 @@
+$(MODULE)-objs += ../os/linux/spl/spl-atomic.o
+$(MODULE)-objs += ../os/linux/spl/spl-condvar.o
+$(MODULE)-objs += ../os/linux/spl/spl-cred.o
+$(MODULE)-objs += ../os/linux/spl/spl-err.o
+$(MODULE)-objs += ../os/linux/spl/spl-generic.o
+$(MODULE)-objs += ../os/linux/spl/spl-kmem.o
+$(MODULE)-objs += ../os/linux/spl/spl-kmem-cache.o
+$(MODULE)-objs += ../os/linux/spl/spl-kobj.o
+$(MODULE)-objs += ../os/linux/spl/spl-kstat.o
+$(MODULE)-objs += ../os/linux/spl/spl-proc.o
+$(MODULE)-objs += ../os/linux/spl/spl-procfs-list.o
+$(MODULE)-objs += ../os/linux/spl/spl-taskq.o
+$(MODULE)-objs += ../os/linux/spl/spl-thread.o
+$(MODULE)-objs += ../os/linux/spl/spl-tsd.o
+$(MODULE)-objs += ../os/linux/spl/spl-vmem.o
+$(MODULE)-objs += ../os/linux/spl/spl-vnode.o
+$(MODULE)-objs += ../os/linux/spl/spl-xdr.o
+$(MODULE)-objs += ../os/linux/spl/spl-zlib.o
diff --git a/module/os/linux/spl/README.md b/module/os/linux/spl/README.md
new file mode 100644
index 000000000..57f635aed
--- /dev/null
+++ b/module/os/linux/spl/README.md
@@ -0,0 +1,16 @@
+The Solaris Porting Layer, SPL, is a Linux kernel module which provides a
+compatibility layer used by the [ZFS on Linux](http://zfsonlinux.org) project.
+
+# Installation
+
+The latest version of the SPL is maintained as part of this repository.
+Only when building ZFS version 0.7.x or earlier must an external SPL release
+be used. These releases can be found at:
+
+ * Version 0.7.x: https://github.com/zfsonlinux/spl/tree/spl-0.7-release
+ * Version 0.6.5.x: https://github.com/zfsonlinux/spl/tree/spl-0.6.5-release
+
+# Release
+
+The SPL is released under a GPLv2 license.
+For more details see the NOTICE and THIRDPARTYLICENSE files; `UCRL-CODE-235197`
diff --git a/module/os/linux/spl/THIRDPARTYLICENSE.gplv2 b/module/os/linux/spl/THIRDPARTYLICENSE.gplv2
new file mode 100644
index 000000000..d159169d1
--- /dev/null
+++ b/module/os/linux/spl/THIRDPARTYLICENSE.gplv2
@@ -0,0 +1,339 @@
+ GNU GENERAL PUBLIC LICENSE
+ Version 2, June 1991
+
+ Copyright (C) 1989, 1991 Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+ Preamble
+
+ The licenses for most software are designed to take away your
+freedom to share and change it. By contrast, the GNU General Public
+License is intended to guarantee your freedom to share and change free
+software--to make sure the software is free for all its users. This
+General Public License applies to most of the Free Software
+Foundation's software and to any other program whose authors commit to
+using it. (Some other Free Software Foundation software is covered by
+the GNU Lesser General Public License instead.) You can apply it to
+your programs, too.
+
+ When we speak of free software, we are referring to freedom, not
+price. Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+this service if you wish), that you receive source code or can get it
+if you want it, that you can change the software or use pieces of it
+in new free programs; and that you know you can do these things.
+
+ To protect your rights, we need to make restrictions that forbid
+anyone to deny you these rights or to ask you to surrender the rights.
+These restrictions translate to certain responsibilities for you if you
+distribute copies of the software, or if you modify it.
+
+ For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must give the recipients all the rights that
+you have. You must make sure that they, too, receive or can get the
+source code. And you must show them these terms so they know their
+rights.
+
+ We protect your rights with two steps: (1) copyright the software, and
+(2) offer you this license which gives you legal permission to copy,
+distribute and/or modify the software.
+
+ Also, for each author's protection and ours, we want to make certain
+that everyone understands that there is no warranty for this free
+software. If the software is modified by someone else and passed on, we
+want its recipients to know that what they have is not the original, so
+that any problems introduced by others will not reflect on the original
+authors' reputations.
+
+ Finally, any free program is threatened constantly by software
+patents. We wish to avoid the danger that redistributors of a free
+program will individually obtain patent licenses, in effect making the
+program proprietary. To prevent this, we have made it clear that any
+patent must be licensed for everyone's free use or not licensed at all.
+
+ The precise terms and conditions for copying, distribution and
+modification follow.
+
+ GNU GENERAL PUBLIC LICENSE
+ TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+ 0. This License applies to any program or other work which contains
+a notice placed by the copyright holder saying it may be distributed
+under the terms of this General Public License. The "Program", below,
+refers to any such program or work, and a "work based on the Program"
+means either the Program or any derivative work under copyright law:
+that is to say, a work containing the Program or a portion of it,
+either verbatim or with modifications and/or translated into another
+language. (Hereinafter, translation is included without limitation in
+the term "modification".) Each licensee is addressed as "you".
+
+Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope. The act of
+running the Program is not restricted, and the output from the Program
+is covered only if its contents constitute a work based on the
+Program (independent of having been made by running the Program).
+Whether that is true depends on what the Program does.
+
+ 1. You may copy and distribute verbatim copies of the Program's
+source code as you receive it, in any medium, provided that you
+conspicuously and appropriately publish on each copy an appropriate
+copyright notice and disclaimer of warranty; keep intact all the
+notices that refer to this License and to the absence of any warranty;
+and give any other recipients of the Program a copy of this License
+along with the Program.
+
+You may charge a fee for the physical act of transferring a copy, and
+you may at your option offer warranty protection in exchange for a fee.
+
+ 2. You may modify your copy or copies of the Program or any portion
+of it, thus forming a work based on the Program, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+ a) You must cause the modified files to carry prominent notices
+ stating that you changed the files and the date of any change.
+
+ b) You must cause any work that you distribute or publish, that in
+ whole or in part contains or is derived from the Program or any
+ part thereof, to be licensed as a whole at no charge to all third
+ parties under the terms of this License.
+
+ c) If the modified program normally reads commands interactively
+ when run, you must cause it, when started running for such
+ interactive use in the most ordinary way, to print or display an
+ announcement including an appropriate copyright notice and a
+ notice that there is no warranty (or else, saying that you provide
+ a warranty) and that users may redistribute the program under
+ these conditions, and telling the user how to view a copy of this
+ License. (Exception: if the Program itself is interactive but
+ does not normally print such an announcement, your work based on
+ the Program is not required to print an announcement.)
+
+These requirements apply to the modified work as a whole. If
+identifiable sections of that work are not derived from the Program,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works. But when you
+distribute the same sections as part of a whole which is a work based
+on the Program, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Program.
+
+In addition, mere aggregation of another work not based on the Program
+with the Program (or with a work based on the Program) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+ 3. You may copy and distribute the Program (or a work based on it,
+under Section 2) in object code or executable form under the terms of
+Sections 1 and 2 above provided that you also do one of the following:
+
+ a) Accompany it with the complete corresponding machine-readable
+ source code, which must be distributed under the terms of Sections
+ 1 and 2 above on a medium customarily used for software interchange; or,
+
+ b) Accompany it with a written offer, valid for at least three
+ years, to give any third party, for a charge no more than your
+ cost of physically performing source distribution, a complete
+ machine-readable copy of the corresponding source code, to be
+ distributed under the terms of Sections 1 and 2 above on a medium
+ customarily used for software interchange; or,
+
+ c) Accompany it with the information you received as to the offer
+ to distribute corresponding source code. (This alternative is
+ allowed only for noncommercial distribution and only if you
+ received the program in object code or executable form with such
+ an offer, in accord with Subsection b above.)
+
+The source code for a work means the preferred form of the work for
+making modifications to it. For an executable work, complete source
+code means all the source code for all modules it contains, plus any
+associated interface definition files, plus the scripts used to
+control compilation and installation of the executable. However, as a
+special exception, the source code distributed need not include
+anything that is normally distributed (in either source or binary
+form) with the major components (compiler, kernel, and so on) of the
+operating system on which the executable runs, unless that component
+itself accompanies the executable.
+
+If distribution of executable or object code is made by offering
+access to copy from a designated place, then offering equivalent
+access to copy the source code from the same place counts as
+distribution of the source code, even though third parties are not
+compelled to copy the source along with the object code.
+
+ 4. You may not copy, modify, sublicense, or distribute the Program
+except as expressly provided under this License. Any attempt
+otherwise to copy, modify, sublicense or distribute the Program is
+void, and will automatically terminate your rights under this License.
+However, parties who have received copies, or rights, from you under
+this License will not have their licenses terminated so long as such
+parties remain in full compliance.
+
+ 5. You are not required to accept this License, since you have not
+signed it. However, nothing else grants you permission to modify or
+distribute the Program or its derivative works. These actions are
+prohibited by law if you do not accept this License. Therefore, by
+modifying or distributing the Program (or any work based on the
+Program), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Program or works based on it.
+
+ 6. Each time you redistribute the Program (or any work based on the
+Program), the recipient automatically receives a license from the
+original licensor to copy, distribute or modify the Program subject to
+these terms and conditions. You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties to
+this License.
+
+ 7. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License. If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Program at all. For example, if a patent
+license would not permit royalty-free redistribution of the Program by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Program.
+
+If any portion of this section is held invalid or unenforceable under
+any particular circumstance, the balance of the section is intended to
+apply and the section as a whole is intended to apply in other
+circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system, which is
+implemented by public license practices. Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+
+ 8. If the distribution and/or use of the Program is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Program under this License
+may add an explicit geographical distribution limitation excluding
+those countries, so that distribution is permitted only in or among
+countries not thus excluded. In such case, this License incorporates
+the limitation as if written in the body of this License.
+
+ 9. The Free Software Foundation may publish revised and/or new versions
+of the General Public License from time to time. Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+Each version is given a distinguishing version number. If the Program
+specifies a version number of this License which applies to it and "any
+later version", you have the option of following the terms and conditions
+either of that version or of any later version published by the Free
+Software Foundation. If the Program does not specify a version number of
+this License, you may choose any version ever published by the Free Software
+Foundation.
+
+ 10. If you wish to incorporate parts of the Program into other free
+programs whose distribution conditions are different, write to the author
+to ask for permission. For software which is copyrighted by the Free
+Software Foundation, write to the Free Software Foundation; we sometimes
+make exceptions for this. Our decision will be guided by the two goals
+of preserving the free status of all derivatives of our free software and
+of promoting the sharing and reuse of software generally.
+
+ NO WARRANTY
+
+ 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
+FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
+OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
+PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
+OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS
+TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE
+PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
+REPAIR OR CORRECTION.
+
+ 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
+REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
+INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
+OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
+TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
+YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
+PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGES.
+
+ END OF TERMS AND CONDITIONS
+
+ How to Apply These Terms to Your New Programs
+
+ If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+ To do so, attach the following notices to the program. It is safest
+to attach them to the start of each source file to most effectively
+convey the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+ <one line to give the program's name and a brief idea of what it does.>
+ Copyright (C) <year> <name of author>
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License along
+ with this program; if not, write to the Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+Also add information on how to contact you by electronic and paper mail.
+
+If the program is interactive, make it output a short notice like this
+when it starts in an interactive mode:
+
+ Gnomovision version 69, Copyright (C) year name of author
+ Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+ This is free software, and you are welcome to redistribute it
+ under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License. Of course, the commands you use may
+be called something other than `show w' and `show c'; they could even be
+mouse-clicks or menu items--whatever suits your program.
+
+You should also get your employer (if you work as a programmer) or your
+school, if any, to sign a "copyright disclaimer" for the program, if
+necessary. Here is a sample; alter the names:
+
+ Yoyodyne, Inc., hereby disclaims all copyright interest in the program
+ `Gnomovision' (which makes passes at compilers) written by James Hacker.
+
+ <signature of Ty Coon>, 1 April 1989
+ Ty Coon, President of Vice
+
+This General Public License does not permit incorporating your program into
+proprietary programs. If your program is a subroutine library, you may
+consider it more useful to permit linking proprietary applications with the
+library. If this is what you want to do, use the GNU Lesser General
+Public License instead of this License.
diff --git a/module/os/linux/spl/THIRDPARTYLICENSE.gplv2.descrip b/module/os/linux/spl/THIRDPARTYLICENSE.gplv2.descrip
new file mode 100644
index 000000000..78535a8ee
--- /dev/null
+++ b/module/os/linux/spl/THIRDPARTYLICENSE.gplv2.descrip
@@ -0,0 +1 @@
+COMPATIBILITY LAYER FOR OPENZFS ON LINUX
diff --git a/module/os/linux/spl/spl-atomic.c b/module/os/linux/spl/spl-atomic.c
new file mode 100644
index 000000000..47ed1886e
--- /dev/null
+++ b/module/os/linux/spl/spl-atomic.c
@@ -0,0 +1,36 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <[email protected]>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Solaris Porting Layer (SPL) Atomic Implementation.
+ */
+
+#include <sys/atomic.h>
+
+#ifdef ATOMIC_SPINLOCK
+/* Global atomic lock declarations */
+DEFINE_SPINLOCK(atomic32_lock);
+DEFINE_SPINLOCK(atomic64_lock);
+
+EXPORT_SYMBOL(atomic32_lock);
+EXPORT_SYMBOL(atomic64_lock);
+#endif /* ATOMIC_SPINLOCK */
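
The two spinlocks above exist only for the ATOMIC_SPINLOCK fallback, used when the platform has no usable native 64-bit atomics. A minimal sketch of the kind of operation the SPL's sys/atomic.h is expected to build on them; the function name below is hypothetical and not part of this change:

    #include <linux/spinlock.h>
    #include <linux/types.h>

    extern spinlock_t atomic64_lock;

    /* Hypothetical sketch: atomically add 'delta', return the new value. */
    static inline uint64_t
    example_atomic_add_64_nv(volatile uint64_t *target, uint64_t delta)
    {
            unsigned long flags;
            uint64_t nv;

            /* A single global lock serializes every emulated 64-bit op. */
            spin_lock_irqsave(&atomic64_lock, flags);
            nv = *target + delta;
            *target = nv;
            spin_unlock_irqrestore(&atomic64_lock, flags);

            return (nv);
    }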
diff --git a/module/os/linux/spl/spl-condvar.c b/module/os/linux/spl/spl-condvar.c
new file mode 100644
index 000000000..3cc33da62
--- /dev/null
+++ b/module/os/linux/spl/spl-condvar.c
@@ -0,0 +1,461 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <[email protected]>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Solaris Porting Layer (SPL) Condition Variables Implementation.
+ */
+
+#include <sys/condvar.h>
+#include <sys/time.h>
+#include <sys/sysmacros.h>
+#include <linux/hrtimer.h>
+#include <linux/compiler_compat.h>
+#include <linux/mod_compat.h>
+
+#include <linux/sched.h>
+
+#ifdef HAVE_SCHED_SIGNAL_HEADER
+#include <linux/sched/signal.h>
+#endif
+
+#define MAX_HRTIMEOUT_SLACK_US 1000
+unsigned int spl_schedule_hrtimeout_slack_us = 0;
+
+static int
+param_set_hrtimeout_slack(const char *buf, zfs_kernel_param_t *kp)
+{
+ unsigned long val;
+ int error;
+
+ error = kstrtoul(buf, 0, &val);
+ if (error)
+ return (error);
+
+ if (val > MAX_HRTIMEOUT_SLACK_US)
+ return (-EINVAL);
+
+ error = param_set_uint(buf, kp);
+ if (error < 0)
+ return (error);
+
+ return (0);
+}
+
+module_param_call(spl_schedule_hrtimeout_slack_us, param_set_hrtimeout_slack,
+ param_get_uint, &spl_schedule_hrtimeout_slack_us, 0644);
+MODULE_PARM_DESC(spl_schedule_hrtimeout_slack_us,
+ "schedule_hrtimeout_range() delta/slack value in us, default(0)");
+
+void
+__cv_init(kcondvar_t *cvp, char *name, kcv_type_t type, void *arg)
+{
+ ASSERT(cvp);
+ ASSERT(name == NULL);
+ ASSERT(type == CV_DEFAULT);
+ ASSERT(arg == NULL);
+
+ cvp->cv_magic = CV_MAGIC;
+ init_waitqueue_head(&cvp->cv_event);
+ init_waitqueue_head(&cvp->cv_destroy);
+ atomic_set(&cvp->cv_waiters, 0);
+ atomic_set(&cvp->cv_refs, 1);
+ cvp->cv_mutex = NULL;
+}
+EXPORT_SYMBOL(__cv_init);
+
+static int
+cv_destroy_wakeup(kcondvar_t *cvp)
+{
+ if (!atomic_read(&cvp->cv_waiters) && !atomic_read(&cvp->cv_refs)) {
+ ASSERT(cvp->cv_mutex == NULL);
+ ASSERT(!waitqueue_active(&cvp->cv_event));
+ return (1);
+ }
+
+ return (0);
+}
+
+void
+__cv_destroy(kcondvar_t *cvp)
+{
+ ASSERT(cvp);
+ ASSERT(cvp->cv_magic == CV_MAGIC);
+
+ cvp->cv_magic = CV_DESTROY;
+ atomic_dec(&cvp->cv_refs);
+
+ /* Block until all waiters are woken and references dropped. */
+ while (cv_destroy_wakeup(cvp) == 0)
+ wait_event_timeout(cvp->cv_destroy, cv_destroy_wakeup(cvp), 1);
+
+ ASSERT3P(cvp->cv_mutex, ==, NULL);
+ ASSERT3S(atomic_read(&cvp->cv_refs), ==, 0);
+ ASSERT3S(atomic_read(&cvp->cv_waiters), ==, 0);
+ ASSERT3S(waitqueue_active(&cvp->cv_event), ==, 0);
+}
+EXPORT_SYMBOL(__cv_destroy);
+
+static void
+cv_wait_common(kcondvar_t *cvp, kmutex_t *mp, int state, int io)
+{
+ DEFINE_WAIT(wait);
+ kmutex_t *m;
+
+ ASSERT(cvp);
+ ASSERT(mp);
+ ASSERT(cvp->cv_magic == CV_MAGIC);
+ ASSERT(mutex_owned(mp));
+ atomic_inc(&cvp->cv_refs);
+
+ m = READ_ONCE(cvp->cv_mutex);
+ if (!m)
+ m = xchg(&cvp->cv_mutex, mp);
+ /* Ensure the same mutex is used by all callers */
+ ASSERT(m == NULL || m == mp);
+
+ prepare_to_wait_exclusive(&cvp->cv_event, &wait, state);
+ atomic_inc(&cvp->cv_waiters);
+
+ /*
+	 * The mutex must be dropped only after prepare_to_wait(); this
+	 * ensures we are linked onto the waiters list and avoids the
+	 * race where 'cvp->cv_waiters > 0' but the list is empty.
+ */
+ mutex_exit(mp);
+ if (io)
+ io_schedule();
+ else
+ schedule();
+
+	/* No more waiters, so a different mutex could now be used */
+ if (atomic_dec_and_test(&cvp->cv_waiters)) {
+ /*
+ * This is set without any lock, so it's racy. But this is
+ * just for debug anyway, so make it best-effort
+ */
+ cvp->cv_mutex = NULL;
+ wake_up(&cvp->cv_destroy);
+ }
+
+ finish_wait(&cvp->cv_event, &wait);
+ atomic_dec(&cvp->cv_refs);
+
+ /*
+	 * Reacquire the mutex only after we release the cvp; otherwise we
+	 * could deadlock with a thread that holds the mutex and calls
+	 * cv_destroy().
+ */
+ mutex_enter(mp);
+}
+
+void
+__cv_wait(kcondvar_t *cvp, kmutex_t *mp)
+{
+ cv_wait_common(cvp, mp, TASK_UNINTERRUPTIBLE, 0);
+}
+EXPORT_SYMBOL(__cv_wait);
+
+void
+__cv_wait_io(kcondvar_t *cvp, kmutex_t *mp)
+{
+ cv_wait_common(cvp, mp, TASK_UNINTERRUPTIBLE, 1);
+}
+EXPORT_SYMBOL(__cv_wait_io);
+
+int
+__cv_wait_io_sig(kcondvar_t *cvp, kmutex_t *mp)
+{
+ cv_wait_common(cvp, mp, TASK_INTERRUPTIBLE, 1);
+
+ return (signal_pending(current) ? 0 : 1);
+}
+EXPORT_SYMBOL(__cv_wait_io_sig);
+
+int
+__cv_wait_sig(kcondvar_t *cvp, kmutex_t *mp)
+{
+ cv_wait_common(cvp, mp, TASK_INTERRUPTIBLE, 0);
+
+ return (signal_pending(current) ? 0 : 1);
+}
+EXPORT_SYMBOL(__cv_wait_sig);
+
+#if defined(HAVE_IO_SCHEDULE_TIMEOUT)
+#define spl_io_schedule_timeout(t) io_schedule_timeout(t)
+#else
+
+struct spl_task_timer {
+ struct timer_list timer;
+ struct task_struct *task;
+};
+
+static void
+__cv_wakeup(spl_timer_list_t t)
+{
+ struct timer_list *tmr = (struct timer_list *)t;
+ struct spl_task_timer *task_timer = from_timer(task_timer, tmr, timer);
+
+ wake_up_process(task_timer->task);
+}
+
+static long
+spl_io_schedule_timeout(long time_left)
+{
+ long expire_time = jiffies + time_left;
+ struct spl_task_timer task_timer;
+ struct timer_list *timer = &task_timer.timer;
+
+ task_timer.task = current;
+
+ timer_setup(timer, __cv_wakeup, 0);
+
+ timer->expires = expire_time;
+ add_timer(timer);
+
+ io_schedule();
+
+ del_timer_sync(timer);
+
+ time_left = expire_time - jiffies;
+
+ return (time_left < 0 ? 0 : time_left);
+}
+#endif
+
+/*
+ * 'expire_time' argument is an absolute wall clock time in jiffies.
+ * Return value is time left (expire_time - now) or -1 if timeout occurred.
+ */
+static clock_t
+__cv_timedwait_common(kcondvar_t *cvp, kmutex_t *mp, clock_t expire_time,
+ int state, int io)
+{
+ DEFINE_WAIT(wait);
+ kmutex_t *m;
+ clock_t time_left;
+
+ ASSERT(cvp);
+ ASSERT(mp);
+ ASSERT(cvp->cv_magic == CV_MAGIC);
+ ASSERT(mutex_owned(mp));
+
+ /* XXX - Does not handle jiffie wrap properly */
+ time_left = expire_time - jiffies;
+ if (time_left <= 0)
+ return (-1);
+
+ atomic_inc(&cvp->cv_refs);
+ m = READ_ONCE(cvp->cv_mutex);
+ if (!m)
+ m = xchg(&cvp->cv_mutex, mp);
+ /* Ensure the same mutex is used by all callers */
+ ASSERT(m == NULL || m == mp);
+
+ prepare_to_wait_exclusive(&cvp->cv_event, &wait, state);
+ atomic_inc(&cvp->cv_waiters);
+
+ /*
+	 * The mutex must be dropped only after prepare_to_wait(); this
+	 * ensures we are linked onto the waiters list and avoids the
+	 * race where 'cvp->cv_waiters > 0' but the list is empty.
+ */
+ mutex_exit(mp);
+ if (io)
+ time_left = spl_io_schedule_timeout(time_left);
+ else
+ time_left = schedule_timeout(time_left);
+
+	/* No more waiters, so a different mutex could now be used */
+ if (atomic_dec_and_test(&cvp->cv_waiters)) {
+ /*
+ * This is set without any lock, so it's racy. But this is
+ * just for debug anyway, so make it best-effort
+ */
+ cvp->cv_mutex = NULL;
+ wake_up(&cvp->cv_destroy);
+ }
+
+ finish_wait(&cvp->cv_event, &wait);
+ atomic_dec(&cvp->cv_refs);
+
+ /*
+	 * Reacquire the mutex only after we release the cvp; otherwise we
+	 * could deadlock with a thread that holds the mutex and calls
+	 * cv_destroy().
+ */
+ mutex_enter(mp);
+ return (time_left > 0 ? time_left : -1);
+}
+
+clock_t
+__cv_timedwait(kcondvar_t *cvp, kmutex_t *mp, clock_t exp_time)
+{
+ return (__cv_timedwait_common(cvp, mp, exp_time,
+ TASK_UNINTERRUPTIBLE, 0));
+}
+EXPORT_SYMBOL(__cv_timedwait);
+
+clock_t
+__cv_timedwait_io(kcondvar_t *cvp, kmutex_t *mp, clock_t exp_time)
+{
+ return (__cv_timedwait_common(cvp, mp, exp_time,
+ TASK_UNINTERRUPTIBLE, 1));
+}
+EXPORT_SYMBOL(__cv_timedwait_io);
+
+clock_t
+__cv_timedwait_sig(kcondvar_t *cvp, kmutex_t *mp, clock_t exp_time)
+{
+ return (__cv_timedwait_common(cvp, mp, exp_time,
+ TASK_INTERRUPTIBLE, 0));
+}
+EXPORT_SYMBOL(__cv_timedwait_sig);
+
+/*
+ * 'expire_time' argument is an absolute clock time in nanoseconds.
+ * Return value is time left (expire_time - now) or -1 if timeout occurred.
+ */
+static clock_t
+__cv_timedwait_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t expire_time,
+ hrtime_t res, int state)
+{
+ DEFINE_WAIT(wait);
+ kmutex_t *m;
+ hrtime_t time_left;
+ ktime_t ktime_left;
+ u64 slack = 0;
+
+ ASSERT(cvp);
+ ASSERT(mp);
+ ASSERT(cvp->cv_magic == CV_MAGIC);
+ ASSERT(mutex_owned(mp));
+
+ time_left = expire_time - gethrtime();
+ if (time_left <= 0)
+ return (-1);
+
+ atomic_inc(&cvp->cv_refs);
+ m = READ_ONCE(cvp->cv_mutex);
+ if (!m)
+ m = xchg(&cvp->cv_mutex, mp);
+ /* Ensure the same mutex is used by all callers */
+ ASSERT(m == NULL || m == mp);
+
+ prepare_to_wait_exclusive(&cvp->cv_event, &wait, state);
+ atomic_inc(&cvp->cv_waiters);
+
+ /*
+	 * The mutex must be dropped only after prepare_to_wait(); this
+	 * ensures we are linked onto the waiters list and avoids the
+	 * race where 'cvp->cv_waiters > 0' but the list is empty.
+ */
+ mutex_exit(mp);
+
+ ktime_left = ktime_set(0, time_left);
+ slack = MIN(MAX(res, spl_schedule_hrtimeout_slack_us * NSEC_PER_USEC),
+ MAX_HRTIMEOUT_SLACK_US * NSEC_PER_USEC);
+ schedule_hrtimeout_range(&ktime_left, slack, HRTIMER_MODE_REL);
+
+	/* No more waiters, so a different mutex could now be used */
+ if (atomic_dec_and_test(&cvp->cv_waiters)) {
+ /*
+ * This is set without any lock, so it's racy. But this is
+ * just for debug anyway, so make it best-effort
+ */
+ cvp->cv_mutex = NULL;
+ wake_up(&cvp->cv_destroy);
+ }
+
+ finish_wait(&cvp->cv_event, &wait);
+ atomic_dec(&cvp->cv_refs);
+
+ mutex_enter(mp);
+ time_left = expire_time - gethrtime();
+ return (time_left > 0 ? NSEC_TO_TICK(time_left) : -1);
+}
+
+/*
+ * Compatibility wrapper for the cv_timedwait_hires() Illumos interface.
+ */
+static clock_t
+cv_timedwait_hires_common(kcondvar_t *cvp, kmutex_t *mp, hrtime_t tim,
+ hrtime_t res, int flag, int state)
+{
+ if (!(flag & CALLOUT_FLAG_ABSOLUTE))
+ tim += gethrtime();
+
+ return (__cv_timedwait_hires(cvp, mp, tim, res, state));
+}
+
+clock_t
+cv_timedwait_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t tim, hrtime_t res,
+ int flag)
+{
+ return (cv_timedwait_hires_common(cvp, mp, tim, res, flag,
+ TASK_UNINTERRUPTIBLE));
+}
+EXPORT_SYMBOL(cv_timedwait_hires);
+
+clock_t
+cv_timedwait_sig_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t tim,
+ hrtime_t res, int flag)
+{
+ return (cv_timedwait_hires_common(cvp, mp, tim, res, flag,
+ TASK_INTERRUPTIBLE));
+}
+EXPORT_SYMBOL(cv_timedwait_sig_hires);
+
+void
+__cv_signal(kcondvar_t *cvp)
+{
+ ASSERT(cvp);
+ ASSERT(cvp->cv_magic == CV_MAGIC);
+ atomic_inc(&cvp->cv_refs);
+
+ /*
+ * All waiters are added with WQ_FLAG_EXCLUSIVE so only one
+ * waiter will be set runnable with each call to wake_up().
+ * Additionally wake_up() holds a spin_lock associated with
+ * the wait queue to ensure we don't race waking up processes.
+ */
+ if (atomic_read(&cvp->cv_waiters) > 0)
+ wake_up(&cvp->cv_event);
+
+ atomic_dec(&cvp->cv_refs);
+}
+EXPORT_SYMBOL(__cv_signal);
+
+void
+__cv_broadcast(kcondvar_t *cvp)
+{
+ ASSERT(cvp);
+ ASSERT(cvp->cv_magic == CV_MAGIC);
+ atomic_inc(&cvp->cv_refs);
+
+ /*
+	 * wake_up_all() will wake up all waiters, even those which
+	 * have the WQ_FLAG_EXCLUSIVE flag set.
+ */
+ if (atomic_read(&cvp->cv_waiters) > 0)
+ wake_up_all(&cvp->cv_event);
+
+ atomic_dec(&cvp->cv_refs);
+}
+EXPORT_SYMBOL(__cv_broadcast);
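
Together these primitives implement the usual Solaris condition-variable contract: the mutex guards the predicate, is dropped while the thread sleeps, and is reacquired before the wait returns. A minimal, hypothetical producer/consumer sketch built on the public cv_wait()/cv_signal() wrappers (mutex_init()/cv_init() setup omitted):

    static kmutex_t qlock;
    static kcondvar_t qcv;
    static int qlen;

    static void
    consumer_take(void)
    {
            mutex_enter(&qlock);
            while (qlen == 0)               /* always re-test the predicate */
                    cv_wait(&qcv, &qlock);  /* drops qlock while asleep */
            qlen--;
            mutex_exit(&qlock);
    }

    static void
    producer_put(void)
    {
            mutex_enter(&qlock);
            qlen++;
            cv_signal(&qcv);                /* wakes exactly one waiter */
            mutex_exit(&qlock);
    }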
diff --git a/module/os/linux/spl/spl-cred.c b/module/os/linux/spl/spl-cred.c
new file mode 100644
index 000000000..ea3e903f9
--- /dev/null
+++ b/module/os/linux/spl/spl-cred.c
@@ -0,0 +1,200 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <[email protected]>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Solaris Porting Layer (SPL) Credential Implementation.
+ */
+
+#include <sys/cred.h>
+
+static int
+#ifdef HAVE_KUIDGID_T
+cr_groups_search(const struct group_info *group_info, kgid_t grp)
+#else
+cr_groups_search(const struct group_info *group_info, gid_t grp)
+#endif
+{
+ unsigned int left, right, mid;
+ int cmp;
+
+ if (!group_info)
+ return (0);
+
+ left = 0;
+ right = group_info->ngroups;
+ while (left < right) {
+ mid = (left + right) / 2;
+ cmp = KGID_TO_SGID(grp) -
+ KGID_TO_SGID(GROUP_AT(group_info, mid));
+
+ if (cmp > 0)
+ left = mid + 1;
+ else if (cmp < 0)
+ right = mid;
+ else
+ return (1);
+ }
+ return (0);
+}
+
+/* Hold a reference on the credential */
+void
+crhold(cred_t *cr)
+{
+ (void) get_cred((const cred_t *)cr);
+}
+
+/* Free a reference on the credential */
+void
+crfree(cred_t *cr)
+{
+ put_cred((const cred_t *)cr);
+}
+
+/* Return the number of supplemental groups */
+int
+crgetngroups(const cred_t *cr)
+{
+ struct group_info *gi;
+ int rc;
+
+ gi = cr->group_info;
+ rc = gi->ngroups;
+#ifndef HAVE_GROUP_INFO_GID
+ /*
+	 * For Linux <= 4.8, crgetgroups() will only return gi->blocks[0],
+	 * which contains only the first NGROUPS_PER_BLOCK groups.
+ */
+ if (rc > NGROUPS_PER_BLOCK) {
+ WARN_ON_ONCE(1);
+ rc = NGROUPS_PER_BLOCK;
+ }
+#endif
+ return (rc);
+}
+
+/*
+ * Return an array of supplemental gids. The returned address is safe
+ * to use as long as the caller has taken a reference with crhold().
+ *
+ * Linux 4.9 API change, group_info changed from 2d array via ->blocks to 1d
+ * array via ->gid.
+ */
+gid_t *
+crgetgroups(const cred_t *cr)
+{
+ struct group_info *gi;
+ gid_t *gids = NULL;
+
+ gi = cr->group_info;
+#ifdef HAVE_GROUP_INFO_GID
+ gids = KGIDP_TO_SGIDP(gi->gid);
+#else
+ if (gi->nblocks > 0)
+ gids = KGIDP_TO_SGIDP(gi->blocks[0]);
+#endif
+ return (gids);
+}
+
+/* Check if the passed gid is available in supplied credential. */
+int
+groupmember(gid_t gid, const cred_t *cr)
+{
+ struct group_info *gi;
+ int rc;
+
+ gi = cr->group_info;
+ rc = cr_groups_search(gi, SGID_TO_KGID(gid));
+
+ return (rc);
+}
+
+/* Return the effective user id */
+uid_t
+crgetuid(const cred_t *cr)
+{
+ return (KUID_TO_SUID(cr->euid));
+}
+
+/* Return the real user id */
+uid_t
+crgetruid(const cred_t *cr)
+{
+ return (KUID_TO_SUID(cr->uid));
+}
+
+/* Return the saved user id */
+uid_t
+crgetsuid(const cred_t *cr)
+{
+ return (KUID_TO_SUID(cr->suid));
+}
+
+/* Return the filesystem user id */
+uid_t
+crgetfsuid(const cred_t *cr)
+{
+ return (KUID_TO_SUID(cr->fsuid));
+}
+
+/* Return the effective group id */
+gid_t
+crgetgid(const cred_t *cr)
+{
+ return (KGID_TO_SGID(cr->egid));
+}
+
+/* Return the real group id */
+gid_t
+crgetrgid(const cred_t *cr)
+{
+ return (KGID_TO_SGID(cr->gid));
+}
+
+/* Return the saved group id */
+gid_t
+crgetsgid(const cred_t *cr)
+{
+ return (KGID_TO_SGID(cr->sgid));
+}
+
+/* Return the filesystem group id */
+gid_t
+crgetfsgid(const cred_t *cr)
+{
+ return (KGID_TO_SGID(cr->fsgid));
+}
+
+EXPORT_SYMBOL(crhold);
+EXPORT_SYMBOL(crfree);
+EXPORT_SYMBOL(crgetuid);
+EXPORT_SYMBOL(crgetruid);
+EXPORT_SYMBOL(crgetsuid);
+EXPORT_SYMBOL(crgetfsuid);
+EXPORT_SYMBOL(crgetgid);
+EXPORT_SYMBOL(crgetrgid);
+EXPORT_SYMBOL(crgetsgid);
+EXPORT_SYMBOL(crgetfsgid);
+EXPORT_SYMBOL(crgetngroups);
+EXPORT_SYMBOL(crgetgroups);
+EXPORT_SYMBOL(groupmember);
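
As a usage sketch, a hypothetical ownership check can be assembled from these wrappers; note that groupmember() here searches only the supplemental group list, so the effective gid is tested separately:

    /* Hypothetical example, not part of this change. */
    static int
    example_access_ok(const cred_t *cr, uid_t owner, gid_t group)
    {
            if (crgetuid(cr) == 0)          /* effective root */
                    return (1);
            if (crgetuid(cr) == owner)      /* caller owns the object */
                    return (1);
            if (crgetgid(cr) == group)      /* effective gid matches */
                    return (1);
            return (groupmember(group, cr)); /* supplemental groups */
    }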
diff --git a/module/os/linux/spl/spl-err.c b/module/os/linux/spl/spl-err.c
new file mode 100644
index 000000000..3c0bb71c0
--- /dev/null
+++ b/module/os/linux/spl/spl-err.c
@@ -0,0 +1,124 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <[email protected]>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Solaris Porting Layer (SPL) Error Implementation.
+ */
+
+#include <sys/sysmacros.h>
+#include <sys/cmn_err.h>
+
+/*
+ * It is often useful to have a panic actually crash the node so that
+ * you are notified of the event and can collect a crash dump for
+ * later analysis. By default, however, this behavior is disabled.
+ */
+/* BEGIN CSTYLED */
+unsigned int spl_panic_halt;
+module_param(spl_panic_halt, uint, 0644);
+MODULE_PARM_DESC(spl_panic_halt, "Cause kernel panic on assertion failures");
+/* END CSTYLED */
+
+void
+spl_dumpstack(void)
+{
+ printk("Showing stack for process %d\n", current->pid);
+ dump_stack();
+}
+EXPORT_SYMBOL(spl_dumpstack);
+
+int
+spl_panic(const char *file, const char *func, int line, const char *fmt, ...)
+{
+ const char *newfile;
+ char msg[MAXMSGLEN];
+ va_list ap;
+
+ newfile = strrchr(file, '/');
+ if (newfile != NULL)
+ newfile = newfile + 1;
+ else
+ newfile = file;
+
+ va_start(ap, fmt);
+ (void) vsnprintf(msg, sizeof (msg), fmt, ap);
+ va_end(ap);
+
+ printk(KERN_EMERG "%s", msg);
+ printk(KERN_EMERG "PANIC at %s:%d:%s()\n", newfile, line, func);
+ if (spl_panic_halt)
+ panic("%s", msg);
+
+ spl_dumpstack();
+
+ /* Halt the thread to facilitate further debugging */
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ while (1)
+ schedule();
+
+ /* Unreachable */
+ return (1);
+}
+EXPORT_SYMBOL(spl_panic);
+
+void
+vcmn_err(int ce, const char *fmt, va_list ap)
+{
+ char msg[MAXMSGLEN];
+
+ vsnprintf(msg, MAXMSGLEN, fmt, ap);
+
+ switch (ce) {
+ case CE_IGNORE:
+ break;
+ case CE_CONT:
+ printk("%s", msg);
+ break;
+ case CE_NOTE:
+ printk(KERN_NOTICE "NOTICE: %s\n", msg);
+ break;
+ case CE_WARN:
+ printk(KERN_WARNING "WARNING: %s\n", msg);
+ break;
+ case CE_PANIC:
+ printk(KERN_EMERG "PANIC: %s\n", msg);
+ spl_dumpstack();
+
+ /* Halt the thread to facilitate further debugging */
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ while (1)
+ schedule();
+ }
+} /* vcmn_err() */
+EXPORT_SYMBOL(vcmn_err);
+
+void
+cmn_err(int ce, const char *fmt, ...)
+{
+ va_list ap;
+
+ va_start(ap, fmt);
+ vcmn_err(ce, fmt, ap);
+ va_end(ap);
+} /* cmn_err() */
+EXPORT_SYMBOL(cmn_err);
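
For reference, a hypothetical caller exercising the severity levels above; everything short of CE_PANIC returns to the caller, while CE_PANIC dumps the stack and parks the thread for debugging:

    /* Hypothetical example, not part of this change. */
    static void
    example_report(int err)
    {
            if (err == 0)
                    cmn_err(CE_NOTE, "operation completed");
            else if (err == ENOSPC)
                    cmn_err(CE_WARN, "out of space (error %d)", err);
            else
                    cmn_err(CE_PANIC, "unrecoverable error %d", err);
    }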
diff --git a/module/os/linux/spl/spl-generic.c b/module/os/linux/spl/spl-generic.c
new file mode 100644
index 000000000..1deb2f444
--- /dev/null
+++ b/module/os/linux/spl/spl-generic.c
@@ -0,0 +1,757 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <[email protected]>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Solaris Porting Layer (SPL) Generic Implementation.
+ */
+
+#include <sys/sysmacros.h>
+#include <sys/systeminfo.h>
+#include <sys/vmsystm.h>
+#include <sys/kobj.h>
+#include <sys/kmem.h>
+#include <sys/kmem_cache.h>
+#include <sys/vmem.h>
+#include <sys/mutex.h>
+#include <sys/rwlock.h>
+#include <sys/taskq.h>
+#include <sys/tsd.h>
+#include <sys/zmod.h>
+#include <sys/debug.h>
+#include <sys/proc.h>
+#include <sys/kstat.h>
+#include <sys/file.h>
+#include <linux/ctype.h>
+#include <sys/disp.h>
+#include <sys/random.h>
+#include <sys/strings.h>
+#include <linux/kmod.h>
+#include "zfs_gitrev.h"
+
+char spl_gitrev[64] = ZFS_META_GITREV;
+
+/* BEGIN CSTYLED */
+unsigned long spl_hostid = 0;
+EXPORT_SYMBOL(spl_hostid);
+module_param(spl_hostid, ulong, 0644);
+MODULE_PARM_DESC(spl_hostid, "The system hostid.");
+/* END CSTYLED */
+
+proc_t p0;
+EXPORT_SYMBOL(p0);
+
+/*
+ * Xorshift Pseudo Random Number Generator based on work by Sebastiano Vigna
+ *
+ * "Further scramblings of Marsaglia's xorshift generators"
+ * http://vigna.di.unimi.it/ftp/papers/xorshiftplus.pdf
+ *
+ * random_get_pseudo_bytes() is an API function on Illumos whose sole purpose
+ * is to provide bytes containing random numbers. It is mapped to /dev/urandom
+ * on Illumos, which uses a "FIPS 186-2 algorithm". No user of the SPL's
+ * random_get_pseudo_bytes() needs bytes that are of cryptographic quality, so
+ * we can implement it using a fast PRNG that we seed using Linux' actual
+ * equivalent to random_get_pseudo_bytes(). We do this by providing each CPU
+ * with an independent seed so that all calls to random_get_pseudo_bytes() are
+ * free of atomic instructions.
+ *
+ * A consequence of using a fast PRNG is that using random_get_pseudo_bytes()
+ * to generate words larger than 128 bits will paradoxically be limited to
+ * `2^128 - 1` possibilities. This is because we have a sequence of `2^128 - 1`
+ * 128-bit words and selecting the first will implicitly select the second. If
+ * a caller finds this behavior undesirable, random_get_bytes() should be used
+ * instead.
+ *
+ * XXX: Linux interrupt handlers that trigger within the critical section
+ * formed by `s[1] = xp[1];` and `xp[0] = s[0];` and call this function will
+ * see the same numbers. Nothing in the code currently calls this in an
+ * interrupt handler, so this is considered to be okay. If that becomes a
+ * problem, we could create a set of per-cpu variables for interrupt handlers
+ * and use them when in_interrupt() from linux/preempt_mask.h evaluates to
+ * true.
+ */
+static DEFINE_PER_CPU(uint64_t[2], spl_pseudo_entropy);
+
+/*
+ * spl_rand_next()/spl_rand_jump() are copied from the following CC-0 licensed
+ * file:
+ *
+ * http://xorshift.di.unimi.it/xorshift128plus.c
+ */
+
+static inline uint64_t
+spl_rand_next(uint64_t *s)
+{
+ uint64_t s1 = s[0];
+ const uint64_t s0 = s[1];
+ s[0] = s0;
+ s1 ^= s1 << 23; // a
+ s[1] = s1 ^ s0 ^ (s1 >> 18) ^ (s0 >> 5); // b, c
+ return (s[1] + s0);
+}
+
+static inline void
+spl_rand_jump(uint64_t *s)
+{
+ static const uint64_t JUMP[] =
+ { 0x8a5cd789635d2dff, 0x121fd2155c472f96 };
+
+ uint64_t s0 = 0;
+ uint64_t s1 = 0;
+ int i, b;
+ for (i = 0; i < sizeof (JUMP) / sizeof (*JUMP); i++)
+ for (b = 0; b < 64; b++) {
+ if (JUMP[i] & 1ULL << b) {
+ s0 ^= s[0];
+ s1 ^= s[1];
+ }
+ (void) spl_rand_next(s);
+ }
+
+ s[0] = s0;
+ s[1] = s1;
+}
+
+int
+random_get_pseudo_bytes(uint8_t *ptr, size_t len)
+{
+ uint64_t *xp, s[2];
+
+ ASSERT(ptr);
+
+ xp = get_cpu_var(spl_pseudo_entropy);
+
+ s[0] = xp[0];
+ s[1] = xp[1];
+
+ while (len) {
+ union {
+ uint64_t ui64;
+ uint8_t byte[sizeof (uint64_t)];
+		} entropy;
+ int i = MIN(len, sizeof (uint64_t));
+
+ len -= i;
+ entropy.ui64 = spl_rand_next(s);
+
+ while (i--)
+ *ptr++ = entropy.byte[i];
+ }
+
+ xp[0] = s[0];
+ xp[1] = s[1];
+
+ put_cpu_var(spl_pseudo_entropy);
+
+ return (0);
+}
+
+
+EXPORT_SYMBOL(random_get_pseudo_bytes);
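
/*
 * Illustrative sketch, not part of this change: spl_rand_jump() advances
 * the generator by 2^64 steps, so a single master seed can be split into
 * disjoint per-CPU streams along the lines below. The module init code is
 * expected to do something similar; this function name is hypothetical.
 */
static void
example_seed_per_cpu(uint64_t seed0, uint64_t seed1)
{
	uint64_t s[2] = { seed0, seed1 };
	int cpu;

	for_each_possible_cpu(cpu) {
		uint64_t *wordp = per_cpu(spl_pseudo_entropy, cpu);

		spl_rand_jump(s);	/* next disjoint subsequence */
		wordp[0] = s[0];
		wordp[1] = s[1];
	}
}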
+
+#if BITS_PER_LONG == 32
+/*
+ * Support 64/64 => 64 division on a 32-bit platform. While the kernel
+ * provides a div64_u64() function for this we do not use it because the
+ * implementation is flawed. There are cases which return incorrect
+ * results as late as linux-2.6.35. Until this is fixed upstream the
+ * spl must provide its own implementation.
+ *
+ * This implementation is a slightly modified version of the algorithm
+ * proposed by the book 'Hacker's Delight'. The original source can be
+ * found here and is available for use without restriction.
+ *
+ * http://www.hackersdelight.org/HDcode/newCode/divDouble.c
+ */
+
+/*
+ * Calculate the number of leading zeros for a 64-bit value.
+ */
+static int
+nlz64(uint64_t x)
+{
+ register int n = 0;
+
+ if (x == 0)
+ return (64);
+
+ if (x <= 0x00000000FFFFFFFFULL) { n = n + 32; x = x << 32; }
+ if (x <= 0x0000FFFFFFFFFFFFULL) { n = n + 16; x = x << 16; }
+ if (x <= 0x00FFFFFFFFFFFFFFULL) { n = n + 8; x = x << 8; }
+ if (x <= 0x0FFFFFFFFFFFFFFFULL) { n = n + 4; x = x << 4; }
+ if (x <= 0x3FFFFFFFFFFFFFFFULL) { n = n + 2; x = x << 2; }
+ if (x <= 0x7FFFFFFFFFFFFFFFULL) { n = n + 1; }
+
+ return (n);
+}
+
+/*
+ * Newer kernels have a div_u64() function but we define our own
+ * to simplify portability between kernel versions.
+ */
+static inline uint64_t
+__div_u64(uint64_t u, uint32_t v)
+{
+ (void) do_div(u, v);
+ return (u);
+}
+
+/*
+ * Implementation of 64-bit unsigned division for 32-bit machines.
+ *
+ * First the procedure takes care of the case in which the divisor is a
+ * 32-bit quantity. There are two subcases: (1) If the left half of the
+ * dividend is less than the divisor, one execution of do_div() is all that
+ * is required (overflow is not possible). (2) Otherwise it does two
+ * divisions, using the grade school method.
+ */
+uint64_t
+__udivdi3(uint64_t u, uint64_t v)
+{
+ uint64_t u0, u1, v1, q0, q1, k;
+ int n;
+
+ if (v >> 32 == 0) { // If v < 2**32:
+ if (u >> 32 < v) { // If u/v cannot overflow,
+ return (__div_u64(u, v)); // just do one division.
+ } else { // If u/v would overflow:
+ u1 = u >> 32; // Break u into two halves.
+ u0 = u & 0xFFFFFFFF;
+ q1 = __div_u64(u1, v); // First quotient digit.
+ k = u1 - q1 * v; // First remainder, < v.
+ u0 += (k << 32);
+			q0 = __div_u64(u0, v);	// Second quotient digit.
+ return ((q1 << 32) + q0);
+ }
+ } else { // If v >= 2**32:
+ n = nlz64(v); // 0 <= n <= 31.
+ v1 = (v << n) >> 32; // Normalize divisor, MSB is 1.
+ u1 = u >> 1; // To ensure no overflow.
+ q1 = __div_u64(u1, v1); // Get quotient from
+ q0 = (q1 << n) >> 31; // Undo normalization and
+ // division of u by 2.
+ if (q0 != 0) // Make q0 correct or
+ q0 = q0 - 1; // too small by 1.
+ if ((u - q0 * v) >= v)
+ q0 = q0 + 1; // Now q0 is correct.
+
+ return (q0);
+ }
+}
+EXPORT_SYMBOL(__udivdi3);
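
/*
 * Illustrative sketch, not part of this change: one case for each path
 * through __udivdi3() above, using the SPL's ASSERT3U() macro.
 */
static void
example_check_udivdi3(void)
{
	/* v < 2^32 and (u >> 32) < v: a single do_div() suffices. */
	ASSERT3U(__udivdi3(10ULL, 3ULL), ==, 3ULL);

	/* v < 2^32 but (u >> 32) >= v: two-digit grade-school division. */
	ASSERT3U(__udivdi3(0xFFFFFFFFFFFFFFFFULL, 5ULL), ==,
	    0x3333333333333333ULL);

	/* v >= 2^32: normalized estimate, then at most one correction. */
	ASSERT3U(__udivdi3(0xFFFFFFFFFFFFFFFFULL, 0x100000001ULL), ==,
	    0xFFFFFFFFULL);
}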
+
+/* BEGIN CSTYLED */
+#ifndef abs64
+#define abs64(x) ({ uint64_t t = (x) >> 63; ((x) ^ t) - t; })
+#endif
+/* END CSTYLED */
+
+/*
+ * Implementation of 64-bit signed division for 32-bit machines.
+ */
+int64_t
+__divdi3(int64_t u, int64_t v)
+{
+ int64_t q, t;
+ q = __udivdi3(abs64(u), abs64(v));
+ t = (u ^ v) >> 63; // If u, v have different
+ return ((q ^ t) - t); // signs, negate q.
+}
+EXPORT_SYMBOL(__divdi3);
+
+/*
+ * Implementation of 64-bit unsigned modulo for 32-bit machines.
+ */
+uint64_t
+__umoddi3(uint64_t dividend, uint64_t divisor)
+{
+ return (dividend - (divisor * __udivdi3(dividend, divisor)));
+}
+EXPORT_SYMBOL(__umoddi3);
+
+/*
+ * Implementation of 64-bit unsigned division/modulo for 32-bit machines.
+ */
+uint64_t
+__udivmoddi4(uint64_t n, uint64_t d, uint64_t *r)
+{
+ uint64_t q = __udivdi3(n, d);
+ if (r)
+ *r = n - d * q;
+ return (q);
+}
+EXPORT_SYMBOL(__udivmoddi4);
+
+/*
+ * Implementation of 64-bit signed division/modulo for 32-bit machines.
+ */
+int64_t
+__divmoddi4(int64_t n, int64_t d, int64_t *r)
+{
+ int64_t q, rr;
+ boolean_t nn = B_FALSE;
+ boolean_t nd = B_FALSE;
+ if (n < 0) {
+ nn = B_TRUE;
+ n = -n;
+ }
+ if (d < 0) {
+ nd = B_TRUE;
+ d = -d;
+ }
+
+ q = __udivmoddi4(n, d, (uint64_t *)&rr);
+
+ if (nn != nd)
+ q = -q;
+ if (nn)
+ rr = -rr;
+ if (r)
+ *r = rr;
+ return (q);
+}
+EXPORT_SYMBOL(__divmoddi4);
+
+#if defined(__arm) || defined(__arm__)
+/*
+ * Implementation of 64-bit (un)signed division for 32-bit arm machines.
+ *
+ * Run-time ABI for the ARM Architecture (page 20). A pair of (unsigned)
+ * long longs is returned in {{r0, r1}, {r2,r3}}, the quotient in {r0, r1},
+ * and the remainder in {r2, r3}. The return type is specifically left
+ * set to 'void' to ensure the compiler does not overwrite these registers
+ * during the return. All results are in registers as per the ABI.
+ */
+void
+__aeabi_uldivmod(uint64_t u, uint64_t v)
+{
+ uint64_t res;
+ uint64_t mod;
+
+ res = __udivdi3(u, v);
+ mod = __umoddi3(u, v);
+ {
+ register uint32_t r0 asm("r0") = (res & 0xFFFFFFFF);
+ register uint32_t r1 asm("r1") = (res >> 32);
+ register uint32_t r2 asm("r2") = (mod & 0xFFFFFFFF);
+ register uint32_t r3 asm("r3") = (mod >> 32);
+
+ /* BEGIN CSTYLED */
+ asm volatile(""
+ : "+r"(r0), "+r"(r1), "+r"(r2),"+r"(r3) /* output */
+ : "r"(r0), "r"(r1), "r"(r2), "r"(r3)); /* input */
+ /* END CSTYLED */
+
+ return; /* r0; */
+ }
+}
+EXPORT_SYMBOL(__aeabi_uldivmod);
+
+void
+__aeabi_ldivmod(int64_t u, int64_t v)
+{
+ int64_t res;
+ uint64_t mod;
+
+ res = __divdi3(u, v);
+ mod = __umoddi3(u, v);
+ {
+ register uint32_t r0 asm("r0") = (res & 0xFFFFFFFF);
+ register uint32_t r1 asm("r1") = (res >> 32);
+ register uint32_t r2 asm("r2") = (mod & 0xFFFFFFFF);
+ register uint32_t r3 asm("r3") = (mod >> 32);
+
+ /* BEGIN CSTYLED */
+ asm volatile(""
+ : "+r"(r0), "+r"(r1), "+r"(r2),"+r"(r3) /* output */
+ : "r"(r0), "r"(r1), "r"(r2), "r"(r3)); /* input */
+ /* END CSTYLED */
+
+ return; /* r0; */
+ }
+}
+EXPORT_SYMBOL(__aeabi_ldivmod);
+#endif /* __arm || __arm__ */
+#endif /* BITS_PER_LONG */
+
+/*
+ * NOTE: The strtoxx behavior is solely based on my reading of the Solaris
+ * ddi_strtol(9F) man page. I have not verified the behavior of these
+ * functions against their Solaris counterparts. It is possible that I
+ * may have misinterpreted the man page or the man page is incorrect.
+ */
+int ddi_strtoul(const char *, char **, int, unsigned long *);
+int ddi_strtol(const char *, char **, int, long *);
+int ddi_strtoull(const char *, char **, int, unsigned long long *);
+int ddi_strtoll(const char *, char **, int, long long *);
+
+#define define_ddi_strtoux(type, valtype) \
+int ddi_strtou##type(const char *str, char **endptr, \
+ int base, valtype *result) \
+{ \
+ valtype last_value, value = 0; \
+ char *ptr = (char *)str; \
+ int flag = 1, digit; \
+ \
+ if (strlen(ptr) == 0) \
+ return (EINVAL); \
+ \
+ /* Auto-detect base based on prefix */ \
+ if (!base) { \
+ if (str[0] == '0') { \
+ if (tolower(str[1]) == 'x' && isxdigit(str[2])) { \
+ base = 16; /* hex */ \
+ ptr += 2; \
+			} else if (str[1] >= '0' && str[1] < '8') { \
+ base = 8; /* octal */ \
+ ptr += 1; \
+ } else { \
+ return (EINVAL); \
+ } \
+ } else { \
+ base = 10; /* decimal */ \
+ } \
+ } \
+ \
+ while (1) { \
+ if (isdigit(*ptr)) \
+ digit = *ptr - '0'; \
+ else if (isalpha(*ptr)) \
+ digit = tolower(*ptr) - 'a' + 10; \
+ else \
+ break; \
+ \
+ if (digit >= base) \
+ break; \
+ \
+ last_value = value; \
+ value = value * base + digit; \
+ if (last_value > value) /* Overflow */ \
+ return (ERANGE); \
+ \
+ flag = 1; \
+ ptr++; \
+ } \
+ \
+ if (flag) \
+ *result = value; \
+ \
+ if (endptr) \
+ *endptr = (char *)(flag ? ptr : str); \
+ \
+ return (0); \
+} \
+
+#define define_ddi_strtox(type, valtype) \
+int ddi_strto##type(const char *str, char **endptr, \
+ int base, valtype *result) \
+{ \
+ int rc; \
+ \
+ if (*str == '-') { \
+ rc = ddi_strtou##type(str + 1, endptr, base, result); \
+ if (!rc) { \
+ if (*endptr == str + 1) \
+ *endptr = (char *)str; \
+ else \
+ *result = -*result; \
+ } \
+ } else { \
+ rc = ddi_strtou##type(str, endptr, base, result); \
+ } \
+ \
+ return (rc); \
+}
+
+define_ddi_strtoux(l, unsigned long)
+define_ddi_strtox(l, long)
+define_ddi_strtoux(ll, unsigned long long)
+define_ddi_strtox(ll, long long)
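+
+/*
+ * Illustrative use of the generated functions, here parsing a hex string
+ * with automatic base detection:
+ *
+ *	unsigned long val;
+ *	char *end;
+ *	int error = ddi_strtoul("0x1a", &end, 0, &val);
+ *
+ * On success error == 0, val == 26, and end points at the terminating
+ * null byte.
+ */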
+
+EXPORT_SYMBOL(ddi_strtoul);
+EXPORT_SYMBOL(ddi_strtol);
+EXPORT_SYMBOL(ddi_strtoll);
+EXPORT_SYMBOL(ddi_strtoull);
+
+int
+ddi_copyin(const void *from, void *to, size_t len, int flags)
+{
+ /* Fake ioctl() issued by kernel, 'from' is a kernel address */
+ if (flags & FKIOCTL) {
+ memcpy(to, from, len);
+ return (0);
+ }
+
+ return (copyin(from, to, len));
+}
+EXPORT_SYMBOL(ddi_copyin);
+
+int
+ddi_copyout(const void *from, void *to, size_t len, int flags)
+{
+	/* Fake ioctl() issued by kernel, 'to' is a kernel address */
+ if (flags & FKIOCTL) {
+ memcpy(to, from, len);
+ return (0);
+ }
+
+ return (copyout(from, to, len));
+}
+EXPORT_SYMBOL(ddi_copyout);
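+
+/*
+ * A minimal sketch of the FKIOCTL case (hypothetical caller and argument
+ * structure): when the kernel itself issues a "fake" ioctl the passed
+ * address is already a kernel address, so the copy above degenerates to
+ * a plain memcpy():
+ *
+ *	struct my_args args;
+ *
+ *	if (ddi_copyin((void *)arg, &args, sizeof (args), FKIOCTL) != 0)
+ *		return (EFAULT);
+ */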
+
+/*
+ * Read the unique system identifier from the /etc/hostid file.
+ *
+ * The behavior of /usr/bin/hostid on Linux systems with the
+ * regular eglibc and coreutils is:
+ *
+ * 1. Generate the value if the /etc/hostid file does not exist
+ * or if the /etc/hostid file is less than four bytes in size.
+ *
+ * 2. If the /etc/hostid file is at least 4 bytes, then return
+ * the first four bytes [0..3] in native endian order.
+ *
+ * 3. Always ignore bytes [4..] if they exist in the file.
+ *
+ * Only the first four bytes are significant, even on systems that
+ * have a 64-bit word size.
+ *
+ * See:
+ *
+ * eglibc: sysdeps/unix/sysv/linux/gethostid.c
+ * coreutils: src/hostid.c
+ *
+ * Notes:
+ *
+ * The /etc/hostid file on Solaris is a text file that often reads:
+ *
+ * # DO NOT EDIT
+ * "0123456789"
+ *
+ * Directly copying this file to Linux results in a constant
+ * hostid of 4f442023 because the default comment constitutes
+ * the first four bytes of the file.
+ *
+ */
+
+char *spl_hostid_path = HW_HOSTID_PATH;
+module_param(spl_hostid_path, charp, 0444);
+MODULE_PARM_DESC(spl_hostid_path, "The system hostid file (/etc/hostid)");
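+
+/*
+ * For example (illustrative), the lookup path may be overridden when the
+ * module is loaded:
+ *
+ *	modprobe spl spl_hostid_path=/etc/opt/hostid
+ */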
+
+static int
+hostid_read(uint32_t *hostid)
+{
+ uint64_t size;
+ struct _buf *file;
+ uint32_t value = 0;
+ int error;
+
+ file = kobj_open_file(spl_hostid_path);
+ if (file == (struct _buf *)-1)
+ return (ENOENT);
+
+ error = kobj_get_filesize(file, &size);
+ if (error) {
+ kobj_close_file(file);
+ return (error);
+ }
+
+ if (size < sizeof (HW_HOSTID_MASK)) {
+ kobj_close_file(file);
+ return (EINVAL);
+ }
+
+ /*
+ * Read directly into the variable like eglibc does.
+ * Short reads are okay; native behavior is preserved.
+ */
+ error = kobj_read_file(file, (char *)&value, sizeof (value), 0);
+ if (error < 0) {
+ kobj_close_file(file);
+ return (EIO);
+ }
+
+ /* Mask down to 32 bits like coreutils does. */
+ *hostid = (value & HW_HOSTID_MASK);
+ kobj_close_file(file);
+
+ return (0);
+}
+
+/*
+ * Return the system hostid. Preferentially use the spl_hostid module option
+ * when set, otherwise use the value in the /etc/hostid file.
+ */
+uint32_t
+zone_get_hostid(void *zone)
+{
+ uint32_t hostid;
+
+ ASSERT3P(zone, ==, NULL);
+
+ if (spl_hostid != 0)
+ return ((uint32_t)(spl_hostid & HW_HOSTID_MASK));
+
+ if (hostid_read(&hostid) == 0)
+ return (hostid);
+
+ return (0);
+}
+EXPORT_SYMBOL(zone_get_hostid);
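+
+/*
+ * For example (illustrative), the hostid may be pinned at module load
+ * time with the spl_hostid option, which takes precedence over the
+ * /etc/hostid file per the logic above:
+ *
+ *	options spl spl_hostid=0x00bab10c
+ */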
+
+static int
+spl_kvmem_init(void)
+{
+ int rc = 0;
+
+ rc = spl_kmem_init();
+ if (rc)
+ return (rc);
+
+ rc = spl_vmem_init();
+ if (rc) {
+ spl_kmem_fini();
+ return (rc);
+ }
+
+ return (rc);
+}
+
+/*
+ * We initialize the random number generator with 128 bits of entropy from the
+ * system random number generator.  In the improbable case that the seed is
+ * zero, we fall back to the system jiffies, and if those are also zero we
+ * use a preprogrammed seed.  We then step each per-cpu seed forward by 2^64
+ * iterations so that the sequences generated on different CPUs will never
+ * overlap in practice.
+ */
+static void __init
+spl_random_init(void)
+{
+ uint64_t s[2];
+ int i;
+
+ get_random_bytes(s, sizeof (s));
+
+ if (s[0] == 0 && s[1] == 0) {
+ if (jiffies != 0) {
+ s[0] = jiffies;
+ s[1] = ~0 - jiffies;
+ } else {
+ (void) memcpy(s, "improbable seed", sizeof (s));
+ }
+ printk("SPL: get_random_bytes() returned 0 "
+ "when generating random seed. Setting initial seed to "
+ "0x%016llx%016llx.\n", cpu_to_be64(s[0]),
+ cpu_to_be64(s[1]));
+ }
+
+ for_each_possible_cpu(i) {
+ uint64_t *wordp = per_cpu(spl_pseudo_entropy, i);
+
+ spl_rand_jump(s);
+
+ wordp[0] = s[0];
+ wordp[1] = s[1];
+ }
+}
+
+static void
+spl_kvmem_fini(void)
+{
+ spl_vmem_fini();
+ spl_kmem_fini();
+}
+
+static int __init
+spl_init(void)
+{
+ int rc = 0;
+
+ bzero(&p0, sizeof (proc_t));
+ spl_random_init();
+
+ if ((rc = spl_kvmem_init()))
+ goto out1;
+
+ if ((rc = spl_tsd_init()))
+ goto out2;
+
+ if ((rc = spl_taskq_init()))
+ goto out3;
+
+ if ((rc = spl_kmem_cache_init()))
+ goto out4;
+
+ if ((rc = spl_vn_init()))
+ goto out5;
+
+ if ((rc = spl_proc_init()))
+ goto out6;
+
+ if ((rc = spl_kstat_init()))
+ goto out7;
+
+ if ((rc = spl_zlib_init()))
+ goto out8;
+
+ return (rc);
+
+out8:
+ spl_kstat_fini();
+out7:
+ spl_proc_fini();
+out6:
+ spl_vn_fini();
+out5:
+ spl_kmem_cache_fini();
+out4:
+ spl_taskq_fini();
+out3:
+ spl_tsd_fini();
+out2:
+ spl_kvmem_fini();
+out1:
+ return (rc);
+}
+
+static void __exit
+spl_fini(void)
+{
+ spl_zlib_fini();
+ spl_kstat_fini();
+ spl_proc_fini();
+ spl_vn_fini();
+ spl_kmem_cache_fini();
+ spl_taskq_fini();
+ spl_tsd_fini();
+ spl_kvmem_fini();
+}
+
+module_init(spl_init);
+module_exit(spl_fini);
+
+MODULE_DESCRIPTION("Solaris Porting Layer");
+MODULE_AUTHOR(ZFS_META_AUTHOR);
+MODULE_LICENSE("GPL");
+MODULE_VERSION(ZFS_META_VERSION "-" ZFS_META_RELEASE);
diff --git a/module/os/linux/spl/spl-kmem-cache.c b/module/os/linux/spl/spl-kmem-cache.c
new file mode 100644
index 000000000..b39867b03
--- /dev/null
+++ b/module/os/linux/spl/spl-kmem-cache.c
@@ -0,0 +1,1780 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <[email protected]>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <sys/kmem.h>
+#include <sys/kmem_cache.h>
+#include <sys/shrinker.h>
+#include <sys/taskq.h>
+#include <sys/timer.h>
+#include <sys/vmem.h>
+#include <sys/wait.h>
+#include <linux/slab.h>
+#include <linux/swap.h>
+#include <linux/prefetch.h>
+
+/*
+ * Within the scope of the spl-kmem-cache.c file the kmem_cache_*
+ * definitions are removed to allow access to the real Linux slab
+ * allocator.
+ */
+#undef kmem_cache_destroy
+#undef kmem_cache_create
+#undef kmem_cache_alloc
+#undef kmem_cache_free
+
+
+/*
+ * Linux 3.16 replaced smp_mb__{before,after}_{atomic,clear}_{dec,inc,bit}()
+ * with smp_mb__{before,after}_atomic() because they were redundant. This is
+ * only used inside our SLAB allocator, so we implement an internal wrapper
+ * here to give us smp_mb__{before,after}_atomic() on older kernels.
+ */
+#ifndef smp_mb__before_atomic
+#define smp_mb__before_atomic(x) smp_mb__before_clear_bit(x)
+#endif
+
+#ifndef smp_mb__after_atomic
+#define smp_mb__after_atomic(x) smp_mb__after_clear_bit(x)
+#endif
+
+/*
+ * Cache expiration was implemented because it was part of the default Solaris
+ * kmem_cache behavior.  The idea is that per-cpu objects which haven't been
+ * accessed in several seconds should be returned to their slabs.  Linux, on
+ * the other hand, only returns per-cpu objects to the slabs when there is
+ * memory pressure on the system.  By default the Linux method is enabled
+ * because it has been shown to improve responsiveness on low memory systems.
+ * This policy may be changed by setting KMC_EXPIRE_AGE or KMC_EXPIRE_MEM.
+ */
+/* BEGIN CSTYLED */
+unsigned int spl_kmem_cache_expire = KMC_EXPIRE_MEM;
+EXPORT_SYMBOL(spl_kmem_cache_expire);
+module_param(spl_kmem_cache_expire, uint, 0644);
+MODULE_PARM_DESC(spl_kmem_cache_expire, "By age (0x1) or low memory (0x2)");
+
+/*
+ * Cache magazines are an optimization designed to minimize the cost of
+ * allocating memory. They do this by keeping a per-cpu cache of recently
+ * freed objects, which can then be reallocated without taking a lock. This
+ * can improve performance on highly contended caches. However, because
+ * objects in magazines will prevent otherwise empty slabs from being
+ * immediately released this may not be ideal for low memory machines.
+ *
+ * For this reason spl_kmem_cache_magazine_size can be used to set a maximum
+ * magazine size. When this value is set to 0 the magazine size will be
+ * automatically determined based on the object size.  Otherwise the size
+ * is clamped to the range of 2-256 objects per magazine (i.e. per cpu).
+ * Magazines may never be entirely disabled in this implementation.
+ */
+unsigned int spl_kmem_cache_magazine_size = 0;
+module_param(spl_kmem_cache_magazine_size, uint, 0444);
+MODULE_PARM_DESC(spl_kmem_cache_magazine_size,
+ "Default magazine size (2-256), set automatically (0)");
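+
+/*
+ * For example (illustrative), a memory constrained system could cap every
+ * magazine at 8 objects per cpu from modprobe.d:
+ *
+ *	options spl spl_kmem_cache_magazine_size=8
+ */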
+
+/*
+ * The default behavior is to report the number of objects remaining in the
+ * cache. This allows the Linux VM to repeatedly reclaim objects from the
+ * cache when memory is low to satisfy other memory allocations.  Alternatively,
+ * setting this value to KMC_RECLAIM_ONCE limits how aggressively the cache
+ * is reclaimed. This may increase the likelihood of out of memory events.
+ */
+unsigned int spl_kmem_cache_reclaim = 0 /* KMC_RECLAIM_ONCE */;
+module_param(spl_kmem_cache_reclaim, uint, 0644);
+MODULE_PARM_DESC(spl_kmem_cache_reclaim, "Single reclaim pass (0x1)");
+
+unsigned int spl_kmem_cache_obj_per_slab = SPL_KMEM_CACHE_OBJ_PER_SLAB;
+module_param(spl_kmem_cache_obj_per_slab, uint, 0644);
+MODULE_PARM_DESC(spl_kmem_cache_obj_per_slab, "Number of objects per slab");
+
+unsigned int spl_kmem_cache_obj_per_slab_min = SPL_KMEM_CACHE_OBJ_PER_SLAB_MIN;
+module_param(spl_kmem_cache_obj_per_slab_min, uint, 0644);
+MODULE_PARM_DESC(spl_kmem_cache_obj_per_slab_min,
+ "Minimal number of objects per slab");
+
+unsigned int spl_kmem_cache_max_size = SPL_KMEM_CACHE_MAX_SIZE;
+module_param(spl_kmem_cache_max_size, uint, 0644);
+MODULE_PARM_DESC(spl_kmem_cache_max_size, "Maximum size of slab in MB");
+
+/*
+ * For small objects the Linux slab allocator should be used to make the most
+ * efficient use of the memory. However, large objects are not supported by
+ * the Linux slab and therefore the SPL implementation is preferred. A cutoff
+ * of 16K was determined to be optimal for architectures using 4K pages.
+ */
+#if PAGE_SIZE == 4096
+unsigned int spl_kmem_cache_slab_limit = 16384;
+#else
+unsigned int spl_kmem_cache_slab_limit = 0;
+#endif
+module_param(spl_kmem_cache_slab_limit, uint, 0644);
+MODULE_PARM_DESC(spl_kmem_cache_slab_limit,
+ "Objects less than N bytes use the Linux slab");
+
+/*
+ * This value defaults to a threshold designed to avoid allocations which
+ * have been deemed costly by the kernel.
+ */
+unsigned int spl_kmem_cache_kmem_limit =
+ ((1 << (PAGE_ALLOC_COSTLY_ORDER - 1)) * PAGE_SIZE) /
+ SPL_KMEM_CACHE_OBJ_PER_SLAB;
+module_param(spl_kmem_cache_kmem_limit, uint, 0644);
+MODULE_PARM_DESC(spl_kmem_cache_kmem_limit,
+ "Objects less than N bytes use the kmalloc");
+
+/*
+ * The number of threads available to allocate new slabs for caches. This
+ * should not need to be tuned but it is available for performance analysis.
+ */
+unsigned int spl_kmem_cache_kmem_threads = 4;
+module_param(spl_kmem_cache_kmem_threads, uint, 0444);
+MODULE_PARM_DESC(spl_kmem_cache_kmem_threads,
+ "Number of spl_kmem_cache threads");
+/* END CSTYLED */
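+
+/*
+ * The writable (0644) tunables above may also be adjusted at runtime
+ * through sysfs, for example (illustrative, assuming the module is
+ * loaded as 'spl'):
+ *
+ *	echo 1 > /sys/module/spl/parameters/spl_kmem_cache_expire
+ */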
+
+/*
+ * Slab allocation interfaces
+ *
+ * While the Linux slab implementation was inspired by the Solaris
+ * implementation I cannot use it to emulate the Solaris APIs. I
+ * require two features which are not provided by the Linux slab.
+ *
+ * 1) Constructors AND destructors.  Recent versions of the Linux
+ *    kernel have removed support for destructors.  This is a deal
+ *    breaker for the SPL which contains particularly expensive
+ *    initializers for mutexes, condition variables, etc.  We also
+ *    require a minimal level of cleanup for these data types, unlike
+ *    many Linux data types which do not need to be explicitly
+ *    destroyed.
+ *
+ * 2) Virtual address space backed slab.  Callers of the Solaris slab
+ *    expect it to work well for both small and very large allocations.
+ *    Because of memory fragmentation the Linux slab, which is backed
+ *    by kmalloc'ed memory, performs very badly when confronted with
+ *    large numbers of large allocations.  Basing the slab on the
+ *    virtual address space removes the need for contiguous pages
+ *    and greatly improves performance for large allocations.
+ *
+ * For these reasons, the SPL has its own slab implementation with
+ * the needed features. It is not as highly optimized as either the
+ * Solaris or Linux slabs, but it should get me most of what is
+ * needed until it can be optimized or obsoleted by another approach.
+ *
+ * One serious concern I do have about this method is the relatively
+ * small virtual address space on 32-bit arches.  This will seriously
+ * constrain the size of the slab caches and their performance.
+ */
+
+struct list_head spl_kmem_cache_list; /* List of caches */
+struct rw_semaphore spl_kmem_cache_sem; /* Cache list lock */
+taskq_t *spl_kmem_cache_taskq; /* Task queue for aging / reclaim */
+
+static void spl_cache_shrink(spl_kmem_cache_t *skc, void *obj);
+
+SPL_SHRINKER_CALLBACK_FWD_DECLARE(spl_kmem_cache_generic_shrinker);
+SPL_SHRINKER_DECLARE(spl_kmem_cache_shrinker,
+ spl_kmem_cache_generic_shrinker, KMC_DEFAULT_SEEKS);
+
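+/*
+ * Allocate the backing memory for a slab: physically contiguous pages
+ * via __get_free_pages() for KMC_KMEM caches, otherwise virtually
+ * contiguous memory via __vmalloc().  Either way the returned memory
+ * is page aligned.
+ */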
+static void *
+kv_alloc(spl_kmem_cache_t *skc, int size, int flags)
+{
+ gfp_t lflags = kmem_flags_convert(flags);
+ void *ptr;
+
+ if (skc->skc_flags & KMC_KMEM) {
+ ASSERT(ISP2(size));
+ ptr = (void *)__get_free_pages(lflags, get_order(size));
+ } else {
+ ptr = __vmalloc(size, lflags | __GFP_HIGHMEM, PAGE_KERNEL);
+ }
+
+ /* Resulting allocated memory will be page aligned */
+ ASSERT(IS_P2ALIGNED(ptr, PAGE_SIZE));
+
+ return (ptr);
+}
+
+static void
+kv_free(spl_kmem_cache_t *skc, void *ptr, int size)
+{
+ ASSERT(IS_P2ALIGNED(ptr, PAGE_SIZE));
+
+ /*
+ * The Linux direct reclaim path uses this out of band value to
+ * determine if forward progress is being made. Normally this is
+ * incremented by kmem_freepages() which is part of the various
+ * Linux slab implementations. However, since we are using none
+ * of that infrastructure we are responsible for incrementing it.
+ */
+ if (current->reclaim_state)
+ current->reclaim_state->reclaimed_slab += size >> PAGE_SHIFT;
+
+ if (skc->skc_flags & KMC_KMEM) {
+ ASSERT(ISP2(size));
+ free_pages((unsigned long)ptr, get_order(size));
+ } else {
+ vfree(ptr);
+ }
+}
+
+/*
+ * Required space for each aligned sks.
+ */
+static inline uint32_t
+spl_sks_size(spl_kmem_cache_t *skc)
+{
+ return (P2ROUNDUP_TYPED(sizeof (spl_kmem_slab_t),
+ skc->skc_obj_align, uint32_t));
+}
+
+/*
+ * Required space for each aligned object.
+ */
+static inline uint32_t
+spl_obj_size(spl_kmem_cache_t *skc)
+{
+ uint32_t align = skc->skc_obj_align;
+
+ return (P2ROUNDUP_TYPED(skc->skc_obj_size, align, uint32_t) +
+ P2ROUNDUP_TYPED(sizeof (spl_kmem_obj_t), align, uint32_t));
+}
+
+/*
+ * Lookup the spl_kmem_obj_t metadata which is placed immediately after
+ * an aligned object.
+ */
+static inline spl_kmem_obj_t *
+spl_sko_from_obj(spl_kmem_cache_t *skc, void *obj)
+{
+ return (obj + P2ROUNDUP_TYPED(skc->skc_obj_size,
+ skc->skc_obj_align, uint32_t));
+}
+
+/*
+ * Required space for each offslab object taking into account alignment
+ * restrictions and the power-of-two requirement of kv_alloc().  For
+ * example, a 20 KiB object is rounded up to a 64 KiB allocation.
+ */
+static inline uint32_t
+spl_offslab_size(spl_kmem_cache_t *skc)
+{
+ return (1UL << (fls64(spl_obj_size(skc)) + 1));
+}
+
+/*
+ * It's important that we pack the spl_kmem_obj_t structure and the
+ * actual objects into one large address space to minimize the number
+ * of calls to the allocator.  It is far better to do a few large
+ * allocations and then subdivide them ourselves.  Which allocator
+ * we use requires balancing a few trade-offs.
+ *
+ * For small objects we use kmem_alloc() because as long as you are
+ * only requesting a small number of pages (ideally just one) it's cheap.
+ * However, when you start requesting multiple pages with kmem_alloc()
+ * it gets increasingly expensive since it requires contiguous pages.
+ * For this reason we shift to vmem_alloc() for slabs of large objects
+ * which removes the need for contiguous pages. We do not use
+ * vmem_alloc() in all cases because there is significant locking
+ * overhead in __get_vm_area_node(). This function takes a single
+ * global lock when acquiring an available virtual address range which
+ * serializes all vmem_alloc()'s for all slab caches. Using slightly
+ * different allocation functions for small and large objects should
+ * give us the best of both worlds.
+ *
+ * KMC_ONSLAB KMC_OFFSLAB
+ *
+ * +------------------------+ +-----------------+
+ * | spl_kmem_slab_t --+-+ | | spl_kmem_slab_t |---+-+
+ * | skc_obj_size <-+ | | +-----------------+ | |
+ * | spl_kmem_obj_t | | | |
+ * | skc_obj_size <---+ | +-----------------+ | |
+ * | spl_kmem_obj_t | | | skc_obj_size | <-+ |
+ * | ... v | | spl_kmem_obj_t | |
+ * +------------------------+ +-----------------+ v
+ */
+static spl_kmem_slab_t *
+spl_slab_alloc(spl_kmem_cache_t *skc, int flags)
+{
+ spl_kmem_slab_t *sks;
+ spl_kmem_obj_t *sko, *n;
+ void *base, *obj;
+ uint32_t obj_size, offslab_size = 0;
+ int i, rc = 0;
+
+ base = kv_alloc(skc, skc->skc_slab_size, flags);
+ if (base == NULL)
+ return (NULL);
+
+ sks = (spl_kmem_slab_t *)base;
+ sks->sks_magic = SKS_MAGIC;
+ sks->sks_objs = skc->skc_slab_objs;
+ sks->sks_age = jiffies;
+ sks->sks_cache = skc;
+ INIT_LIST_HEAD(&sks->sks_list);
+ INIT_LIST_HEAD(&sks->sks_free_list);
+ sks->sks_ref = 0;
+ obj_size = spl_obj_size(skc);
+
+ if (skc->skc_flags & KMC_OFFSLAB)
+ offslab_size = spl_offslab_size(skc);
+
+ for (i = 0; i < sks->sks_objs; i++) {
+ if (skc->skc_flags & KMC_OFFSLAB) {
+ obj = kv_alloc(skc, offslab_size, flags);
+ if (!obj) {
+ rc = -ENOMEM;
+ goto out;
+ }
+ } else {
+ obj = base + spl_sks_size(skc) + (i * obj_size);
+ }
+
+ ASSERT(IS_P2ALIGNED(obj, skc->skc_obj_align));
+ sko = spl_sko_from_obj(skc, obj);
+ sko->sko_addr = obj;
+ sko->sko_magic = SKO_MAGIC;
+ sko->sko_slab = sks;
+ INIT_LIST_HEAD(&sko->sko_list);
+ list_add_tail(&sko->sko_list, &sks->sks_free_list);
+ }
+
+out:
+ if (rc) {
+ if (skc->skc_flags & KMC_OFFSLAB)
+ list_for_each_entry_safe(sko,
+ n, &sks->sks_free_list, sko_list) {
+ kv_free(skc, sko->sko_addr, offslab_size);
+ }
+
+ kv_free(skc, base, skc->skc_slab_size);
+ sks = NULL;
+ }
+
+ return (sks);
+}
+
+/*
+ * Remove a slab from the complete or partial list.  This must be called
+ * with 'skc->skc_lock' held, but the actual free must be performed
+ * outside the lock to prevent deadlocking on vmem addresses.
+ */
+static void
+spl_slab_free(spl_kmem_slab_t *sks,
+ struct list_head *sks_list, struct list_head *sko_list)
+{
+ spl_kmem_cache_t *skc;
+
+ ASSERT(sks->sks_magic == SKS_MAGIC);
+ ASSERT(sks->sks_ref == 0);
+
+ skc = sks->sks_cache;
+ ASSERT(skc->skc_magic == SKC_MAGIC);
+
+ /*
+ * Update slab/objects counters in the cache, then remove the
+ * slab from the skc->skc_partial_list. Finally add the slab
+ * and all its objects in to the private work lists where the
+ * destructors will be called and the memory freed to the system.
+ */
+ skc->skc_obj_total -= sks->sks_objs;
+ skc->skc_slab_total--;
+ list_del(&sks->sks_list);
+ list_add(&sks->sks_list, sks_list);
+ list_splice_init(&sks->sks_free_list, sko_list);
+}
+
+/*
+ * Reclaim empty slabs at the end of the partial list.
+ */
+static void
+spl_slab_reclaim(spl_kmem_cache_t *skc)
+{
+ spl_kmem_slab_t *sks, *m;
+ spl_kmem_obj_t *sko, *n;
+ LIST_HEAD(sks_list);
+ LIST_HEAD(sko_list);
+ uint32_t size = 0;
+
+ /*
+ * Empty slabs and objects must be moved to a private list so they
+ * can be safely freed outside the spin lock. All empty slabs are
+ * at the end of skc->skc_partial_list, therefore once a non-empty
+ * slab is found we can stop scanning.
+ */
+ spin_lock(&skc->skc_lock);
+ list_for_each_entry_safe_reverse(sks, m,
+ &skc->skc_partial_list, sks_list) {
+
+ if (sks->sks_ref > 0)
+ break;
+
+ spl_slab_free(sks, &sks_list, &sko_list);
+ }
+ spin_unlock(&skc->skc_lock);
+
+ /*
+ * The following two loops ensure all the object destructors are
+ * run, any offslab objects are freed, and the slabs themselves
+ * are freed. This is all done outside the skc->skc_lock since
+ * this allows the destructor to sleep, and allows us to perform
+	 * a conditional reschedule when freeing a large number of
+ * objects and slabs back to the system.
+ */
+ if (skc->skc_flags & KMC_OFFSLAB)
+ size = spl_offslab_size(skc);
+
+ list_for_each_entry_safe(sko, n, &sko_list, sko_list) {
+ ASSERT(sko->sko_magic == SKO_MAGIC);
+
+ if (skc->skc_flags & KMC_OFFSLAB)
+ kv_free(skc, sko->sko_addr, size);
+ }
+
+ list_for_each_entry_safe(sks, m, &sks_list, sks_list) {
+ ASSERT(sks->sks_magic == SKS_MAGIC);
+ kv_free(skc, sks, skc->skc_slab_size);
+ }
+}
+
+static spl_kmem_emergency_t *
+spl_emergency_search(struct rb_root *root, void *obj)
+{
+ struct rb_node *node = root->rb_node;
+ spl_kmem_emergency_t *ske;
+ unsigned long address = (unsigned long)obj;
+
+ while (node) {
+ ske = container_of(node, spl_kmem_emergency_t, ske_node);
+
+ if (address < ske->ske_obj)
+ node = node->rb_left;
+ else if (address > ske->ske_obj)
+ node = node->rb_right;
+ else
+ return (ske);
+ }
+
+ return (NULL);
+}
+
+static int
+spl_emergency_insert(struct rb_root *root, spl_kmem_emergency_t *ske)
+{
+ struct rb_node **new = &(root->rb_node), *parent = NULL;
+ spl_kmem_emergency_t *ske_tmp;
+ unsigned long address = ske->ske_obj;
+
+ while (*new) {
+ ske_tmp = container_of(*new, spl_kmem_emergency_t, ske_node);
+
+ parent = *new;
+ if (address < ske_tmp->ske_obj)
+ new = &((*new)->rb_left);
+ else if (address > ske_tmp->ske_obj)
+ new = &((*new)->rb_right);
+ else
+ return (0);
+ }
+
+ rb_link_node(&ske->ske_node, parent, new);
+ rb_insert_color(&ske->ske_node, root);
+
+ return (1);
+}
+
+/*
+ * Allocate a single emergency object and track it in a red black tree.
+ */
+static int
+spl_emergency_alloc(spl_kmem_cache_t *skc, int flags, void **obj)
+{
+ gfp_t lflags = kmem_flags_convert(flags);
+ spl_kmem_emergency_t *ske;
+ int order = get_order(skc->skc_obj_size);
+ int empty;
+
+	/* Last chance: use a partial slab if one now exists */
+ spin_lock(&skc->skc_lock);
+ empty = list_empty(&skc->skc_partial_list);
+ spin_unlock(&skc->skc_lock);
+ if (!empty)
+ return (-EEXIST);
+
+ ske = kmalloc(sizeof (*ske), lflags);
+ if (ske == NULL)
+ return (-ENOMEM);
+
+ ske->ske_obj = __get_free_pages(lflags, order);
+ if (ske->ske_obj == 0) {
+ kfree(ske);
+ return (-ENOMEM);
+ }
+
+ spin_lock(&skc->skc_lock);
+ empty = spl_emergency_insert(&skc->skc_emergency_tree, ske);
+ if (likely(empty)) {
+ skc->skc_obj_total++;
+ skc->skc_obj_emergency++;
+ if (skc->skc_obj_emergency > skc->skc_obj_emergency_max)
+ skc->skc_obj_emergency_max = skc->skc_obj_emergency;
+ }
+ spin_unlock(&skc->skc_lock);
+
+ if (unlikely(!empty)) {
+ free_pages(ske->ske_obj, order);
+ kfree(ske);
+ return (-EINVAL);
+ }
+
+ *obj = (void *)ske->ske_obj;
+
+ return (0);
+}
+
+/*
+ * Locate the passed object in the red black tree and free it.
+ */
+static int
+spl_emergency_free(spl_kmem_cache_t *skc, void *obj)
+{
+ spl_kmem_emergency_t *ske;
+ int order = get_order(skc->skc_obj_size);
+
+ spin_lock(&skc->skc_lock);
+ ske = spl_emergency_search(&skc->skc_emergency_tree, obj);
+ if (ske) {
+ rb_erase(&ske->ske_node, &skc->skc_emergency_tree);
+ skc->skc_obj_emergency--;
+ skc->skc_obj_total--;
+ }
+ spin_unlock(&skc->skc_lock);
+
+ if (ske == NULL)
+ return (-ENOENT);
+
+ free_pages(ske->ske_obj, order);
+ kfree(ske);
+
+ return (0);
+}
+
+/*
+ * Release objects from the per-cpu magazine back to their slab. The flush
+ * argument contains the max number of entries to remove from the magazine.
+ */
+static void
+__spl_cache_flush(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flush)
+{
+ int i, count = MIN(flush, skm->skm_avail);
+
+ ASSERT(skc->skc_magic == SKC_MAGIC);
+ ASSERT(skm->skm_magic == SKM_MAGIC);
+
+ for (i = 0; i < count; i++)
+ spl_cache_shrink(skc, skm->skm_objs[i]);
+
+ skm->skm_avail -= count;
+ memmove(skm->skm_objs, &(skm->skm_objs[count]),
+ sizeof (void *) * skm->skm_avail);
+}
+
+static void
+spl_cache_flush(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flush)
+{
+ spin_lock(&skc->skc_lock);
+ __spl_cache_flush(skc, skm, flush);
+ spin_unlock(&skc->skc_lock);
+}
+
+static void
+spl_magazine_age(void *data)
+{
+ spl_kmem_cache_t *skc = (spl_kmem_cache_t *)data;
+ spl_kmem_magazine_t *skm = skc->skc_mag[smp_processor_id()];
+
+ ASSERT(skm->skm_magic == SKM_MAGIC);
+ ASSERT(skm->skm_cpu == smp_processor_id());
+ ASSERT(irqs_disabled());
+
+ /* There are no available objects or they are too young to age out */
+ if ((skm->skm_avail == 0) ||
+ time_before(jiffies, skm->skm_age + skc->skc_delay * HZ))
+ return;
+
+ /*
+ * Because we're executing in interrupt context we may have
+ * interrupted the holder of this lock. To avoid a potential
+ * deadlock return if the lock is contended.
+ */
+ if (!spin_trylock(&skc->skc_lock))
+ return;
+
+ __spl_cache_flush(skc, skm, skm->skm_refill);
+ spin_unlock(&skc->skc_lock);
+}
+
+/*
+ * Called regularly to keep a downward pressure on the cache.
+ *
+ * Objects older than skc->skc_delay seconds in the per-cpu magazines will
+ * be returned to the caches. This is done to prevent idle magazines from
+ * holding memory which could be better used elsewhere. The delay is
+ * present to prevent thrashing the magazine.
+ *
+ * The newly released objects may result in empty partial slabs. Those
+ * slabs should be released to the system. Otherwise moving the objects
+ * out of the magazines is just wasted work.
+ */
+static void
+spl_cache_age(void *data)
+{
+ spl_kmem_cache_t *skc = (spl_kmem_cache_t *)data;
+ taskqid_t id = 0;
+
+ ASSERT(skc->skc_magic == SKC_MAGIC);
+
+ /* Dynamically disabled at run time */
+ if (!(spl_kmem_cache_expire & KMC_EXPIRE_AGE))
+ return;
+
+ atomic_inc(&skc->skc_ref);
+
+ if (!(skc->skc_flags & KMC_NOMAGAZINE))
+ on_each_cpu(spl_magazine_age, skc, 1);
+
+ spl_slab_reclaim(skc);
+
+ while (!test_bit(KMC_BIT_DESTROY, &skc->skc_flags) && !id) {
+ id = taskq_dispatch_delay(
+ spl_kmem_cache_taskq, spl_cache_age, skc, TQ_SLEEP,
+ ddi_get_lbolt() + skc->skc_delay / 3 * HZ);
+
+ /* Destroy issued after dispatch immediately cancel it */
+ if (test_bit(KMC_BIT_DESTROY, &skc->skc_flags) && id)
+ taskq_cancel_id(spl_kmem_cache_taskq, id);
+ }
+
+ spin_lock(&skc->skc_lock);
+ skc->skc_taskqid = id;
+ spin_unlock(&skc->skc_lock);
+
+ atomic_dec(&skc->skc_ref);
+}
+
+/*
+ * Size a slab based on the size of each aligned object plus spl_kmem_obj_t.
+ * When on-slab we want to target spl_kmem_cache_obj_per_slab. However,
+ * for very small objects we may end up with more than this so as not
+ * to waste space in the minimal allocation of a single page. Also for
+ * very large objects we may use as few as spl_kmem_cache_obj_per_slab_min;
+ * anything lower than this will fail.
+ */
+static int
+spl_slab_size(spl_kmem_cache_t *skc, uint32_t *objs, uint32_t *size)
+{
+ uint32_t sks_size, obj_size, max_size, tgt_size, tgt_objs;
+
+ if (skc->skc_flags & KMC_OFFSLAB) {
+ tgt_objs = spl_kmem_cache_obj_per_slab;
+ tgt_size = P2ROUNDUP(sizeof (spl_kmem_slab_t), PAGE_SIZE);
+
+ if ((skc->skc_flags & KMC_KMEM) &&
+ (spl_obj_size(skc) > (SPL_MAX_ORDER_NR_PAGES * PAGE_SIZE)))
+ return (-ENOSPC);
+ } else {
+ sks_size = spl_sks_size(skc);
+ obj_size = spl_obj_size(skc);
+ max_size = (spl_kmem_cache_max_size * 1024 * 1024);
+ tgt_size = (spl_kmem_cache_obj_per_slab * obj_size + sks_size);
+
+ /*
+ * KMC_KMEM slabs are allocated by __get_free_pages() which
+ * rounds up to the nearest order. Knowing this the size
+ * should be rounded up to the next power of two with a hard
+ * maximum defined by the maximum allowed allocation order.
+ */
+ if (skc->skc_flags & KMC_KMEM) {
+ max_size = SPL_MAX_ORDER_NR_PAGES * PAGE_SIZE;
+ tgt_size = MIN(max_size,
+ PAGE_SIZE * (1 << MAX(get_order(tgt_size) - 1, 1)));
+ }
+
+ if (tgt_size <= max_size) {
+ tgt_objs = (tgt_size - sks_size) / obj_size;
+ } else {
+ tgt_objs = (max_size - sks_size) / obj_size;
+ tgt_size = (tgt_objs * obj_size) + sks_size;
+ }
+ }
+
+ if (tgt_objs == 0)
+ return (-ENOSPC);
+
+ *objs = tgt_objs;
+ *size = tgt_size;
+
+ return (0);
+}
+
+/*
+ * Make a guess at reasonable per-cpu magazine size based on the size of
+ * each object and the cost of caching N of them in each magazine. Long
+ * term this should really adapt based on an observed usage heuristic.
+ */
+static int
+spl_magazine_size(spl_kmem_cache_t *skc)
+{
+ uint32_t obj_size = spl_obj_size(skc);
+ int size;
+
+ if (spl_kmem_cache_magazine_size > 0)
+ return (MAX(MIN(spl_kmem_cache_magazine_size, 256), 2));
+
+	/* Per-magazine sizes below assume a 4 KiB page size */
+	if (obj_size > (PAGE_SIZE * 256))
+		size = 4; /* Minimum 4 MiB per-magazine */
+	else if (obj_size > (PAGE_SIZE * 32))
+		size = 16; /* Minimum 2 MiB per-magazine */
+	else if (obj_size > (PAGE_SIZE))
+		size = 64; /* Minimum 256 KiB per-magazine */
+	else if (obj_size > (PAGE_SIZE / 4))
+		size = 128; /* Minimum 128 KiB per-magazine */
+ else
+ size = 256;
+
+ return (size);
+}
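+
+/*
+ * For example (illustrative), with 4 KiB pages a 512 byte object gets a
+ * 256 entry magazine while a 64 KiB object gets a 64 entry magazine.
+ */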
+
+/*
+ * Allocate a per-cpu magazine to associate with a specific core.
+ */
+static spl_kmem_magazine_t *
+spl_magazine_alloc(spl_kmem_cache_t *skc, int cpu)
+{
+ spl_kmem_magazine_t *skm;
+ int size = sizeof (spl_kmem_magazine_t) +
+ sizeof (void *) * skc->skc_mag_size;
+
+ skm = kmalloc_node(size, GFP_KERNEL, cpu_to_node(cpu));
+ if (skm) {
+ skm->skm_magic = SKM_MAGIC;
+ skm->skm_avail = 0;
+ skm->skm_size = skc->skc_mag_size;
+ skm->skm_refill = skc->skc_mag_refill;
+ skm->skm_cache = skc;
+ skm->skm_age = jiffies;
+ skm->skm_cpu = cpu;
+ }
+
+ return (skm);
+}
+
+/*
+ * Free a per-cpu magazine associated with a specific core.
+ */
+static void
+spl_magazine_free(spl_kmem_magazine_t *skm)
+{
+ ASSERT(skm->skm_magic == SKM_MAGIC);
+ ASSERT(skm->skm_avail == 0);
+ kfree(skm);
+}
+
+/*
+ * Create all per-cpu magazines of reasonable sizes.
+ */
+static int
+spl_magazine_create(spl_kmem_cache_t *skc)
+{
+ int i;
+
+ if (skc->skc_flags & KMC_NOMAGAZINE)
+ return (0);
+
+ skc->skc_mag = kzalloc(sizeof (spl_kmem_magazine_t *) *
+ num_possible_cpus(), kmem_flags_convert(KM_SLEEP));
+ skc->skc_mag_size = spl_magazine_size(skc);
+ skc->skc_mag_refill = (skc->skc_mag_size + 1) / 2;
+
+ for_each_possible_cpu(i) {
+ skc->skc_mag[i] = spl_magazine_alloc(skc, i);
+ if (!skc->skc_mag[i]) {
+ for (i--; i >= 0; i--)
+ spl_magazine_free(skc->skc_mag[i]);
+
+ kfree(skc->skc_mag);
+ return (-ENOMEM);
+ }
+ }
+
+ return (0);
+}
+
+/*
+ * Destroy all per-cpu magazines.
+ */
+static void
+spl_magazine_destroy(spl_kmem_cache_t *skc)
+{
+ spl_kmem_magazine_t *skm;
+ int i;
+
+ if (skc->skc_flags & KMC_NOMAGAZINE)
+ return;
+
+ for_each_possible_cpu(i) {
+ skm = skc->skc_mag[i];
+ spl_cache_flush(skc, skm, skm->skm_avail);
+ spl_magazine_free(skm);
+ }
+
+ kfree(skc->skc_mag);
+}
+
+/*
+ * Create an object cache based on the following arguments:
+ * name cache name
+ * size cache object size
+ * align cache object alignment
+ * ctor cache object constructor
+ * dtor cache object destructor
+ * reclaim cache object reclaim
+ * priv cache private data for ctor/dtor/reclaim
+ * vmp unused, must be NULL
+ * flags
+ * KMC_KMEM Force SPL kmem backed cache
+ * KMC_VMEM Force SPL vmem backed cache
+ * KMC_SLAB Force Linux slab backed cache
+ * KMC_OFFSLAB Locate objects off the slab
+ * KMC_NOTOUCH unsupported
+ * KMC_NODEBUG unsupported
+ * KMC_NOHASH unsupported
+ * KMC_QCACHE unsupported
+ * KMC_NOMAGAZINE unsupported
+ */
+spl_kmem_cache_t *
+spl_kmem_cache_create(char *name, size_t size, size_t align,
+ spl_kmem_ctor_t ctor, spl_kmem_dtor_t dtor, spl_kmem_reclaim_t reclaim,
+ void *priv, void *vmp, int flags)
+{
+ gfp_t lflags = kmem_flags_convert(KM_SLEEP);
+ spl_kmem_cache_t *skc;
+ int rc;
+
+ /*
+ * Unsupported flags
+ */
+ ASSERT0(flags & KMC_NOMAGAZINE);
+ ASSERT0(flags & KMC_NOHASH);
+ ASSERT0(flags & KMC_QCACHE);
+ ASSERT(vmp == NULL);
+
+ might_sleep();
+
+ skc = kzalloc(sizeof (*skc), lflags);
+ if (skc == NULL)
+ return (NULL);
+
+ skc->skc_magic = SKC_MAGIC;
+ skc->skc_name_size = strlen(name) + 1;
+ skc->skc_name = (char *)kmalloc(skc->skc_name_size, lflags);
+ if (skc->skc_name == NULL) {
+ kfree(skc);
+ return (NULL);
+ }
+ strncpy(skc->skc_name, name, skc->skc_name_size);
+
+ skc->skc_ctor = ctor;
+ skc->skc_dtor = dtor;
+ skc->skc_reclaim = reclaim;
+ skc->skc_private = priv;
+ skc->skc_vmp = vmp;
+ skc->skc_linux_cache = NULL;
+ skc->skc_flags = flags;
+ skc->skc_obj_size = size;
+ skc->skc_obj_align = SPL_KMEM_CACHE_ALIGN;
+ skc->skc_delay = SPL_KMEM_CACHE_DELAY;
+ skc->skc_reap = SPL_KMEM_CACHE_REAP;
+ atomic_set(&skc->skc_ref, 0);
+
+ INIT_LIST_HEAD(&skc->skc_list);
+ INIT_LIST_HEAD(&skc->skc_complete_list);
+ INIT_LIST_HEAD(&skc->skc_partial_list);
+ skc->skc_emergency_tree = RB_ROOT;
+ spin_lock_init(&skc->skc_lock);
+ init_waitqueue_head(&skc->skc_waitq);
+ skc->skc_slab_fail = 0;
+ skc->skc_slab_create = 0;
+ skc->skc_slab_destroy = 0;
+ skc->skc_slab_total = 0;
+ skc->skc_slab_alloc = 0;
+ skc->skc_slab_max = 0;
+ skc->skc_obj_total = 0;
+ skc->skc_obj_alloc = 0;
+ skc->skc_obj_max = 0;
+ skc->skc_obj_deadlock = 0;
+ skc->skc_obj_emergency = 0;
+ skc->skc_obj_emergency_max = 0;
+
+ /*
+ * Verify the requested alignment restriction is sane.
+ */
+ if (align) {
+ VERIFY(ISP2(align));
+ VERIFY3U(align, >=, SPL_KMEM_CACHE_ALIGN);
+ VERIFY3U(align, <=, PAGE_SIZE);
+ skc->skc_obj_align = align;
+ }
+
+ /*
+ * When no specific type of slab is requested (kmem, vmem, or
+ * linuxslab) then select a cache type based on the object size
+ * and default tunables.
+ */
+ if (!(skc->skc_flags & (KMC_KMEM | KMC_VMEM | KMC_SLAB))) {
+
+ if (spl_kmem_cache_slab_limit &&
+ size <= (size_t)spl_kmem_cache_slab_limit) {
+ /*
+ * Objects smaller than spl_kmem_cache_slab_limit can
+ * use the Linux slab for better space-efficiency.
+ */
+ skc->skc_flags |= KMC_SLAB;
+ } else if (spl_obj_size(skc) <= spl_kmem_cache_kmem_limit) {
+ /*
+ * Small objects, less than spl_kmem_cache_kmem_limit
+ * per object should use kmem because their slabs are
+ * small.
+ */
+ skc->skc_flags |= KMC_KMEM;
+ } else {
+ /*
+ * All other objects are considered large and are
+ * placed on vmem backed slabs.
+ */
+ skc->skc_flags |= KMC_VMEM;
+ }
+ }
+
+ /*
+ * Given the type of slab allocate the required resources.
+ */
+ if (skc->skc_flags & (KMC_KMEM | KMC_VMEM)) {
+ rc = spl_slab_size(skc,
+ &skc->skc_slab_objs, &skc->skc_slab_size);
+ if (rc)
+ goto out;
+
+ rc = spl_magazine_create(skc);
+ if (rc)
+ goto out;
+ } else {
+ unsigned long slabflags = 0;
+
+ if (size > (SPL_MAX_KMEM_ORDER_NR_PAGES * PAGE_SIZE)) {
+ rc = EINVAL;
+ goto out;
+ }
+
+#if defined(SLAB_USERCOPY)
+ /*
+ * Required for PAX-enabled kernels if the slab is to be
+ * used for copying between user and kernel space.
+ */
+ slabflags |= SLAB_USERCOPY;
+#endif
+
+#if defined(HAVE_KMEM_CACHE_CREATE_USERCOPY)
+ /*
+ * Newer grsec patchset uses kmem_cache_create_usercopy()
+ * instead of SLAB_USERCOPY flag
+ */
+ skc->skc_linux_cache = kmem_cache_create_usercopy(
+ skc->skc_name, size, align, slabflags, 0, size, NULL);
+#else
+ skc->skc_linux_cache = kmem_cache_create(
+ skc->skc_name, size, align, slabflags, NULL);
+#endif
+ if (skc->skc_linux_cache == NULL) {
+ rc = ENOMEM;
+ goto out;
+ }
+
+#if defined(HAVE_KMEM_CACHE_ALLOCFLAGS)
+ skc->skc_linux_cache->allocflags |= __GFP_COMP;
+#elif defined(HAVE_KMEM_CACHE_GFPFLAGS)
+ skc->skc_linux_cache->gfpflags |= __GFP_COMP;
+#endif
+ skc->skc_flags |= KMC_NOMAGAZINE;
+ }
+
+ if (spl_kmem_cache_expire & KMC_EXPIRE_AGE) {
+ skc->skc_taskqid = taskq_dispatch_delay(spl_kmem_cache_taskq,
+ spl_cache_age, skc, TQ_SLEEP,
+ ddi_get_lbolt() + skc->skc_delay / 3 * HZ);
+ }
+
+ down_write(&spl_kmem_cache_sem);
+ list_add_tail(&skc->skc_list, &spl_kmem_cache_list);
+ up_write(&spl_kmem_cache_sem);
+
+ return (skc);
+out:
+ kfree(skc->skc_name);
+ kfree(skc);
+ return (NULL);
+}
+EXPORT_SYMBOL(spl_kmem_cache_create);
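+
+/*
+ * A minimal usage sketch (hypothetical object type and callbacks; the
+ * NULL arguments are the reclaim callback, private data, and vmp):
+ *
+ *	skc = spl_kmem_cache_create("my_cache", sizeof (my_obj_t), 0,
+ *	    my_ctor, my_dtor, NULL, NULL, NULL, 0);
+ *	obj = spl_kmem_cache_alloc(skc, KM_SLEEP);
+ *	...
+ *	spl_kmem_cache_free(skc, obj);
+ *	spl_kmem_cache_destroy(skc);
+ */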
+
+/*
+ * Register a move callback for cache defragmentation.
+ * XXX: Unimplemented but harmless to stub out for now.
+ */
+void
+spl_kmem_cache_set_move(spl_kmem_cache_t *skc,
+ kmem_cbrc_t (move)(void *, void *, size_t, void *))
+{
+ ASSERT(move != NULL);
+}
+EXPORT_SYMBOL(spl_kmem_cache_set_move);
+
+/*
+ * Destroy a cache and all objects associated with the cache.
+ */
+void
+spl_kmem_cache_destroy(spl_kmem_cache_t *skc)
+{
+ DECLARE_WAIT_QUEUE_HEAD(wq);
+ taskqid_t id;
+
+ ASSERT(skc->skc_magic == SKC_MAGIC);
+ ASSERT(skc->skc_flags & (KMC_KMEM | KMC_VMEM | KMC_SLAB));
+
+ down_write(&spl_kmem_cache_sem);
+ list_del_init(&skc->skc_list);
+ up_write(&spl_kmem_cache_sem);
+
+	/* Cancel and wait for any pending delayed tasks */
+ VERIFY(!test_and_set_bit(KMC_BIT_DESTROY, &skc->skc_flags));
+
+ spin_lock(&skc->skc_lock);
+ id = skc->skc_taskqid;
+ spin_unlock(&skc->skc_lock);
+
+ taskq_cancel_id(spl_kmem_cache_taskq, id);
+
+ /*
+	 * Wait until all current callers complete; this is mainly
+ * to catch the case where a low memory situation triggers a
+ * cache reaping action which races with this destroy.
+ */
+ wait_event(wq, atomic_read(&skc->skc_ref) == 0);
+
+ if (skc->skc_flags & (KMC_KMEM | KMC_VMEM)) {
+ spl_magazine_destroy(skc);
+ spl_slab_reclaim(skc);
+ } else {
+ ASSERT(skc->skc_flags & KMC_SLAB);
+ kmem_cache_destroy(skc->skc_linux_cache);
+ }
+
+ spin_lock(&skc->skc_lock);
+
+ /*
+ * Validate there are no objects in use and free all the
+ * spl_kmem_slab_t, spl_kmem_obj_t, and object buffers.
+ */
+ ASSERT3U(skc->skc_slab_alloc, ==, 0);
+ ASSERT3U(skc->skc_obj_alloc, ==, 0);
+ ASSERT3U(skc->skc_slab_total, ==, 0);
+ ASSERT3U(skc->skc_obj_total, ==, 0);
+ ASSERT3U(skc->skc_obj_emergency, ==, 0);
+ ASSERT(list_empty(&skc->skc_complete_list));
+
+ spin_unlock(&skc->skc_lock);
+
+ kfree(skc->skc_name);
+ kfree(skc);
+}
+EXPORT_SYMBOL(spl_kmem_cache_destroy);
+
+/*
+ * Allocate an object from a slab attached to the cache. This is used to
+ * repopulate the per-cpu magazine caches in batches when they run low.
+ */
+static void *
+spl_cache_obj(spl_kmem_cache_t *skc, spl_kmem_slab_t *sks)
+{
+ spl_kmem_obj_t *sko;
+
+ ASSERT(skc->skc_magic == SKC_MAGIC);
+ ASSERT(sks->sks_magic == SKS_MAGIC);
+
+ sko = list_entry(sks->sks_free_list.next, spl_kmem_obj_t, sko_list);
+ ASSERT(sko->sko_magic == SKO_MAGIC);
+ ASSERT(sko->sko_addr != NULL);
+
+ /* Remove from sks_free_list */
+ list_del_init(&sko->sko_list);
+
+ sks->sks_age = jiffies;
+ sks->sks_ref++;
+ skc->skc_obj_alloc++;
+
+ /* Track max obj usage statistics */
+ if (skc->skc_obj_alloc > skc->skc_obj_max)
+ skc->skc_obj_max = skc->skc_obj_alloc;
+
+ /* Track max slab usage statistics */
+ if (sks->sks_ref == 1) {
+ skc->skc_slab_alloc++;
+
+ if (skc->skc_slab_alloc > skc->skc_slab_max)
+ skc->skc_slab_max = skc->skc_slab_alloc;
+ }
+
+ return (sko->sko_addr);
+}
+
+/*
+ * Generic slab allocation function to be run by the global work queues.
+ * It is responsible for allocating a new slab, linking it in to the list
+ * of partial slabs, and then waking any waiters.
+ */
+static int
+__spl_cache_grow(spl_kmem_cache_t *skc, int flags)
+{
+ spl_kmem_slab_t *sks;
+
+ fstrans_cookie_t cookie = spl_fstrans_mark();
+ sks = spl_slab_alloc(skc, flags);
+ spl_fstrans_unmark(cookie);
+
+ spin_lock(&skc->skc_lock);
+ if (sks) {
+ skc->skc_slab_total++;
+ skc->skc_obj_total += sks->sks_objs;
+ list_add_tail(&sks->sks_list, &skc->skc_partial_list);
+
+ smp_mb__before_atomic();
+ clear_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags);
+ smp_mb__after_atomic();
+ wake_up_all(&skc->skc_waitq);
+ }
+ spin_unlock(&skc->skc_lock);
+
+ return (sks == NULL ? -ENOMEM : 0);
+}
+
+static void
+spl_cache_grow_work(void *data)
+{
+ spl_kmem_alloc_t *ska = (spl_kmem_alloc_t *)data;
+ spl_kmem_cache_t *skc = ska->ska_cache;
+
+ (void) __spl_cache_grow(skc, ska->ska_flags);
+
+ atomic_dec(&skc->skc_ref);
+ smp_mb__before_atomic();
+ clear_bit(KMC_BIT_GROWING, &skc->skc_flags);
+ smp_mb__after_atomic();
+
+ kfree(ska);
+}
+
+/*
+ * Returns non-zero when a new slab should be available.
+ */
+static int
+spl_cache_grow_wait(spl_kmem_cache_t *skc)
+{
+ return (!test_bit(KMC_BIT_GROWING, &skc->skc_flags));
+}
+
+/*
+ * No objects are available on any slab, so create a new slab.  Note that this
+ * functionality is disabled for KMC_SLAB caches which are backed by the
+ * Linux slab.
+ */
+static int
+spl_cache_grow(spl_kmem_cache_t *skc, int flags, void **obj)
+{
+ int remaining, rc = 0;
+
+ ASSERT0(flags & ~KM_PUBLIC_MASK);
+ ASSERT(skc->skc_magic == SKC_MAGIC);
+ ASSERT((skc->skc_flags & KMC_SLAB) == 0);
+ might_sleep();
+ *obj = NULL;
+
+ /*
+ * Before allocating a new slab wait for any reaping to complete and
+ * then return so the local magazine can be rechecked for new objects.
+ */
+ if (test_bit(KMC_BIT_REAPING, &skc->skc_flags)) {
+ rc = spl_wait_on_bit(&skc->skc_flags, KMC_BIT_REAPING,
+ TASK_UNINTERRUPTIBLE);
+ return (rc ? rc : -EAGAIN);
+ }
+
+ /*
+	 * To reduce context switch overhead and improve NUMA locality, we
+	 * first try to allocate a new slab in the current process context
+	 * with the KM_NOSLEEP flag set.  If that fails, the allocation is
+	 * dispatched to a taskq to be performed asynchronously.
+	 *
+	 * However, this can't be applied to KMC_VMEM caches due to a bug
+	 * where __vmalloc() doesn't honor gfp flags in page table allocation.
+ */
+ if (!(skc->skc_flags & KMC_VMEM)) {
+ rc = __spl_cache_grow(skc, flags | KM_NOSLEEP);
+ if (rc == 0)
+ return (0);
+ }
+
+ /*
+ * This is handled by dispatching a work request to the global work
+ * queue. This allows us to asynchronously allocate a new slab while
+	 * retaining the ability to safely fall back to smaller synchronous
+ * allocations to ensure forward progress is always maintained.
+ */
+ if (test_and_set_bit(KMC_BIT_GROWING, &skc->skc_flags) == 0) {
+ spl_kmem_alloc_t *ska;
+
+ ska = kmalloc(sizeof (*ska), kmem_flags_convert(flags));
+ if (ska == NULL) {
+ clear_bit_unlock(KMC_BIT_GROWING, &skc->skc_flags);
+ smp_mb__after_atomic();
+ wake_up_all(&skc->skc_waitq);
+ return (-ENOMEM);
+ }
+
+ atomic_inc(&skc->skc_ref);
+ ska->ska_cache = skc;
+ ska->ska_flags = flags;
+ taskq_init_ent(&ska->ska_tqe);
+ taskq_dispatch_ent(spl_kmem_cache_taskq,
+ spl_cache_grow_work, ska, 0, &ska->ska_tqe);
+ }
+
+ /*
+ * The goal here is to only detect the rare case where a virtual slab
+ * allocation has deadlocked. We must be careful to minimize the use
+ * of emergency objects which are more expensive to track. Therefore,
+ * we set a very long timeout for the asynchronous allocation and if
+ * the timeout is reached the cache is flagged as deadlocked. From
+ * this point only new emergency objects will be allocated until the
+ * asynchronous allocation completes and clears the deadlocked flag.
+ */
+ if (test_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags)) {
+ rc = spl_emergency_alloc(skc, flags, obj);
+ } else {
+ remaining = wait_event_timeout(skc->skc_waitq,
+ spl_cache_grow_wait(skc), HZ / 10);
+
+ if (!remaining) {
+ spin_lock(&skc->skc_lock);
+ if (test_bit(KMC_BIT_GROWING, &skc->skc_flags)) {
+ set_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags);
+ skc->skc_obj_deadlock++;
+ }
+ spin_unlock(&skc->skc_lock);
+ }
+
+ rc = -ENOMEM;
+ }
+
+ return (rc);
+}
+
+/*
+ * Refill a per-cpu magazine with objects from the slabs for this cache.
+ * Ideally the magazine can be repopulated using existing objects which have
+ * been released, however if we are unable to locate enough free objects new
+ * slabs of objects will be created. On success NULL is returned, otherwise
+ * the address of a single emergency object is returned for use by the caller.
+ */
+static void *
+spl_cache_refill(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flags)
+{
+ spl_kmem_slab_t *sks;
+ int count = 0, rc, refill;
+ void *obj = NULL;
+
+ ASSERT(skc->skc_magic == SKC_MAGIC);
+ ASSERT(skm->skm_magic == SKM_MAGIC);
+
+ refill = MIN(skm->skm_refill, skm->skm_size - skm->skm_avail);
+ spin_lock(&skc->skc_lock);
+
+ while (refill > 0) {
+ /* No slabs available we may need to grow the cache */
+ if (list_empty(&skc->skc_partial_list)) {
+ spin_unlock(&skc->skc_lock);
+
+ local_irq_enable();
+ rc = spl_cache_grow(skc, flags, &obj);
+ local_irq_disable();
+
+ /* Emergency object for immediate use by caller */
+ if (rc == 0 && obj != NULL)
+ return (obj);
+
+ if (rc)
+ goto out;
+
+			/* Rescheduled to a different CPU, skm is not local */
+ if (skm != skc->skc_mag[smp_processor_id()])
+ goto out;
+
+ /*
+ * Potentially rescheduled to the same CPU but
+ * allocations may have occurred from this CPU while
+ * we were sleeping so recalculate max refill.
+ */
+ refill = MIN(refill, skm->skm_size - skm->skm_avail);
+
+ spin_lock(&skc->skc_lock);
+ continue;
+ }
+
+ /* Grab the next available slab */
+ sks = list_entry((&skc->skc_partial_list)->next,
+ spl_kmem_slab_t, sks_list);
+ ASSERT(sks->sks_magic == SKS_MAGIC);
+ ASSERT(sks->sks_ref < sks->sks_objs);
+ ASSERT(!list_empty(&sks->sks_free_list));
+
+ /*
+ * Consume as many objects as needed to refill the requested
+ * cache. We must also be careful not to overfill it.
+ */
+ while (sks->sks_ref < sks->sks_objs && refill-- > 0 &&
+ ++count) {
+ ASSERT(skm->skm_avail < skm->skm_size);
+ ASSERT(count < skm->skm_size);
+ skm->skm_objs[skm->skm_avail++] =
+ spl_cache_obj(skc, sks);
+ }
+
+ /* Move slab to skc_complete_list when full */
+ if (sks->sks_ref == sks->sks_objs) {
+ list_del(&sks->sks_list);
+ list_add(&sks->sks_list, &skc->skc_complete_list);
+ }
+ }
+
+ spin_unlock(&skc->skc_lock);
+out:
+ return (NULL);
+}
+
+/*
+ * Release an object back to the slab from which it came.
+ */
+static void
+spl_cache_shrink(spl_kmem_cache_t *skc, void *obj)
+{
+ spl_kmem_slab_t *sks = NULL;
+ spl_kmem_obj_t *sko = NULL;
+
+ ASSERT(skc->skc_magic == SKC_MAGIC);
+
+ sko = spl_sko_from_obj(skc, obj);
+ ASSERT(sko->sko_magic == SKO_MAGIC);
+ sks = sko->sko_slab;
+ ASSERT(sks->sks_magic == SKS_MAGIC);
+ ASSERT(sks->sks_cache == skc);
+ list_add(&sko->sko_list, &sks->sks_free_list);
+
+ sks->sks_age = jiffies;
+ sks->sks_ref--;
+ skc->skc_obj_alloc--;
+
+ /*
+ * Move slab to skc_partial_list when no longer full. Slabs
+	 * are added to the head to keep the partial list in quasi-full
+	 * sorted order: fuller at the head, emptier at the tail.
+ */
+ if (sks->sks_ref == (sks->sks_objs - 1)) {
+ list_del(&sks->sks_list);
+ list_add(&sks->sks_list, &skc->skc_partial_list);
+ }
+
+ /*
+ * Move empty slabs to the end of the partial list so
+ * they can be easily found and freed during reclamation.
+ */
+ if (sks->sks_ref == 0) {
+ list_del(&sks->sks_list);
+ list_add_tail(&sks->sks_list, &skc->skc_partial_list);
+ skc->skc_slab_alloc--;
+ }
+}
+
+/*
+ * Allocate an object from the per-cpu magazine, or if the magazine
+ * is empty directly allocate from a slab and repopulate the magazine.
+ */
+void *
+spl_kmem_cache_alloc(spl_kmem_cache_t *skc, int flags)
+{
+ spl_kmem_magazine_t *skm;
+ void *obj = NULL;
+
+ ASSERT0(flags & ~KM_PUBLIC_MASK);
+ ASSERT(skc->skc_magic == SKC_MAGIC);
+ ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags));
+
+ /*
+ * Allocate directly from a Linux slab. All optimizations are left
+	 * to the underlying cache; we only need to guarantee that KM_SLEEP
+ * callers will never fail.
+ */
+ if (skc->skc_flags & KMC_SLAB) {
+ struct kmem_cache *slc = skc->skc_linux_cache;
+ do {
+ obj = kmem_cache_alloc(slc, kmem_flags_convert(flags));
+ } while ((obj == NULL) && !(flags & KM_NOSLEEP));
+
+ goto ret;
+ }
+
+ local_irq_disable();
+
+restart:
+ /*
+ * Safe to update per-cpu structure without lock, but
+ * in the restart case we must be careful to reacquire
+ * the local magazine since this may have changed
+ * when we need to grow the cache.
+ */
+ skm = skc->skc_mag[smp_processor_id()];
+ ASSERT(skm->skm_magic == SKM_MAGIC);
+
+ if (likely(skm->skm_avail)) {
+ /* Object available in CPU cache, use it */
+ obj = skm->skm_objs[--skm->skm_avail];
+ skm->skm_age = jiffies;
+ } else {
+ obj = spl_cache_refill(skc, skm, flags);
+ if ((obj == NULL) && !(flags & KM_NOSLEEP))
+ goto restart;
+
+ local_irq_enable();
+ goto ret;
+ }
+
+ local_irq_enable();
+ ASSERT(obj);
+ ASSERT(IS_P2ALIGNED(obj, skc->skc_obj_align));
+
+ret:
+ /* Pre-emptively migrate object to CPU L1 cache */
+	if (obj) {
+		if (skc->skc_ctor)
+			skc->skc_ctor(obj, skc->skc_private, flags);
+		else
+			prefetchw(obj);
+	}
+
+ return (obj);
+}
+EXPORT_SYMBOL(spl_kmem_cache_alloc);
+
+/*
+ * Free an object back to the local per-cpu magazine; there is no
+ * guarantee that this is the same magazine the object was originally
+ * allocated from.  We may need to flush entries from the magazine
+ * back to the slabs to make space.
+ */
+void
+spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj)
+{
+ spl_kmem_magazine_t *skm;
+ unsigned long flags;
+ int do_reclaim = 0;
+ int do_emergency = 0;
+
+ ASSERT(skc->skc_magic == SKC_MAGIC);
+ ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags));
+
+ /*
+ * Run the destructor
+ */
+ if (skc->skc_dtor)
+ skc->skc_dtor(obj, skc->skc_private);
+
+ /*
+	 * Free the object back to the underlying Linux slab.
+ */
+ if (skc->skc_flags & KMC_SLAB) {
+ kmem_cache_free(skc->skc_linux_cache, obj);
+ return;
+ }
+
+ /*
+ * While a cache has outstanding emergency objects all freed objects
+ * must be checked. However, since emergency objects will never use
+ * a virtual address these objects can be safely excluded as an
+ * optimization.
+ */
+ if (!is_vmalloc_addr(obj)) {
+ spin_lock(&skc->skc_lock);
+ do_emergency = (skc->skc_obj_emergency > 0);
+ spin_unlock(&skc->skc_lock);
+
+ if (do_emergency && (spl_emergency_free(skc, obj) == 0))
+ return;
+ }
+
+ local_irq_save(flags);
+
+ /*
+	 * Safe to update the per-cpu structure without a lock, but since
+	 * no remote memory allocation tracking is being performed it is
+	 * entirely possible to allocate an object from one CPU cache and
+	 * return it to another.
+ */
+ skm = skc->skc_mag[smp_processor_id()];
+ ASSERT(skm->skm_magic == SKM_MAGIC);
+
+ /*
+ * Per-CPU cache full, flush it to make space for this object,
+ * this may result in an empty slab which can be reclaimed once
+ * interrupts are re-enabled.
+ */
+ if (unlikely(skm->skm_avail >= skm->skm_size)) {
+ spl_cache_flush(skc, skm, skm->skm_refill);
+ do_reclaim = 1;
+ }
+
+ /* Available space in cache, use it */
+ skm->skm_objs[skm->skm_avail++] = obj;
+
+ local_irq_restore(flags);
+
+ if (do_reclaim)
+ spl_slab_reclaim(skc);
+}
+EXPORT_SYMBOL(spl_kmem_cache_free);
+
+/*
+ * The generic shrinker function for all caches. Under Linux a shrinker
+ * may not be tightly coupled with a slab cache.  In fact Linux
+ * systematically calls all registered shrinker callbacks which
+ * report that they contain unused objects. Because of this we only
+ * register one shrinker function in the shim layer for all slab caches.
+ * We always attempt to shrink all caches when this generic shrinker
+ * is called.
+ *
+ * If sc->nr_to_scan is zero, the caller is requesting a query of the
+ * number of objects which can potentially be freed. If it is nonzero,
+ * the request is to free that many objects.
+ *
+ * Linux kernels >= 3.12 have the count_objects and scan_objects callbacks
+ * in struct shrinker and also require the shrinker to return the number
+ * of objects freed.
+ *
+ * Older kernels require the shrinker to return the number of freeable
+ * objects following the freeing of nr_to_free.
+ *
+ * Linux semantics differ from those under Solaris, which are to
+ * free all available objects which may (and probably will) be more
+ * objects than the requested nr_to_scan.
+ */
+static spl_shrinker_t
+__spl_kmem_cache_generic_shrinker(struct shrinker *shrink,
+ struct shrink_control *sc)
+{
+ spl_kmem_cache_t *skc;
+ int alloc = 0;
+
+ /*
+ * No shrinking in a transaction context. Can cause deadlocks.
+ */
+ if (sc->nr_to_scan && spl_fstrans_check())
+ return (SHRINK_STOP);
+
+ down_read(&spl_kmem_cache_sem);
+ list_for_each_entry(skc, &spl_kmem_cache_list, skc_list) {
+ if (sc->nr_to_scan) {
+#ifdef HAVE_SPLIT_SHRINKER_CALLBACK
+ uint64_t oldalloc = skc->skc_obj_alloc;
+ spl_kmem_cache_reap_now(skc,
+ MAX(sc->nr_to_scan>>fls64(skc->skc_slab_objs), 1));
+ if (oldalloc > skc->skc_obj_alloc)
+ alloc += oldalloc - skc->skc_obj_alloc;
+#else
+ spl_kmem_cache_reap_now(skc,
+ MAX(sc->nr_to_scan>>fls64(skc->skc_slab_objs), 1));
+ alloc += skc->skc_obj_alloc;
+#endif /* HAVE_SPLIT_SHRINKER_CALLBACK */
+ } else {
+ /* Request to query number of freeable objects */
+ alloc += skc->skc_obj_alloc;
+ }
+ }
+ up_read(&spl_kmem_cache_sem);
+
+ /*
+ * When KMC_RECLAIM_ONCE is set allow only a single reclaim pass.
+ * This functionality only exists to work around a rare issue where
+ * shrink_slab() is repeatedly invoked by many cores causing the
+ * system to thrash.
+ */
+ if ((spl_kmem_cache_reclaim & KMC_RECLAIM_ONCE) && sc->nr_to_scan)
+ return (SHRINK_STOP);
+
+ return (MAX(alloc, 0));
+}
+
+SPL_SHRINKER_CALLBACK_WRAPPER(spl_kmem_cache_generic_shrinker);
+
+/*
+ * Call the registered reclaim function for a cache. Depending on how
+ * many and which objects are released it may simply repopulate the
+ * local magazine which will then need to age-out. Objects which cannot
+ * fit in the magazine will be released back to their slabs, which will
+ * also need to age out before being released.  This is all just best
+ * effort and we do not want to thrash creating and destroying slabs.
+ */
+void
+spl_kmem_cache_reap_now(spl_kmem_cache_t *skc, int count)
+{
+ ASSERT(skc->skc_magic == SKC_MAGIC);
+ ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags));
+
+ atomic_inc(&skc->skc_ref);
+
+ /*
+ * Execute the registered reclaim callback if it exists.
+ */
+ if (skc->skc_flags & KMC_SLAB) {
+ if (skc->skc_reclaim)
+ skc->skc_reclaim(skc->skc_private);
+ goto out;
+ }
+
+ /*
+ * Prevent concurrent cache reaping when contended.
+ */
+ if (test_and_set_bit(KMC_BIT_REAPING, &skc->skc_flags))
+ goto out;
+
+ /*
+ * When a reclaim function is available it may be invoked repeatedly
+ * until at least a single slab can be freed. This ensures that we
+ * do free memory back to the system. This helps minimize the chance
+ * of an OOM event when the bulk of memory is used by the slab.
+ *
+ * When free slabs are already available the reclaim callback will be
+ * skipped. Additionally, if no forward progress is detected despite
+ * a reclaim function the cache will be skipped to avoid deadlock.
+ *
+ * Longer term this would be the correct place to add the code which
+	 * repacks the slabs in order to minimize fragmentation.
+ */
+ if (skc->skc_reclaim) {
+ uint64_t objects = UINT64_MAX;
+ int do_reclaim;
+
+ do {
+ spin_lock(&skc->skc_lock);
+ do_reclaim =
+ (skc->skc_slab_total > 0) &&
+ ((skc->skc_slab_total-skc->skc_slab_alloc) == 0) &&
+ (skc->skc_obj_alloc < objects);
+
+ objects = skc->skc_obj_alloc;
+ spin_unlock(&skc->skc_lock);
+
+ if (do_reclaim)
+ skc->skc_reclaim(skc->skc_private);
+
+ } while (do_reclaim);
+ }
+
+ /* Reclaim from the magazine and free all now empty slabs. */
+ if (spl_kmem_cache_expire & KMC_EXPIRE_MEM) {
+ spl_kmem_magazine_t *skm;
+ unsigned long irq_flags;
+
+ local_irq_save(irq_flags);
+ skm = skc->skc_mag[smp_processor_id()];
+ spl_cache_flush(skc, skm, skm->skm_avail);
+ local_irq_restore(irq_flags);
+ }
+
+ spl_slab_reclaim(skc);
+ clear_bit_unlock(KMC_BIT_REAPING, &skc->skc_flags);
+ smp_mb__after_atomic();
+ wake_up_bit(&skc->skc_flags, KMC_BIT_REAPING);
+out:
+ atomic_dec(&skc->skc_ref);
+}
+EXPORT_SYMBOL(spl_kmem_cache_reap_now);
+
+/*
+ * This is stubbed out for code consistency with other platforms. There
+ * is existing logic to prevent concurrent reaping so, while this is ugly,
+ * it should do no harm.
+ */
+int
+spl_kmem_cache_reap_active()
+{
+ return (0);
+}
+EXPORT_SYMBOL(spl_kmem_cache_reap_active);
+
+/*
+ * Reap all free slabs from all registered caches.
+ */
+void
+spl_kmem_reap(void)
+{
+ struct shrink_control sc;
+
+ sc.nr_to_scan = KMC_REAP_CHUNK;
+ sc.gfp_mask = GFP_KERNEL;
+
+ (void) __spl_kmem_cache_generic_shrinker(NULL, &sc);
+}
+EXPORT_SYMBOL(spl_kmem_reap);
+
+int
+spl_kmem_cache_init(void)
+{
+ init_rwsem(&spl_kmem_cache_sem);
+ INIT_LIST_HEAD(&spl_kmem_cache_list);
+ spl_kmem_cache_taskq = taskq_create("spl_kmem_cache",
+ spl_kmem_cache_kmem_threads, maxclsyspri,
+ spl_kmem_cache_kmem_threads * 8, INT_MAX,
+ TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
+ spl_register_shrinker(&spl_kmem_cache_shrinker);
+
+ return (0);
+}
+
+void
+spl_kmem_cache_fini(void)
+{
+ spl_unregister_shrinker(&spl_kmem_cache_shrinker);
+ taskq_destroy(spl_kmem_cache_taskq);
+}
diff --git a/module/os/linux/spl/spl-kmem.c b/module/os/linux/spl/spl-kmem.c
new file mode 100644
index 000000000..824b5e89f
--- /dev/null
+++ b/module/os/linux/spl/spl-kmem.c
@@ -0,0 +1,556 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <[email protected]>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <sys/debug.h>
+#include <sys/sysmacros.h>
+#include <sys/kmem.h>
+#include <sys/vmem.h>
+#include <linux/mm.h>
+
+/*
+ * As a general rule kmem_alloc() allocations should be small, preferably
+ * just a few pages, since they must be physically contiguous. Therefore, a
+ * rate limited warning will be printed to the console for any kmem_alloc()
+ * which exceeds a reasonable threshold.
+ *
+ * The default warning threshold is set to sixteen pages but capped at 64K to
+ * accommodate systems using large pages. This value was selected to be small
+ * enough to ensure the largest allocations are quickly noticed and fixed,
+ * but large enough to avoid logging any warnings when an allocation size is
+ * larger than optimal but not a serious concern. Since this value is tunable,
+ * developers are encouraged to set it lower when testing so any new largish
+ * allocations are quickly caught. These warnings may be disabled by setting
+ * the threshold to zero.
+ */
+/* BEGIN CSTYLED */
+unsigned int spl_kmem_alloc_warn = MIN(16 * PAGE_SIZE, 64 * 1024);
+module_param(spl_kmem_alloc_warn, uint, 0644);
+MODULE_PARM_DESC(spl_kmem_alloc_warn,
+ "Warning threshold in bytes for a kmem_alloc()");
+EXPORT_SYMBOL(spl_kmem_alloc_warn);
+
+/*
+ * Large kmem_alloc() allocations will fail if they exceed KMALLOC_MAX_SIZE.
+ * Allocations which are marginally smaller than this limit may succeed but
+ * should still be avoided due to the expense of locating a contiguous range
+ * of free pages. Therefore, a maximum kmem size with a reasonable safety
+ * margin of 4x is set. kmem_alloc() allocations larger than this maximum
+ * will quickly fail. vmem_alloc() allocations less than or equal to this
+ * value will use kmalloc(), but shift to vmalloc() when exceeding this value.
+ */
+unsigned int spl_kmem_alloc_max = (KMALLOC_MAX_SIZE >> 2);
+module_param(spl_kmem_alloc_max, uint, 0644);
+MODULE_PARM_DESC(spl_kmem_alloc_max,
+ "Maximum size in bytes for a kmem_alloc()");
+EXPORT_SYMBOL(spl_kmem_alloc_max);
+/* END CSTYLED */
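+
+/*
+ * Both thresholds are exposed as writable module parameters; for example,
+ * assuming this code is built into the spl module, the warning threshold
+ * may be lowered while testing (or set to zero to disable it):
+ *
+ *	echo 8192 > /sys/module/spl/parameters/spl_kmem_alloc_warn
+ *	echo 0 > /sys/module/spl/parameters/spl_kmem_alloc_warn
+ */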
+
+int
+kmem_debugging(void)
+{
+ return (0);
+}
+EXPORT_SYMBOL(kmem_debugging);
+
+char *
+kmem_vasprintf(const char *fmt, va_list ap)
+{
+ va_list aq;
+ char *ptr;
+
+ do {
+ va_copy(aq, ap);
+ ptr = kvasprintf(kmem_flags_convert(KM_SLEEP), fmt, aq);
+ va_end(aq);
+ } while (ptr == NULL);
+
+ return (ptr);
+}
+EXPORT_SYMBOL(kmem_vasprintf);
+
+char *
+kmem_asprintf(const char *fmt, ...)
+{
+ va_list ap;
+ char *ptr;
+
+ do {
+ va_start(ap, fmt);
+ ptr = kvasprintf(kmem_flags_convert(KM_SLEEP), fmt, ap);
+ va_end(ap);
+ } while (ptr == NULL);
+
+ return (ptr);
+}
+EXPORT_SYMBOL(kmem_asprintf);
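+
+/*
+ * Illustrative pairing of kmem_asprintf() with strfree() below; the format
+ * arguments are hypothetical:
+ *
+ *	char *name = kmem_asprintf("%s-%d", "cache", 7);
+ *	...
+ *	strfree(name);
+ */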
+
+static char *
+__strdup(const char *str, int flags)
+{
+ char *ptr;
+ int n;
+
+ n = strlen(str);
+ ptr = kmalloc(n + 1, kmem_flags_convert(flags));
+ if (ptr)
+ memcpy(ptr, str, n + 1);
+
+ return (ptr);
+}
+
+char *
+strdup(const char *str)
+{
+ return (__strdup(str, KM_SLEEP));
+}
+EXPORT_SYMBOL(strdup);
+
+void
+strfree(char *str)
+{
+ kfree(str);
+}
+EXPORT_SYMBOL(strfree);
+
+/*
+ * General purpose unified implementation of kmem_alloc(). It is an
+ * amalgamation of Linux and Illumos allocator design. It should never be
+ * exported to ensure that code using kmem_alloc()/kmem_zalloc() remains
+ * relatively portable. Consumers may only access this function through
+ * wrappers that enforce the common flags to ensure portability.
+ */
+inline void *
+spl_kmem_alloc_impl(size_t size, int flags, int node)
+{
+ gfp_t lflags = kmem_flags_convert(flags);
+ int use_vmem = 0;
+ void *ptr;
+
+ /*
+ * Log abnormally large allocations and rate limit the console output.
+ * Allocations larger than spl_kmem_alloc_warn should be performed
+ * through the vmem_alloc()/vmem_zalloc() interfaces.
+ */
+ if ((spl_kmem_alloc_warn > 0) && (size > spl_kmem_alloc_warn) &&
+ !(flags & KM_VMEM)) {
+ printk(KERN_WARNING
+ "Large kmem_alloc(%lu, 0x%x), please file an issue at:\n"
+ "https://github.com/zfsonlinux/zfs/issues/new\n",
+ (unsigned long)size, flags);
+ dump_stack();
+ }
+
+ /*
+ * Use a loop because kmalloc_node() can fail when GFP_KERNEL is used
+ * unlike kmem_alloc() with KM_SLEEP on Illumos.
+ */
+ do {
+ /*
+		 * Calling kmalloc_node() when the size exceeds
+		 * spl_kmem_alloc_max is unsafe. This must fail for all
+		 * kmem_alloc() and kmem_zalloc() callers.
+ *
+ * For vmem_alloc() and vmem_zalloc() callers it is permissible
+ * to use __vmalloc(). However, in general use of __vmalloc()
+ * is strongly discouraged because a global lock must be
+ * acquired. Contention on this lock can significantly
+		 * impact performance, so frequent manipulation of the
+		 * virtual address space should be avoided.
+ */
+ if ((size > spl_kmem_alloc_max) || use_vmem) {
+ if (flags & KM_VMEM) {
+ ptr = __vmalloc(size, lflags | __GFP_HIGHMEM,
+ PAGE_KERNEL);
+ } else {
+ return (NULL);
+ }
+ } else {
+ ptr = kmalloc_node(size, lflags, node);
+ }
+
+ if (likely(ptr) || (flags & KM_NOSLEEP))
+ return (ptr);
+
+ /*
+ * For vmem_alloc() and vmem_zalloc() callers retry immediately
+ * using __vmalloc() which is unlikely to fail.
+ */
+ if ((flags & KM_VMEM) && (use_vmem == 0)) {
+ use_vmem = 1;
+ continue;
+ }
+
+ /*
+ * Use cond_resched() instead of congestion_wait() to avoid
+ * deadlocking systems where there are no block devices.
+ */
+ cond_resched();
+ } while (1);
+
+ return (NULL);
+}
+
+inline void
+spl_kmem_free_impl(const void *buf, size_t size)
+{
+ if (is_vmalloc_addr(buf))
+ vfree(buf);
+ else
+ kfree(buf);
+}
+
+/*
+ * Memory allocation and accounting for kmem_* style allocations. When
+ * DEBUG_KMEM is enabled the total memory allocated will be tracked and
+ * any memory leaked will be reported during module unload.
+ *
+ * ./configure --enable-debug-kmem
+ */
+#ifdef DEBUG_KMEM
+
+/* Shim layer memory accounting */
+#ifdef HAVE_ATOMIC64_T
+atomic64_t kmem_alloc_used = ATOMIC64_INIT(0);
+unsigned long long kmem_alloc_max = 0;
+#else /* HAVE_ATOMIC64_T */
+atomic_t kmem_alloc_used = ATOMIC_INIT(0);
+unsigned long long kmem_alloc_max = 0;
+#endif /* HAVE_ATOMIC64_T */
+
+EXPORT_SYMBOL(kmem_alloc_used);
+EXPORT_SYMBOL(kmem_alloc_max);
+
+inline void *
+spl_kmem_alloc_debug(size_t size, int flags, int node)
+{
+ void *ptr;
+
+ ptr = spl_kmem_alloc_impl(size, flags, node);
+ if (ptr) {
+ kmem_alloc_used_add(size);
+ if (unlikely(kmem_alloc_used_read() > kmem_alloc_max))
+ kmem_alloc_max = kmem_alloc_used_read();
+ }
+
+ return (ptr);
+}
+
+inline void
+spl_kmem_free_debug(const void *ptr, size_t size)
+{
+ kmem_alloc_used_sub(size);
+ spl_kmem_free_impl(ptr, size);
+}
+
+/*
+ * When DEBUG_KMEM_TRACKING is enabled not only will total bytes be tracked
+ * but also the location of every alloc and free. When the SPL module is
+ * unloaded a list of all leaked addresses and where they were allocated
+ * will be dumped to the console. Enabling this feature has a significant
+ * impact on performance but it makes finding memory leaks straightforward.
+ *
+ * Not surprisingly, with debugging enabled the xmem_locks are very highly
+ * contended, particularly on xfree(). If we want to run with this detailed
+ * debugging enabled for anything other than debugging we need to minimize
+ * the contention by moving to a lock per xmem_table entry model.
+ *
+ * ./configure --enable-debug-kmem-tracking
+ */
+#ifdef DEBUG_KMEM_TRACKING
+
+#include <linux/hash.h>
+#include <linux/ctype.h>
+
+#define KMEM_HASH_BITS 10
+#define KMEM_TABLE_SIZE (1 << KMEM_HASH_BITS)
+
+typedef struct kmem_debug {
+ struct hlist_node kd_hlist; /* Hash node linkage */
+ struct list_head kd_list; /* List of all allocations */
+ void *kd_addr; /* Allocation pointer */
+ size_t kd_size; /* Allocation size */
+ const char *kd_func; /* Allocation function */
+ int kd_line; /* Allocation line */
+} kmem_debug_t;
+
+static spinlock_t kmem_lock;
+static struct hlist_head kmem_table[KMEM_TABLE_SIZE];
+static struct list_head kmem_list;
+
+static kmem_debug_t *
+kmem_del_init(spinlock_t *lock, struct hlist_head *table,
+ int bits, const void *addr)
+{
+ struct hlist_head *head;
+ struct hlist_node *node;
+ struct kmem_debug *p;
+ unsigned long flags;
+
+ spin_lock_irqsave(lock, flags);
+
+ head = &table[hash_ptr((void *)addr, bits)];
+ hlist_for_each(node, head) {
+ p = list_entry(node, struct kmem_debug, kd_hlist);
+ if (p->kd_addr == addr) {
+ hlist_del_init(&p->kd_hlist);
+ list_del_init(&p->kd_list);
+ spin_unlock_irqrestore(lock, flags);
+ return (p);
+ }
+ }
+
+ spin_unlock_irqrestore(lock, flags);
+
+ return (NULL);
+}
+
+inline void *
+spl_kmem_alloc_track(size_t size, int flags,
+ const char *func, int line, int node)
+{
+ void *ptr = NULL;
+ kmem_debug_t *dptr;
+ unsigned long irq_flags;
+
+ dptr = kmalloc(sizeof (kmem_debug_t), kmem_flags_convert(flags));
+ if (dptr == NULL)
+ return (NULL);
+
+ dptr->kd_func = __strdup(func, flags);
+ if (dptr->kd_func == NULL) {
+ kfree(dptr);
+ return (NULL);
+ }
+
+ ptr = spl_kmem_alloc_debug(size, flags, node);
+ if (ptr == NULL) {
+ kfree(dptr->kd_func);
+ kfree(dptr);
+ return (NULL);
+ }
+
+ INIT_HLIST_NODE(&dptr->kd_hlist);
+ INIT_LIST_HEAD(&dptr->kd_list);
+
+ dptr->kd_addr = ptr;
+ dptr->kd_size = size;
+ dptr->kd_line = line;
+
+ spin_lock_irqsave(&kmem_lock, irq_flags);
+ hlist_add_head(&dptr->kd_hlist,
+ &kmem_table[hash_ptr(ptr, KMEM_HASH_BITS)]);
+ list_add_tail(&dptr->kd_list, &kmem_list);
+ spin_unlock_irqrestore(&kmem_lock, irq_flags);
+
+ return (ptr);
+}
+
+inline void
+spl_kmem_free_track(const void *ptr, size_t size)
+{
+ kmem_debug_t *dptr;
+
+ /* Ignore NULL pointer since we haven't tracked it at all */
+ if (ptr == NULL)
+ return;
+
+ /* Must exist in hash due to kmem_alloc() */
+ dptr = kmem_del_init(&kmem_lock, kmem_table, KMEM_HASH_BITS, ptr);
+ ASSERT3P(dptr, !=, NULL);
+ ASSERT3S(dptr->kd_size, ==, size);
+
+ kfree(dptr->kd_func);
+ kfree(dptr);
+
+ spl_kmem_free_debug(ptr, size);
+}
+#endif /* DEBUG_KMEM_TRACKING */
+#endif /* DEBUG_KMEM */
+
+/*
+ * Public kmem_alloc(), kmem_zalloc() and kmem_free() interfaces.
+ */
+void *
+spl_kmem_alloc(size_t size, int flags, const char *func, int line)
+{
+ ASSERT0(flags & ~KM_PUBLIC_MASK);
+
+#if !defined(DEBUG_KMEM)
+ return (spl_kmem_alloc_impl(size, flags, NUMA_NO_NODE));
+#elif !defined(DEBUG_KMEM_TRACKING)
+ return (spl_kmem_alloc_debug(size, flags, NUMA_NO_NODE));
+#else
+ return (spl_kmem_alloc_track(size, flags, func, line, NUMA_NO_NODE));
+#endif
+}
+EXPORT_SYMBOL(spl_kmem_alloc);
+
+void *
+spl_kmem_zalloc(size_t size, int flags, const char *func, int line)
+{
+ ASSERT0(flags & ~KM_PUBLIC_MASK);
+
+ flags |= KM_ZERO;
+
+#if !defined(DEBUG_KMEM)
+ return (spl_kmem_alloc_impl(size, flags, NUMA_NO_NODE));
+#elif !defined(DEBUG_KMEM_TRACKING)
+ return (spl_kmem_alloc_debug(size, flags, NUMA_NO_NODE));
+#else
+ return (spl_kmem_alloc_track(size, flags, func, line, NUMA_NO_NODE));
+#endif
+}
+EXPORT_SYMBOL(spl_kmem_zalloc);
+
+void
+spl_kmem_free(const void *buf, size_t size)
+{
+#if !defined(DEBUG_KMEM)
+ return (spl_kmem_free_impl(buf, size));
+#elif !defined(DEBUG_KMEM_TRACKING)
+ return (spl_kmem_free_debug(buf, size));
+#else
+ return (spl_kmem_free_track(buf, size));
+#endif
+}
+EXPORT_SYMBOL(spl_kmem_free);
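+
+/*
+ * Typical consumer pattern, assuming kmem_zalloc()/kmem_free() are thin
+ * wrappers which pass __func__ and __LINE__ through to the spl_kmem_*
+ * functions above (data_t is hypothetical):
+ *
+ *	data_t *dp = kmem_zalloc(sizeof (data_t), KM_SLEEP);
+ *	...
+ *	kmem_free(dp, sizeof (data_t));
+ */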
+
+#if defined(DEBUG_KMEM) && defined(DEBUG_KMEM_TRACKING)
+static char *
+spl_sprintf_addr(kmem_debug_t *kd, char *str, int len, int min)
+{
+ int size = ((len - 1) < kd->kd_size) ? (len - 1) : kd->kd_size;
+ int i, flag = 1;
+
+ ASSERT(str != NULL && len >= 17);
+ memset(str, 0, len);
+
+ /*
+ * Check for a fully printable string, and while we are at
+ * it place the printable characters in the passed buffer.
+ */
+ for (i = 0; i < size; i++) {
+ str[i] = ((char *)(kd->kd_addr))[i];
+ if (isprint(str[i])) {
+ continue;
+ } else {
+ /*
+ * Minimum number of printable characters found
+ * to make it worthwhile to print this as ascii.
+ */
+ if (i > min)
+ break;
+
+ flag = 0;
+ break;
+ }
+ }
+
+ if (!flag) {
+ sprintf(str, "%02x%02x%02x%02x%02x%02x%02x%02x",
+ *((uint8_t *)kd->kd_addr),
+ *((uint8_t *)kd->kd_addr + 2),
+ *((uint8_t *)kd->kd_addr + 4),
+ *((uint8_t *)kd->kd_addr + 6),
+ *((uint8_t *)kd->kd_addr + 8),
+ *((uint8_t *)kd->kd_addr + 10),
+ *((uint8_t *)kd->kd_addr + 12),
+ *((uint8_t *)kd->kd_addr + 14));
+ }
+
+ return (str);
+}
+
+static int
+spl_kmem_init_tracking(struct list_head *list, spinlock_t *lock, int size)
+{
+ int i;
+
+ spin_lock_init(lock);
+ INIT_LIST_HEAD(list);
+
+ for (i = 0; i < size; i++)
+ INIT_HLIST_HEAD(&kmem_table[i]);
+
+ return (0);
+}
+
+static void
+spl_kmem_fini_tracking(struct list_head *list, spinlock_t *lock)
+{
+ unsigned long flags;
+ kmem_debug_t *kd;
+ char str[17];
+
+ spin_lock_irqsave(lock, flags);
+ if (!list_empty(list))
+ printk(KERN_WARNING "%-16s %-5s %-16s %s:%s\n", "address",
+ "size", "data", "func", "line");
+
+ list_for_each_entry(kd, list, kd_list) {
+ printk(KERN_WARNING "%p %-5d %-16s %s:%d\n", kd->kd_addr,
+ (int)kd->kd_size, spl_sprintf_addr(kd, str, 17, 8),
+ kd->kd_func, kd->kd_line);
+ }
+
+ spin_unlock_irqrestore(lock, flags);
+}
+#endif /* DEBUG_KMEM && DEBUG_KMEM_TRACKING */
+
+int
+spl_kmem_init(void)
+{
+#ifdef DEBUG_KMEM
+	kmem_alloc_used_set(0);
+
+#ifdef DEBUG_KMEM_TRACKING
+ spl_kmem_init_tracking(&kmem_list, &kmem_lock, KMEM_TABLE_SIZE);
+#endif /* DEBUG_KMEM_TRACKING */
+#endif /* DEBUG_KMEM */
+
+ return (0);
+}
+
+void
+spl_kmem_fini(void)
+{
+#ifdef DEBUG_KMEM
+ /*
+ * Display all unreclaimed memory addresses, including the
+ * allocation size and the first few bytes of what's located
+ * at that address to aid in debugging. Performance is not
+ * a serious concern here since it is module unload time.
+ */
+ if (kmem_alloc_used_read() != 0)
+ printk(KERN_WARNING "kmem leaked %ld/%llu bytes\n",
+ (unsigned long)kmem_alloc_used_read(), kmem_alloc_max);
+
+#ifdef DEBUG_KMEM_TRACKING
+ spl_kmem_fini_tracking(&kmem_list, &kmem_lock);
+#endif /* DEBUG_KMEM_TRACKING */
+#endif /* DEBUG_KMEM */
+}
diff --git a/module/os/linux/spl/spl-kobj.c b/module/os/linux/spl/spl-kobj.c
new file mode 100644
index 000000000..7019369bd
--- /dev/null
+++ b/module/os/linux/spl/spl-kobj.c
@@ -0,0 +1,86 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <[email protected]>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Solaris Porting Layer (SPL) Kobj Implementation.
+ */
+
+#include <sys/kobj.h>
+
+struct _buf *
+kobj_open_file(const char *name)
+{
+ struct _buf *file;
+ vnode_t *vp;
+ int rc;
+
+ file = kmalloc(sizeof (_buf_t), kmem_flags_convert(KM_SLEEP));
+ if (file == NULL)
+ return ((_buf_t *)-1UL);
+
+ if ((rc = vn_open(name, UIO_SYSSPACE, FREAD, 0644, &vp, 0, 0))) {
+ kfree(file);
+ return ((_buf_t *)-1UL);
+ }
+
+ file->vp = vp;
+
+ return (file);
+} /* kobj_open_file() */
+EXPORT_SYMBOL(kobj_open_file);
+
+void
+kobj_close_file(struct _buf *file)
+{
+ VOP_CLOSE(file->vp, 0, 0, 0, 0, 0);
+ kfree(file);
+} /* kobj_close_file() */
+EXPORT_SYMBOL(kobj_close_file);
+
+int
+kobj_read_file(struct _buf *file, char *buf, unsigned size, unsigned off)
+{
+ ssize_t resid;
+
+ if (vn_rdwr(UIO_READ, file->vp, buf, size, (offset_t)off,
+ UIO_SYSSPACE, 0, 0, 0, &resid) != 0)
+ return (-1);
+
+ return (size - resid);
+} /* kobj_read_file() */
+EXPORT_SYMBOL(kobj_read_file);
+
+int
+kobj_get_filesize(struct _buf *file, uint64_t *size)
+{
+ vattr_t vap;
+ int rc;
+
+ rc = VOP_GETATTR(file->vp, &vap, 0, 0, NULL);
+ if (rc)
+ return (rc);
+
+ *size = vap.va_size;
+
+ return (rc);
+} /* kobj_get_filesize() */
+EXPORT_SYMBOL(kobj_get_filesize);
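+
+/*
+ * Illustrative read sequence using the interfaces above, where buf is a
+ * caller-supplied buffer of at least size bytes; note the error sentinel
+ * for kobj_open_file() is (struct _buf *)-1UL rather than NULL:
+ *
+ *	struct _buf *file = kobj_open_file("/etc/hostid");
+ *	if (file != (struct _buf *)-1UL) {
+ *		uint64_t size;
+ *		if (kobj_get_filesize(file, &size) == 0)
+ *			(void) kobj_read_file(file, buf, size, 0);
+ *		kobj_close_file(file);
+ *	}
+ */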
diff --git a/module/os/linux/spl/spl-kstat.c b/module/os/linux/spl/spl-kstat.c
new file mode 100644
index 000000000..1f67bf157
--- /dev/null
+++ b/module/os/linux/spl/spl-kstat.c
@@ -0,0 +1,770 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <[email protected]>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Solaris Porting Layer (SPL) Kstat Implementation.
+ */
+
+#include <linux/seq_file.h>
+#include <sys/kstat.h>
+#include <sys/vmem.h>
+#include <sys/cmn_err.h>
+#include <sys/sysmacros.h>
+
+static kmutex_t kstat_module_lock;
+static struct list_head kstat_module_list;
+static kid_t kstat_id;
+
+static int
+kstat_resize_raw(kstat_t *ksp)
+{
+ if (ksp->ks_raw_bufsize == KSTAT_RAW_MAX)
+ return (ENOMEM);
+
+ vmem_free(ksp->ks_raw_buf, ksp->ks_raw_bufsize);
+ ksp->ks_raw_bufsize = MIN(ksp->ks_raw_bufsize * 2, KSTAT_RAW_MAX);
+ ksp->ks_raw_buf = vmem_alloc(ksp->ks_raw_bufsize, KM_SLEEP);
+
+ return (0);
+}
+
+void
+kstat_waitq_enter(kstat_io_t *kiop)
+{
+ hrtime_t new, delta;
+ ulong_t wcnt;
+
+ new = gethrtime();
+ delta = new - kiop->wlastupdate;
+ kiop->wlastupdate = new;
+ wcnt = kiop->wcnt++;
+ if (wcnt != 0) {
+ kiop->wlentime += delta * wcnt;
+ kiop->wtime += delta;
+ }
+}
+EXPORT_SYMBOL(kstat_waitq_enter);
+
+void
+kstat_waitq_exit(kstat_io_t *kiop)
+{
+ hrtime_t new, delta;
+ ulong_t wcnt;
+
+ new = gethrtime();
+ delta = new - kiop->wlastupdate;
+ kiop->wlastupdate = new;
+ wcnt = kiop->wcnt--;
+ ASSERT((int)wcnt > 0);
+ kiop->wlentime += delta * wcnt;
+ kiop->wtime += delta;
+}
+EXPORT_SYMBOL(kstat_waitq_exit);
+
+void
+kstat_runq_enter(kstat_io_t *kiop)
+{
+ hrtime_t new, delta;
+ ulong_t rcnt;
+
+ new = gethrtime();
+ delta = new - kiop->rlastupdate;
+ kiop->rlastupdate = new;
+ rcnt = kiop->rcnt++;
+ if (rcnt != 0) {
+ kiop->rlentime += delta * rcnt;
+ kiop->rtime += delta;
+ }
+}
+EXPORT_SYMBOL(kstat_runq_enter);
+
+void
+kstat_runq_exit(kstat_io_t *kiop)
+{
+ hrtime_t new, delta;
+ ulong_t rcnt;
+
+ new = gethrtime();
+ delta = new - kiop->rlastupdate;
+ kiop->rlastupdate = new;
+ rcnt = kiop->rcnt--;
+ ASSERT((int)rcnt > 0);
+ kiop->rlentime += delta * rcnt;
+ kiop->rtime += delta;
+}
+EXPORT_SYMBOL(kstat_runq_exit);
+
+static int
+kstat_seq_show_headers(struct seq_file *f)
+{
+ kstat_t *ksp = (kstat_t *)f->private;
+ int rc = 0;
+
+ ASSERT(ksp->ks_magic == KS_MAGIC);
+
+ seq_printf(f, "%d %d 0x%02x %d %d %lld %lld\n",
+ ksp->ks_kid, ksp->ks_type, ksp->ks_flags,
+ ksp->ks_ndata, (int)ksp->ks_data_size,
+ ksp->ks_crtime, ksp->ks_snaptime);
+
+ switch (ksp->ks_type) {
+ case KSTAT_TYPE_RAW:
+restart:
+ if (ksp->ks_raw_ops.headers) {
+ rc = ksp->ks_raw_ops.headers(
+ ksp->ks_raw_buf, ksp->ks_raw_bufsize);
+ if (rc == ENOMEM && !kstat_resize_raw(ksp))
+ goto restart;
+ if (!rc)
+ seq_puts(f, ksp->ks_raw_buf);
+ } else {
+ seq_printf(f, "raw data\n");
+ }
+ break;
+ case KSTAT_TYPE_NAMED:
+ seq_printf(f, "%-31s %-4s %s\n",
+ "name", "type", "data");
+ break;
+ case KSTAT_TYPE_INTR:
+ seq_printf(f, "%-8s %-8s %-8s %-8s %-8s\n",
+ "hard", "soft", "watchdog",
+ "spurious", "multsvc");
+ break;
+ case KSTAT_TYPE_IO:
+ seq_printf(f,
+ "%-8s %-8s %-8s %-8s %-8s %-8s "
+ "%-8s %-8s %-8s %-8s %-8s %-8s\n",
+ "nread", "nwritten", "reads", "writes",
+ "wtime", "wlentime", "wupdate",
+ "rtime", "rlentime", "rupdate",
+ "wcnt", "rcnt");
+ break;
+ case KSTAT_TYPE_TIMER:
+ seq_printf(f,
+ "%-31s %-8s "
+ "%-8s %-8s %-8s %-8s %-8s\n",
+ "name", "events", "elapsed",
+ "min", "max", "start", "stop");
+ break;
+ default:
+ PANIC("Undefined kstat type %d\n", ksp->ks_type);
+ }
+
+ return (-rc);
+}
+
+static int
+kstat_seq_show_raw(struct seq_file *f, unsigned char *p, int l)
+{
+ int i, j;
+
+ for (i = 0; ; i++) {
+ seq_printf(f, "%03x:", i);
+
+ for (j = 0; j < 16; j++) {
+ if (i * 16 + j >= l) {
+ seq_printf(f, "\n");
+ goto out;
+ }
+
+ seq_printf(f, " %02x", (unsigned char)p[i * 16 + j]);
+ }
+ seq_printf(f, "\n");
+ }
+out:
+ return (0);
+}
+
+static int
+kstat_seq_show_named(struct seq_file *f, kstat_named_t *knp)
+{
+ seq_printf(f, "%-31s %-4d ", knp->name, knp->data_type);
+
+ switch (knp->data_type) {
+ case KSTAT_DATA_CHAR:
+ knp->value.c[15] = '\0'; /* NULL terminate */
+ seq_printf(f, "%-16s", knp->value.c);
+ break;
+ /*
+	 * NOTE - We need to be more careful about what tokens are
+	 * used for each arch; for now this is correct for x86_64.
+ */
+ case KSTAT_DATA_INT32:
+ seq_printf(f, "%d", knp->value.i32);
+ break;
+ case KSTAT_DATA_UINT32:
+ seq_printf(f, "%u", knp->value.ui32);
+ break;
+ case KSTAT_DATA_INT64:
+ seq_printf(f, "%lld", (signed long long)knp->value.i64);
+ break;
+ case KSTAT_DATA_UINT64:
+ seq_printf(f, "%llu",
+ (unsigned long long)knp->value.ui64);
+ break;
+ case KSTAT_DATA_LONG:
+ seq_printf(f, "%ld", knp->value.l);
+ break;
+ case KSTAT_DATA_ULONG:
+ seq_printf(f, "%lu", knp->value.ul);
+ break;
+ case KSTAT_DATA_STRING:
+ KSTAT_NAMED_STR_PTR(knp)
+ [KSTAT_NAMED_STR_BUFLEN(knp)-1] = '\0';
+ seq_printf(f, "%s", KSTAT_NAMED_STR_PTR(knp));
+ break;
+ default:
+ PANIC("Undefined kstat data type %d\n", knp->data_type);
+ }
+
+ seq_printf(f, "\n");
+
+ return (0);
+}
+
+static int
+kstat_seq_show_intr(struct seq_file *f, kstat_intr_t *kip)
+{
+ seq_printf(f, "%-8u %-8u %-8u %-8u %-8u\n",
+ kip->intrs[KSTAT_INTR_HARD],
+ kip->intrs[KSTAT_INTR_SOFT],
+ kip->intrs[KSTAT_INTR_WATCHDOG],
+ kip->intrs[KSTAT_INTR_SPURIOUS],
+ kip->intrs[KSTAT_INTR_MULTSVC]);
+
+ return (0);
+}
+
+static int
+kstat_seq_show_io(struct seq_file *f, kstat_io_t *kip)
+{
+ /* though wlentime & friends are signed, they will never be negative */
+ seq_printf(f,
+ "%-8llu %-8llu %-8u %-8u %-8llu %-8llu "
+ "%-8llu %-8llu %-8llu %-8llu %-8u %-8u\n",
+ kip->nread, kip->nwritten,
+ kip->reads, kip->writes,
+ kip->wtime, kip->wlentime, kip->wlastupdate,
+ kip->rtime, kip->rlentime, kip->rlastupdate,
+ kip->wcnt, kip->rcnt);
+
+ return (0);
+}
+
+static int
+kstat_seq_show_timer(struct seq_file *f, kstat_timer_t *ktp)
+{
+ seq_printf(f,
+ "%-31s %-8llu %-8llu %-8llu %-8llu %-8llu %-8llu\n",
+ ktp->name, ktp->num_events, ktp->elapsed_time,
+ ktp->min_time, ktp->max_time,
+ ktp->start_time, ktp->stop_time);
+
+ return (0);
+}
+
+static int
+kstat_seq_show(struct seq_file *f, void *p)
+{
+ kstat_t *ksp = (kstat_t *)f->private;
+ int rc = 0;
+
+ ASSERT(ksp->ks_magic == KS_MAGIC);
+
+ switch (ksp->ks_type) {
+ case KSTAT_TYPE_RAW:
+restart:
+ if (ksp->ks_raw_ops.data) {
+ rc = ksp->ks_raw_ops.data(
+ ksp->ks_raw_buf, ksp->ks_raw_bufsize, p);
+ if (rc == ENOMEM && !kstat_resize_raw(ksp))
+ goto restart;
+ if (!rc)
+ seq_puts(f, ksp->ks_raw_buf);
+ } else {
+ ASSERT(ksp->ks_ndata == 1);
+ rc = kstat_seq_show_raw(f, ksp->ks_data,
+ ksp->ks_data_size);
+ }
+ break;
+ case KSTAT_TYPE_NAMED:
+ rc = kstat_seq_show_named(f, (kstat_named_t *)p);
+ break;
+ case KSTAT_TYPE_INTR:
+ rc = kstat_seq_show_intr(f, (kstat_intr_t *)p);
+ break;
+ case KSTAT_TYPE_IO:
+ rc = kstat_seq_show_io(f, (kstat_io_t *)p);
+ break;
+ case KSTAT_TYPE_TIMER:
+ rc = kstat_seq_show_timer(f, (kstat_timer_t *)p);
+ break;
+ default:
+ PANIC("Undefined kstat type %d\n", ksp->ks_type);
+ }
+
+ return (-rc);
+}
+
+static int
+kstat_default_update(kstat_t *ksp, int rw)
+{
+ ASSERT(ksp != NULL);
+
+ if (rw == KSTAT_WRITE)
+ return (EACCES);
+
+ return (0);
+}
+
+static void *
+kstat_seq_data_addr(kstat_t *ksp, loff_t n)
+{
+ void *rc = NULL;
+
+ switch (ksp->ks_type) {
+ case KSTAT_TYPE_RAW:
+ if (ksp->ks_raw_ops.addr)
+ rc = ksp->ks_raw_ops.addr(ksp, n);
+ else
+ rc = ksp->ks_data;
+ break;
+ case KSTAT_TYPE_NAMED:
+ rc = ksp->ks_data + n * sizeof (kstat_named_t);
+ break;
+ case KSTAT_TYPE_INTR:
+ rc = ksp->ks_data + n * sizeof (kstat_intr_t);
+ break;
+ case KSTAT_TYPE_IO:
+ rc = ksp->ks_data + n * sizeof (kstat_io_t);
+ break;
+ case KSTAT_TYPE_TIMER:
+ rc = ksp->ks_data + n * sizeof (kstat_timer_t);
+ break;
+ default:
+ PANIC("Undefined kstat type %d\n", ksp->ks_type);
+ }
+
+ return (rc);
+}
+
+static void *
+kstat_seq_start(struct seq_file *f, loff_t *pos)
+{
+ loff_t n = *pos;
+ kstat_t *ksp = (kstat_t *)f->private;
+ ASSERT(ksp->ks_magic == KS_MAGIC);
+
+ mutex_enter(ksp->ks_lock);
+
+ if (ksp->ks_type == KSTAT_TYPE_RAW) {
+ ksp->ks_raw_bufsize = PAGE_SIZE;
+ ksp->ks_raw_buf = vmem_alloc(ksp->ks_raw_bufsize, KM_SLEEP);
+ }
+
+	/* Dynamically update the kstat; on error the existing data is used */
+ (void) ksp->ks_update(ksp, KSTAT_READ);
+
+ ksp->ks_snaptime = gethrtime();
+
+ if (!(ksp->ks_flags & KSTAT_FLAG_NO_HEADERS) && !n &&
+ kstat_seq_show_headers(f))
+ return (NULL);
+
+ if (n >= ksp->ks_ndata)
+ return (NULL);
+
+ return (kstat_seq_data_addr(ksp, n));
+}
+
+static void *
+kstat_seq_next(struct seq_file *f, void *p, loff_t *pos)
+{
+ kstat_t *ksp = (kstat_t *)f->private;
+ ASSERT(ksp->ks_magic == KS_MAGIC);
+
+ ++*pos;
+ if (*pos >= ksp->ks_ndata)
+ return (NULL);
+
+ return (kstat_seq_data_addr(ksp, *pos));
+}
+
+static void
+kstat_seq_stop(struct seq_file *f, void *v)
+{
+ kstat_t *ksp = (kstat_t *)f->private;
+ ASSERT(ksp->ks_magic == KS_MAGIC);
+
+ if (ksp->ks_type == KSTAT_TYPE_RAW)
+ vmem_free(ksp->ks_raw_buf, ksp->ks_raw_bufsize);
+
+ mutex_exit(ksp->ks_lock);
+}
+
+static struct seq_operations kstat_seq_ops = {
+ .show = kstat_seq_show,
+ .start = kstat_seq_start,
+ .next = kstat_seq_next,
+ .stop = kstat_seq_stop,
+};
+
+static kstat_module_t *
+kstat_find_module(char *name)
+{
+ kstat_module_t *module;
+
+ list_for_each_entry(module, &kstat_module_list, ksm_module_list) {
+ if (strncmp(name, module->ksm_name, KSTAT_STRLEN) == 0)
+ return (module);
+ }
+
+ return (NULL);
+}
+
+static kstat_module_t *
+kstat_create_module(char *name)
+{
+ kstat_module_t *module;
+ struct proc_dir_entry *pde;
+
+ pde = proc_mkdir(name, proc_spl_kstat);
+ if (pde == NULL)
+ return (NULL);
+
+ module = kmem_alloc(sizeof (kstat_module_t), KM_SLEEP);
+ module->ksm_proc = pde;
+ strlcpy(module->ksm_name, name, KSTAT_STRLEN+1);
+ INIT_LIST_HEAD(&module->ksm_kstat_list);
+ list_add_tail(&module->ksm_module_list, &kstat_module_list);
+
+ return (module);
+}
+
+static void
+kstat_delete_module(kstat_module_t *module)
+{
+ ASSERT(list_empty(&module->ksm_kstat_list));
+ remove_proc_entry(module->ksm_name, proc_spl_kstat);
+ list_del(&module->ksm_module_list);
+ kmem_free(module, sizeof (kstat_module_t));
+}
+
+static int
+proc_kstat_open(struct inode *inode, struct file *filp)
+{
+ struct seq_file *f;
+ int rc;
+
+ rc = seq_open(filp, &kstat_seq_ops);
+ if (rc)
+ return (rc);
+
+ f = filp->private_data;
+ f->private = PDE_DATA(inode);
+
+ return (rc);
+}
+
+static ssize_t
+proc_kstat_write(struct file *filp, const char __user *buf, size_t len,
+ loff_t *ppos)
+{
+ struct seq_file *f = filp->private_data;
+ kstat_t *ksp = f->private;
+ int rc;
+
+ ASSERT(ksp->ks_magic == KS_MAGIC);
+
+ mutex_enter(ksp->ks_lock);
+ rc = ksp->ks_update(ksp, KSTAT_WRITE);
+ mutex_exit(ksp->ks_lock);
+
+ if (rc)
+ return (-rc);
+
+ *ppos += len;
+ return (len);
+}
+
+static struct file_operations proc_kstat_operations = {
+ .open = proc_kstat_open,
+ .write = proc_kstat_write,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+void
+__kstat_set_raw_ops(kstat_t *ksp,
+ int (*headers)(char *buf, size_t size),
+ int (*data)(char *buf, size_t size, void *data),
+ void *(*addr)(kstat_t *ksp, loff_t index))
+{
+ ksp->ks_raw_ops.headers = headers;
+ ksp->ks_raw_ops.data = data;
+ ksp->ks_raw_ops.addr = addr;
+}
+EXPORT_SYMBOL(__kstat_set_raw_ops);
+
+void
+kstat_proc_entry_init(kstat_proc_entry_t *kpep, const char *module,
+ const char *name)
+{
+ kpep->kpe_owner = NULL;
+ kpep->kpe_proc = NULL;
+ INIT_LIST_HEAD(&kpep->kpe_list);
+ strncpy(kpep->kpe_module, module, KSTAT_STRLEN);
+ strncpy(kpep->kpe_name, name, KSTAT_STRLEN);
+}
+EXPORT_SYMBOL(kstat_proc_entry_init);
+
+kstat_t *
+__kstat_create(const char *ks_module, int ks_instance, const char *ks_name,
+ const char *ks_class, uchar_t ks_type, uint_t ks_ndata,
+ uchar_t ks_flags)
+{
+ kstat_t *ksp;
+
+ ASSERT(ks_module);
+ ASSERT(ks_instance == 0);
+ ASSERT(ks_name);
+
+ if ((ks_type == KSTAT_TYPE_INTR) || (ks_type == KSTAT_TYPE_IO))
+ ASSERT(ks_ndata == 1);
+
+ ksp = kmem_zalloc(sizeof (*ksp), KM_SLEEP);
+ if (ksp == NULL)
+ return (ksp);
+
+ mutex_enter(&kstat_module_lock);
+ ksp->ks_kid = kstat_id;
+ kstat_id++;
+ mutex_exit(&kstat_module_lock);
+
+ ksp->ks_magic = KS_MAGIC;
+ mutex_init(&ksp->ks_private_lock, NULL, MUTEX_DEFAULT, NULL);
+ ksp->ks_lock = &ksp->ks_private_lock;
+
+ ksp->ks_crtime = gethrtime();
+ ksp->ks_snaptime = ksp->ks_crtime;
+ ksp->ks_instance = ks_instance;
+ strncpy(ksp->ks_class, ks_class, KSTAT_STRLEN);
+ ksp->ks_type = ks_type;
+ ksp->ks_flags = ks_flags;
+ ksp->ks_update = kstat_default_update;
+ ksp->ks_private = NULL;
+ ksp->ks_raw_ops.headers = NULL;
+ ksp->ks_raw_ops.data = NULL;
+ ksp->ks_raw_ops.addr = NULL;
+ ksp->ks_raw_buf = NULL;
+ ksp->ks_raw_bufsize = 0;
+ kstat_proc_entry_init(&ksp->ks_proc, ks_module, ks_name);
+
+ switch (ksp->ks_type) {
+ case KSTAT_TYPE_RAW:
+ ksp->ks_ndata = 1;
+ ksp->ks_data_size = ks_ndata;
+ break;
+ case KSTAT_TYPE_NAMED:
+ ksp->ks_ndata = ks_ndata;
+ ksp->ks_data_size = ks_ndata * sizeof (kstat_named_t);
+ break;
+ case KSTAT_TYPE_INTR:
+ ksp->ks_ndata = ks_ndata;
+ ksp->ks_data_size = ks_ndata * sizeof (kstat_intr_t);
+ break;
+ case KSTAT_TYPE_IO:
+ ksp->ks_ndata = ks_ndata;
+ ksp->ks_data_size = ks_ndata * sizeof (kstat_io_t);
+ break;
+ case KSTAT_TYPE_TIMER:
+ ksp->ks_ndata = ks_ndata;
+ ksp->ks_data_size = ks_ndata * sizeof (kstat_timer_t);
+ break;
+ default:
+ PANIC("Undefined kstat type %d\n", ksp->ks_type);
+ }
+
+ if (ksp->ks_flags & KSTAT_FLAG_VIRTUAL) {
+ ksp->ks_data = NULL;
+ } else {
+ ksp->ks_data = kmem_zalloc(ksp->ks_data_size, KM_SLEEP);
+ if (ksp->ks_data == NULL) {
+ kmem_free(ksp, sizeof (*ksp));
+ ksp = NULL;
+ }
+ }
+
+ return (ksp);
+}
+EXPORT_SYMBOL(__kstat_create);
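+
+/*
+ * Sketch of a typical virtual named kstat, assuming the usual
+ * kstat_create()/kstat_install() wrappers resolve to the __kstat_*
+ * functions in this file; the "example" name and example_stat variable
+ * are hypothetical:
+ *
+ *	static kstat_named_t example_stat;
+ *
+ *	kstat_t *ksp = kstat_create("zfs", 0, "example", "misc",
+ *	    KSTAT_TYPE_NAMED, 1, KSTAT_FLAG_VIRTUAL);
+ *	if (ksp != NULL) {
+ *		ksp->ks_data = &example_stat;
+ *		kstat_install(ksp);
+ *	}
+ */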
+
+static int
+kstat_detect_collision(kstat_proc_entry_t *kpep)
+{
+ kstat_module_t *module;
+ kstat_proc_entry_t *tmp;
+ char *parent;
+ char *cp;
+
+ parent = kmem_asprintf("%s", kpep->kpe_module);
+
+ if ((cp = strrchr(parent, '/')) == NULL) {
+ strfree(parent);
+ return (0);
+ }
+
+ cp[0] = '\0';
+ if ((module = kstat_find_module(parent)) != NULL) {
+ list_for_each_entry(tmp, &module->ksm_kstat_list, kpe_list) {
+ if (strncmp(tmp->kpe_name, cp+1, KSTAT_STRLEN) == 0) {
+ strfree(parent);
+ return (EEXIST);
+ }
+ }
+ }
+
+ strfree(parent);
+ return (0);
+}
+
+/*
+ * Add a file to the proc filesystem under the kstat namespace (i.e.
+ * /proc/spl/kstat/). The file need not necessarily be implemented as a
+ * kstat.
+ */
+void
+kstat_proc_entry_install(kstat_proc_entry_t *kpep, mode_t mode,
+ const struct file_operations *file_ops, void *data)
+{
+ kstat_module_t *module;
+ kstat_proc_entry_t *tmp;
+
+ ASSERT(kpep);
+
+ mutex_enter(&kstat_module_lock);
+
+ module = kstat_find_module(kpep->kpe_module);
+ if (module == NULL) {
+ if (kstat_detect_collision(kpep) != 0) {
+ cmn_err(CE_WARN, "kstat_create('%s', '%s'): namespace" \
+ " collision", kpep->kpe_module, kpep->kpe_name);
+ goto out;
+ }
+ module = kstat_create_module(kpep->kpe_module);
+ if (module == NULL)
+ goto out;
+ }
+
+ /*
+	 * Only one entry with this name is allowed per module; on failure
+	 * the module shouldn't be deleted because we know it has at least
+	 * one entry.
+ */
+ list_for_each_entry(tmp, &module->ksm_kstat_list, kpe_list) {
+ if (strncmp(tmp->kpe_name, kpep->kpe_name, KSTAT_STRLEN) == 0)
+ goto out;
+ }
+
+ list_add_tail(&kpep->kpe_list, &module->ksm_kstat_list);
+
+ kpep->kpe_owner = module;
+ kpep->kpe_proc = proc_create_data(kpep->kpe_name, mode,
+ module->ksm_proc, file_ops, data);
+ if (kpep->kpe_proc == NULL) {
+ list_del_init(&kpep->kpe_list);
+ if (list_empty(&module->ksm_kstat_list))
+ kstat_delete_module(module);
+ }
+out:
+ mutex_exit(&kstat_module_lock);
+}
+EXPORT_SYMBOL(kstat_proc_entry_install);
+
+void
+__kstat_install(kstat_t *ksp)
+{
+	mode_t mode;
+
+	ASSERT(ksp);
+ /* Specify permission modes for different kstats */
+ if (strncmp(ksp->ks_proc.kpe_name, "dbufs", KSTAT_STRLEN) == 0) {
+ mode = 0600;
+ } else {
+ mode = 0644;
+ }
+ kstat_proc_entry_install(
+ &ksp->ks_proc, mode, &proc_kstat_operations, ksp);
+}
+EXPORT_SYMBOL(__kstat_install);
+
+void
+kstat_proc_entry_delete(kstat_proc_entry_t *kpep)
+{
+ kstat_module_t *module = kpep->kpe_owner;
+ if (kpep->kpe_proc)
+ remove_proc_entry(kpep->kpe_name, module->ksm_proc);
+
+ mutex_enter(&kstat_module_lock);
+ list_del_init(&kpep->kpe_list);
+
+ /*
+	 * Remove the top level module directory if this was its last entry
+	 * and the list is now empty.
+ */
+ if (kpep->kpe_proc && list_empty(&module->ksm_kstat_list))
+ kstat_delete_module(module);
+ mutex_exit(&kstat_module_lock);
+}
+EXPORT_SYMBOL(kstat_proc_entry_delete);
+
+void
+__kstat_delete(kstat_t *ksp)
+{
+ kstat_proc_entry_delete(&ksp->ks_proc);
+
+ if (!(ksp->ks_flags & KSTAT_FLAG_VIRTUAL))
+ kmem_free(ksp->ks_data, ksp->ks_data_size);
+
+ ksp->ks_lock = NULL;
+ mutex_destroy(&ksp->ks_private_lock);
+ kmem_free(ksp, sizeof (*ksp));
+}
+EXPORT_SYMBOL(__kstat_delete);
+
+int
+spl_kstat_init(void)
+{
+ mutex_init(&kstat_module_lock, NULL, MUTEX_DEFAULT, NULL);
+ INIT_LIST_HEAD(&kstat_module_list);
+ kstat_id = 0;
+ return (0);
+}
+
+void
+spl_kstat_fini(void)
+{
+ ASSERT(list_empty(&kstat_module_list));
+ mutex_destroy(&kstat_module_lock);
+}
diff --git a/module/os/linux/spl/spl-proc.c b/module/os/linux/spl/spl-proc.c
new file mode 100644
index 000000000..a75bcc214
--- /dev/null
+++ b/module/os/linux/spl/spl-proc.c
@@ -0,0 +1,782 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <[email protected]>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Solaris Porting Layer (SPL) Proc Implementation.
+ */
+
+#include <sys/systeminfo.h>
+#include <sys/kstat.h>
+#include <sys/kmem.h>
+#include <sys/kmem_cache.h>
+#include <sys/vmem.h>
+#include <sys/taskq.h>
+#include <sys/proc.h>
+#include <linux/ctype.h>
+#include <linux/kmod.h>
+#include <linux/seq_file.h>
+#include <linux/uaccess.h>
+#include <linux/version.h>
+
+#if defined(CONSTIFY_PLUGIN) && LINUX_VERSION_CODE >= KERNEL_VERSION(3, 8, 0)
+typedef struct ctl_table __no_const spl_ctl_table;
+#else
+typedef struct ctl_table spl_ctl_table;
+#endif
+
+static unsigned long table_min = 0;
+static unsigned long table_max = ~0;
+
+static struct ctl_table_header *spl_header = NULL;
+static struct proc_dir_entry *proc_spl = NULL;
+static struct proc_dir_entry *proc_spl_kmem = NULL;
+static struct proc_dir_entry *proc_spl_kmem_slab = NULL;
+static struct proc_dir_entry *proc_spl_taskq_all = NULL;
+static struct proc_dir_entry *proc_spl_taskq = NULL;
+struct proc_dir_entry *proc_spl_kstat = NULL;
+
+static int
+proc_copyin_string(char *kbuffer, int kbuffer_size, const char *ubuffer,
+ int ubuffer_size)
+{
+ int size;
+
+ if (ubuffer_size > kbuffer_size)
+ return (-EOVERFLOW);
+
+ if (copy_from_user((void *)kbuffer, (void *)ubuffer, ubuffer_size))
+ return (-EFAULT);
+
+ /* strip trailing whitespace */
+ size = strnlen(kbuffer, ubuffer_size);
+	while (size-- > 0)
+ if (!isspace(kbuffer[size]))
+ break;
+
+ /* empty string */
+ if (size < 0)
+ return (-EINVAL);
+
+ /* no space to terminate */
+ if (size == kbuffer_size)
+ return (-EOVERFLOW);
+
+ kbuffer[size + 1] = 0;
+ return (0);
+}
+
+static int
+proc_copyout_string(char *ubuffer, int ubuffer_size, const char *kbuffer,
+ char *append)
+{
+ /*
+ * NB if 'append' != NULL, it's a single character to append to the
+	 * copied out string - usually "\n" for /proc entries and ""
+	 * (i.e. a terminating zero byte) for sysctl entries.
+ */
+ int size = MIN(strlen(kbuffer), ubuffer_size);
+
+ if (copy_to_user(ubuffer, kbuffer, size))
+ return (-EFAULT);
+
+ if (append != NULL && size < ubuffer_size) {
+ if (copy_to_user(ubuffer + size, append, 1))
+ return (-EFAULT);
+
+ size++;
+ }
+
+ return (size);
+}
+
+#ifdef DEBUG_KMEM
+static int
+proc_domemused(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ int rc = 0;
+ unsigned long min = 0, max = ~0, val;
+ spl_ctl_table dummy = *table;
+
+ dummy.data = &val;
+ dummy.proc_handler = &proc_dointvec;
+ dummy.extra1 = &min;
+ dummy.extra2 = &max;
+
+ if (write) {
+ *ppos += *lenp;
+ } else {
+#ifdef HAVE_ATOMIC64_T
+ val = atomic64_read((atomic64_t *)table->data);
+#else
+ val = atomic_read((atomic_t *)table->data);
+#endif /* HAVE_ATOMIC64_T */
+ rc = proc_doulongvec_minmax(&dummy, write, buffer, lenp, ppos);
+ }
+
+ return (rc);
+}
+#endif /* DEBUG_KMEM */
+
+static int
+proc_doslab(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ int rc = 0;
+ unsigned long min = 0, max = ~0, val = 0, mask;
+ spl_ctl_table dummy = *table;
+ spl_kmem_cache_t *skc;
+
+ dummy.data = &val;
+ dummy.proc_handler = &proc_dointvec;
+ dummy.extra1 = &min;
+ dummy.extra2 = &max;
+
+ if (write) {
+ *ppos += *lenp;
+ } else {
+ down_read(&spl_kmem_cache_sem);
+ mask = (unsigned long)table->data;
+
+ list_for_each_entry(skc, &spl_kmem_cache_list, skc_list) {
+
+ /* Only use slabs of the correct kmem/vmem type */
+ if (!(skc->skc_flags & mask))
+ continue;
+
+ /* Sum the specified field for selected slabs */
+ switch (mask & (KMC_TOTAL | KMC_ALLOC | KMC_MAX)) {
+ case KMC_TOTAL:
+ val += skc->skc_slab_size * skc->skc_slab_total;
+ break;
+ case KMC_ALLOC:
+ val += skc->skc_obj_size * skc->skc_obj_alloc;
+ break;
+ case KMC_MAX:
+ val += skc->skc_obj_size * skc->skc_obj_max;
+ break;
+ }
+ }
+
+ up_read(&spl_kmem_cache_sem);
+ rc = proc_doulongvec_minmax(&dummy, write, buffer, lenp, ppos);
+ }
+
+ return (rc);
+}
+
+static int
+proc_dohostid(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ int len, rc = 0;
+ char *end, str[32];
+
+ if (write) {
+ /*
+ * We can't use proc_doulongvec_minmax() in the write
+		 * case here because the hostid, while a hex value, has no
+		 * leading 0x, which confuses the helper function.
+ */
+ rc = proc_copyin_string(str, sizeof (str), buffer, *lenp);
+ if (rc < 0)
+ return (rc);
+
+ spl_hostid = simple_strtoul(str, &end, 16);
+ if (str == end)
+ return (-EINVAL);
+
+ } else {
+ len = snprintf(str, sizeof (str), "%lx",
+ (unsigned long) zone_get_hostid(NULL));
+ if (*ppos >= len)
+ rc = 0;
+ else
+ rc = proc_copyout_string(buffer,
+ *lenp, str + *ppos, "\n");
+
+ if (rc >= 0) {
+ *lenp = rc;
+ *ppos += rc;
+ }
+ }
+
+ return (rc);
+}
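+
+/*
+ * Given the sysctl hierarchy registered below (kernel -> spl -> hostid),
+ * the handler above backs /proc/sys/kernel/spl/hostid; the value is
+ * written as bare hex with no leading 0x, e.g.:
+ *
+ *	echo 007f0100 > /proc/sys/kernel/spl/hostid
+ *	cat /proc/sys/kernel/spl/hostid
+ */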
+
+static void
+taskq_seq_show_headers(struct seq_file *f)
+{
+ seq_printf(f, "%-25s %5s %5s %5s %5s %5s %5s %12s %5s %10s\n",
+ "taskq", "act", "nthr", "spwn", "maxt", "pri",
+ "mina", "maxa", "cura", "flags");
+}
+
+/* indices into the lheads array below */
+#define LHEAD_PEND 0
+#define LHEAD_PRIO 1
+#define LHEAD_DELAY 2
+#define LHEAD_WAIT 3
+#define LHEAD_ACTIVE 4
+#define LHEAD_SIZE 5
+
+/* BEGIN CSTYLED */
+static unsigned int spl_max_show_tasks = 512;
+module_param(spl_max_show_tasks, uint, 0644);
+MODULE_PARM_DESC(spl_max_show_tasks, "Max number of tasks shown in taskq proc");
+/* END CSTYLED */
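+
+/*
+ * For example, assuming the standard module parameter path,
+ * "echo 1024 > /sys/module/spl/parameters/spl_max_show_tasks" raises the
+ * per-list limit, and a value of 0 disables truncation entirely.
+ */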
+
+static int
+taskq_seq_show_impl(struct seq_file *f, void *p, boolean_t allflag)
+{
+ taskq_t *tq = p;
+ taskq_thread_t *tqt;
+ spl_wait_queue_entry_t *wq;
+ struct task_struct *tsk;
+ taskq_ent_t *tqe;
+ char name[100];
+ struct list_head *lheads[LHEAD_SIZE], *lh;
+ static char *list_names[LHEAD_SIZE] =
+ {"pend", "prio", "delay", "wait", "active" };
+ int i, j, have_lheads = 0;
+ unsigned long wflags, flags;
+
+ spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class);
+ spin_lock_irqsave(&tq->tq_wait_waitq.lock, wflags);
+
+ /* get the various lists and check whether they're empty */
+ lheads[LHEAD_PEND] = &tq->tq_pend_list;
+ lheads[LHEAD_PRIO] = &tq->tq_prio_list;
+ lheads[LHEAD_DELAY] = &tq->tq_delay_list;
+#ifdef HAVE_WAIT_QUEUE_HEAD_ENTRY
+ lheads[LHEAD_WAIT] = &tq->tq_wait_waitq.head;
+#else
+ lheads[LHEAD_WAIT] = &tq->tq_wait_waitq.task_list;
+#endif
+ lheads[LHEAD_ACTIVE] = &tq->tq_active_list;
+
+ for (i = 0; i < LHEAD_SIZE; ++i) {
+ if (list_empty(lheads[i]))
+ lheads[i] = NULL;
+ else
+ ++have_lheads;
+ }
+
+ /* early return in non-"all" mode if lists are all empty */
+ if (!allflag && !have_lheads) {
+ spin_unlock_irqrestore(&tq->tq_wait_waitq.lock, wflags);
+ spin_unlock_irqrestore(&tq->tq_lock, flags);
+ return (0);
+ }
+
+ /* unlock the waitq quickly */
+ if (!lheads[LHEAD_WAIT])
+ spin_unlock_irqrestore(&tq->tq_wait_waitq.lock, wflags);
+
+ /* show the base taskq contents */
+ snprintf(name, sizeof (name), "%s/%d", tq->tq_name, tq->tq_instance);
+ seq_printf(f, "%-25s ", name);
+ seq_printf(f, "%5d %5d %5d %5d %5d %5d %12d %5d %10x\n",
+ tq->tq_nactive, tq->tq_nthreads, tq->tq_nspawn,
+ tq->tq_maxthreads, tq->tq_pri, tq->tq_minalloc, tq->tq_maxalloc,
+ tq->tq_nalloc, tq->tq_flags);
+
+ /* show the active list */
+ if (lheads[LHEAD_ACTIVE]) {
+ j = 0;
+ list_for_each_entry(tqt, &tq->tq_active_list, tqt_active_list) {
+ if (j == 0)
+ seq_printf(f, "\t%s:",
+ list_names[LHEAD_ACTIVE]);
+ else if (j == 2) {
+ seq_printf(f, "\n\t ");
+ j = 0;
+ }
+ seq_printf(f, " [%d]%pf(%ps)",
+ tqt->tqt_thread->pid,
+ tqt->tqt_task->tqent_func,
+ tqt->tqt_task->tqent_arg);
+ ++j;
+ }
+ seq_printf(f, "\n");
+ }
+
+ for (i = LHEAD_PEND; i <= LHEAD_WAIT; ++i)
+ if (lheads[i]) {
+ j = 0;
+ list_for_each(lh, lheads[i]) {
+ if (spl_max_show_tasks != 0 &&
+ j >= spl_max_show_tasks) {
+ seq_printf(f, "\n\t(truncated)");
+ break;
+ }
+ /* show the wait waitq list */
+ if (i == LHEAD_WAIT) {
+#ifdef HAVE_WAIT_QUEUE_HEAD_ENTRY
+ wq = list_entry(lh,
+ spl_wait_queue_entry_t, entry);
+#else
+ wq = list_entry(lh,
+ spl_wait_queue_entry_t, task_list);
+#endif
+ if (j == 0)
+ seq_printf(f, "\t%s:",
+ list_names[i]);
+ else if (j % 8 == 0)
+ seq_printf(f, "\n\t ");
+
+ tsk = wq->private;
+ seq_printf(f, " %d", tsk->pid);
+ /* pend, prio and delay lists */
+ } else {
+ tqe = list_entry(lh, taskq_ent_t,
+ tqent_list);
+ if (j == 0)
+ seq_printf(f, "\t%s:",
+ list_names[i]);
+ else if (j % 2 == 0)
+ seq_printf(f, "\n\t ");
+
+ seq_printf(f, " %pf(%ps)",
+ tqe->tqent_func,
+ tqe->tqent_arg);
+ }
+ ++j;
+ }
+ seq_printf(f, "\n");
+ }
+ if (lheads[LHEAD_WAIT])
+ spin_unlock_irqrestore(&tq->tq_wait_waitq.lock, wflags);
+ spin_unlock_irqrestore(&tq->tq_lock, flags);
+
+ return (0);
+}
+
+static int
+taskq_all_seq_show(struct seq_file *f, void *p)
+{
+ return (taskq_seq_show_impl(f, p, B_TRUE));
+}
+
+static int
+taskq_seq_show(struct seq_file *f, void *p)
+{
+ return (taskq_seq_show_impl(f, p, B_FALSE));
+}
+
+static void *
+taskq_seq_start(struct seq_file *f, loff_t *pos)
+{
+ struct list_head *p;
+ loff_t n = *pos;
+
+ down_read(&tq_list_sem);
+ if (!n)
+ taskq_seq_show_headers(f);
+
+ p = tq_list.next;
+ while (n--) {
+ p = p->next;
+ if (p == &tq_list)
+ return (NULL);
+ }
+
+ return (list_entry(p, taskq_t, tq_taskqs));
+}
+
+static void *
+taskq_seq_next(struct seq_file *f, void *p, loff_t *pos)
+{
+ taskq_t *tq = p;
+
+ ++*pos;
+ return ((tq->tq_taskqs.next == &tq_list) ?
+ NULL : list_entry(tq->tq_taskqs.next, taskq_t, tq_taskqs));
+}
+
+static void
+slab_seq_show_headers(struct seq_file *f)
+{
+ seq_printf(f,
+ "--------------------- cache ----------"
+ "--------------------------------------------- "
+ "----- slab ------ "
+ "---- object ----- "
+ "--- emergency ---\n");
+ seq_printf(f,
+ "name "
+ " flags size alloc slabsize objsize "
+ "total alloc max "
+ "total alloc max "
+ "dlock alloc max\n");
+}
+
+static int
+slab_seq_show(struct seq_file *f, void *p)
+{
+ spl_kmem_cache_t *skc = p;
+
+ ASSERT(skc->skc_magic == SKC_MAGIC);
+
+ /*
+	 * Backed by the Linux slab allocator; see /proc/slabinfo.
+ */
+ if (skc->skc_flags & KMC_SLAB)
+ return (0);
+
+ spin_lock(&skc->skc_lock);
+ seq_printf(f, "%-36s ", skc->skc_name);
+ seq_printf(f, "0x%05lx %9lu %9lu %8u %8u "
+ "%5lu %5lu %5lu %5lu %5lu %5lu %5lu %5lu %5lu\n",
+ (long unsigned)skc->skc_flags,
+ (long unsigned)(skc->skc_slab_size * skc->skc_slab_total),
+ (long unsigned)(skc->skc_obj_size * skc->skc_obj_alloc),
+ (unsigned)skc->skc_slab_size,
+ (unsigned)skc->skc_obj_size,
+ (long unsigned)skc->skc_slab_total,
+ (long unsigned)skc->skc_slab_alloc,
+ (long unsigned)skc->skc_slab_max,
+ (long unsigned)skc->skc_obj_total,
+ (long unsigned)skc->skc_obj_alloc,
+ (long unsigned)skc->skc_obj_max,
+ (long unsigned)skc->skc_obj_deadlock,
+ (long unsigned)skc->skc_obj_emergency,
+ (long unsigned)skc->skc_obj_emergency_max);
+
+ spin_unlock(&skc->skc_lock);
+
+ return (0);
+}
+
+static void *
+slab_seq_start(struct seq_file *f, loff_t *pos)
+{
+ struct list_head *p;
+ loff_t n = *pos;
+
+ down_read(&spl_kmem_cache_sem);
+ if (!n)
+ slab_seq_show_headers(f);
+
+ p = spl_kmem_cache_list.next;
+ while (n--) {
+ p = p->next;
+ if (p == &spl_kmem_cache_list)
+ return (NULL);
+ }
+
+ return (list_entry(p, spl_kmem_cache_t, skc_list));
+}
+
+static void *
+slab_seq_next(struct seq_file *f, void *p, loff_t *pos)
+{
+ spl_kmem_cache_t *skc = p;
+
+ ++*pos;
+ return ((skc->skc_list.next == &spl_kmem_cache_list) ?
+ NULL : list_entry(skc->skc_list.next, spl_kmem_cache_t, skc_list));
+}
+
+static void
+slab_seq_stop(struct seq_file *f, void *v)
+{
+ up_read(&spl_kmem_cache_sem);
+}
+
+static struct seq_operations slab_seq_ops = {
+ .show = slab_seq_show,
+ .start = slab_seq_start,
+ .next = slab_seq_next,
+ .stop = slab_seq_stop,
+};
+
+static int
+proc_slab_open(struct inode *inode, struct file *filp)
+{
+ return (seq_open(filp, &slab_seq_ops));
+}
+
+static struct file_operations proc_slab_operations = {
+ .open = proc_slab_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static void
+taskq_seq_stop(struct seq_file *f, void *v)
+{
+ up_read(&tq_list_sem);
+}
+
+static struct seq_operations taskq_all_seq_ops = {
+ .show = taskq_all_seq_show,
+ .start = taskq_seq_start,
+ .next = taskq_seq_next,
+ .stop = taskq_seq_stop,
+};
+
+static struct seq_operations taskq_seq_ops = {
+ .show = taskq_seq_show,
+ .start = taskq_seq_start,
+ .next = taskq_seq_next,
+ .stop = taskq_seq_stop,
+};
+
+static int
+proc_taskq_all_open(struct inode *inode, struct file *filp)
+{
+ return (seq_open(filp, &taskq_all_seq_ops));
+}
+
+static int
+proc_taskq_open(struct inode *inode, struct file *filp)
+{
+ return (seq_open(filp, &taskq_seq_ops));
+}
+
+static struct file_operations proc_taskq_all_operations = {
+ .open = proc_taskq_all_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static struct file_operations proc_taskq_operations = {
+ .open = proc_taskq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static struct ctl_table spl_kmem_table[] = {
+#ifdef DEBUG_KMEM
+ {
+ .procname = "kmem_used",
+ .data = &kmem_alloc_used,
+#ifdef HAVE_ATOMIC64_T
+ .maxlen = sizeof (atomic64_t),
+#else
+ .maxlen = sizeof (atomic_t),
+#endif /* HAVE_ATOMIC64_T */
+ .mode = 0444,
+ .proc_handler = &proc_domemused,
+ },
+ {
+ .procname = "kmem_max",
+ .data = &kmem_alloc_max,
+ .maxlen = sizeof (unsigned long),
+ .extra1 = &table_min,
+ .extra2 = &table_max,
+ .mode = 0444,
+ .proc_handler = &proc_doulongvec_minmax,
+ },
+#endif /* DEBUG_KMEM */
+ {
+ .procname = "slab_kmem_total",
+ .data = (void *)(KMC_KMEM | KMC_TOTAL),
+ .maxlen = sizeof (unsigned long),
+ .extra1 = &table_min,
+ .extra2 = &table_max,
+ .mode = 0444,
+ .proc_handler = &proc_doslab,
+ },
+ {
+ .procname = "slab_kmem_alloc",
+ .data = (void *)(KMC_KMEM | KMC_ALLOC),
+ .maxlen = sizeof (unsigned long),
+ .extra1 = &table_min,
+ .extra2 = &table_max,
+ .mode = 0444,
+ .proc_handler = &proc_doslab,
+ },
+ {
+ .procname = "slab_kmem_max",
+ .data = (void *)(KMC_KMEM | KMC_MAX),
+ .maxlen = sizeof (unsigned long),
+ .extra1 = &table_min,
+ .extra2 = &table_max,
+ .mode = 0444,
+ .proc_handler = &proc_doslab,
+ },
+ {
+ .procname = "slab_vmem_total",
+ .data = (void *)(KMC_VMEM | KMC_TOTAL),
+ .maxlen = sizeof (unsigned long),
+ .extra1 = &table_min,
+ .extra2 = &table_max,
+ .mode = 0444,
+ .proc_handler = &proc_doslab,
+ },
+ {
+ .procname = "slab_vmem_alloc",
+ .data = (void *)(KMC_VMEM | KMC_ALLOC),
+ .maxlen = sizeof (unsigned long),
+ .extra1 = &table_min,
+ .extra2 = &table_max,
+ .mode = 0444,
+ .proc_handler = &proc_doslab,
+ },
+ {
+ .procname = "slab_vmem_max",
+ .data = (void *)(KMC_VMEM | KMC_MAX),
+ .maxlen = sizeof (unsigned long),
+ .extra1 = &table_min,
+ .extra2 = &table_max,
+ .mode = 0444,
+ .proc_handler = &proc_doslab,
+ },
+ {},
+};
+
+static struct ctl_table spl_kstat_table[] = {
+ {},
+};
+
+static struct ctl_table spl_table[] = {
+ /*
+ * NB No .strategy entries have been provided since
+ * sysctl(8) prefers to go via /proc for portability.
+ */
+ {
+ .procname = "gitrev",
+ .data = spl_gitrev,
+ .maxlen = sizeof (spl_gitrev),
+ .mode = 0444,
+ .proc_handler = &proc_dostring,
+ },
+ {
+ .procname = "hostid",
+ .data = &spl_hostid,
+ .maxlen = sizeof (unsigned long),
+ .mode = 0644,
+ .proc_handler = &proc_dohostid,
+ },
+ {
+ .procname = "kmem",
+ .mode = 0555,
+ .child = spl_kmem_table,
+ },
+ {
+ .procname = "kstat",
+ .mode = 0555,
+ .child = spl_kstat_table,
+ },
+ {},
+};
+
+static struct ctl_table spl_dir[] = {
+ {
+ .procname = "spl",
+ .mode = 0555,
+ .child = spl_table,
+ },
+ {}
+};
+
+static struct ctl_table spl_root[] = {
+ {
+#ifdef HAVE_CTL_NAME
+ .ctl_name = CTL_KERN,
+#endif
+ .procname = "kernel",
+ .mode = 0555,
+ .child = spl_dir,
+ },
+ {}
+};
+
+int
+spl_proc_init(void)
+{
+ int rc = 0;
+
+ spl_header = register_sysctl_table(spl_root);
+ if (spl_header == NULL)
+ return (-EUNATCH);
+
+ proc_spl = proc_mkdir("spl", NULL);
+ if (proc_spl == NULL) {
+ rc = -EUNATCH;
+ goto out;
+ }
+
+ proc_spl_taskq_all = proc_create_data("taskq-all", 0444, proc_spl,
+ &proc_taskq_all_operations, NULL);
+ if (proc_spl_taskq_all == NULL) {
+ rc = -EUNATCH;
+ goto out;
+ }
+
+ proc_spl_taskq = proc_create_data("taskq", 0444, proc_spl,
+ &proc_taskq_operations, NULL);
+ if (proc_spl_taskq == NULL) {
+ rc = -EUNATCH;
+ goto out;
+ }
+
+ proc_spl_kmem = proc_mkdir("kmem", proc_spl);
+ if (proc_spl_kmem == NULL) {
+ rc = -EUNATCH;
+ goto out;
+ }
+
+ proc_spl_kmem_slab = proc_create_data("slab", 0444, proc_spl_kmem,
+ &proc_slab_operations, NULL);
+ if (proc_spl_kmem_slab == NULL) {
+ rc = -EUNATCH;
+ goto out;
+ }
+
+ proc_spl_kstat = proc_mkdir("kstat", proc_spl);
+ if (proc_spl_kstat == NULL) {
+ rc = -EUNATCH;
+ goto out;
+ }
+out:
+ if (rc) {
+ remove_proc_entry("kstat", proc_spl);
+ remove_proc_entry("slab", proc_spl_kmem);
+ remove_proc_entry("kmem", proc_spl);
+ remove_proc_entry("taskq-all", proc_spl);
+ remove_proc_entry("taskq", proc_spl);
+ remove_proc_entry("spl", NULL);
+ unregister_sysctl_table(spl_header);
+ }
+
+ return (rc);
+}
+
+void
+spl_proc_fini(void)
+{
+ remove_proc_entry("kstat", proc_spl);
+ remove_proc_entry("slab", proc_spl_kmem);
+ remove_proc_entry("kmem", proc_spl);
+ remove_proc_entry("taskq-all", proc_spl);
+ remove_proc_entry("taskq", proc_spl);
+ remove_proc_entry("spl", NULL);
+
+ ASSERT(spl_header != NULL);
+ unregister_sysctl_table(spl_header);
+}
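+
+/*
+ * For reference, the layout created by spl_proc_init() above:
+ *
+ *	/proc/spl/taskq-all
+ *	/proc/spl/taskq
+ *	/proc/spl/kmem/slab
+ *	/proc/spl/kstat/
+ */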
diff --git a/module/os/linux/spl/spl-procfs-list.c b/module/os/linux/spl/spl-procfs-list.c
new file mode 100644
index 000000000..f6a00da5c
--- /dev/null
+++ b/module/os/linux/spl/spl-procfs-list.c
@@ -0,0 +1,257 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2018 by Delphix. All rights reserved.
+ */
+
+#include <sys/list.h>
+#include <sys/mutex.h>
+#include <sys/procfs_list.h>
+#include <linux/proc_fs.h>
+
+/*
+ * A procfs_list is a wrapper around a linked list which implements the seq_file
+ * interface, allowing the contents of the list to be exposed through procfs.
+ * The kernel already has some utilities to help implement the seq_file
+ * interface for linked lists (seq_list_*), but they aren't appropriate for use
+ * with lists that have many entries, because seq_list_start walks the list at
+ * the start of each read syscall to find where it left off, so reading a file
+ * ends up being quadratic in the number of entries in the list.
+ *
+ * This implementation avoids this penalty by maintaining a separate cursor into
+ * the list per instance of the file that is open. It also maintains some extra
+ * information in each node of the list to prevent reads of entries that have
+ * been dropped from the list.
+ *
+ * Callers should only add elements to the list using procfs_list_add, which
+ * adds an element to the tail of the list. Other operations can be performed
+ * directly on the wrapped list using the normal list manipulation functions,
+ * but elements should only be removed from the head of the list.
+ */
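+
+/*
+ * Illustrative usage (a sketch; my_entry_t, my_show(), and pl are
+ * hypothetical and not part of this file):
+ *
+ *	typedef struct my_entry {
+ *		procfs_list_node_t	me_node;
+ *		uint64_t		me_value;
+ *	} my_entry_t;
+ *
+ *	procfs_list_install("my_module", "my_list", 0644, &pl, my_show,
+ *	    NULL, NULL, offsetof(my_entry_t, me_node));
+ *
+ *	mutex_enter(&pl.pl_lock);
+ *	procfs_list_add(&pl, entry);	(assigns the node's unique id)
+ *	mutex_exit(&pl.pl_lock);
+ */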
+
+#define NODE_ID(procfs_list, obj) \
+ (((procfs_list_node_t *)(((char *)obj) + \
+ (procfs_list)->pl_node_offset))->pln_id)
+
+typedef struct procfs_list_cursor {
+ procfs_list_t *procfs_list; /* List into which this cursor points */
+ void *cached_node; /* Most recently accessed node */
+ loff_t cached_pos; /* Position of cached_node */
+} procfs_list_cursor_t;
+
+static int
+procfs_list_seq_show(struct seq_file *f, void *p)
+{
+ procfs_list_cursor_t *cursor = f->private;
+ procfs_list_t *procfs_list = cursor->procfs_list;
+
+ ASSERT(MUTEX_HELD(&procfs_list->pl_lock));
+ if (p == SEQ_START_TOKEN) {
+ if (procfs_list->pl_show_header != NULL)
+ return (procfs_list->pl_show_header(f));
+ else
+ return (0);
+ }
+ return (procfs_list->pl_show(f, p));
+}
+
+static void *
+procfs_list_next_node(procfs_list_cursor_t *cursor, loff_t *pos)
+{
+ void *next_node;
+ procfs_list_t *procfs_list = cursor->procfs_list;
+
+ if (cursor->cached_node == SEQ_START_TOKEN)
+ next_node = list_head(&procfs_list->pl_list);
+ else
+ next_node = list_next(&procfs_list->pl_list,
+ cursor->cached_node);
+
+ if (next_node != NULL) {
+ cursor->cached_node = next_node;
+ cursor->cached_pos = NODE_ID(procfs_list, cursor->cached_node);
+ *pos = cursor->cached_pos;
+ }
+ return (next_node);
+}
+
+static void *
+procfs_list_seq_start(struct seq_file *f, loff_t *pos)
+{
+ procfs_list_cursor_t *cursor = f->private;
+ procfs_list_t *procfs_list = cursor->procfs_list;
+
+ mutex_enter(&procfs_list->pl_lock);
+
+ if (*pos == 0) {
+ cursor->cached_node = SEQ_START_TOKEN;
+ cursor->cached_pos = 0;
+ return (SEQ_START_TOKEN);
+ }
+
+ /*
+	 * Check if our cached pointer has become stale, which happens if
+	 * the message where we left off has been dropped from the list since
+ * the last read syscall completed.
+ */
+ void *oldest_node = list_head(&procfs_list->pl_list);
+ if (cursor->cached_node != SEQ_START_TOKEN && (oldest_node == NULL ||
+ NODE_ID(procfs_list, oldest_node) > cursor->cached_pos))
+ return (ERR_PTR(-EIO));
+
+ /*
+ * If it isn't starting from the beginning of the file, the seq_file
+ * code will either pick up at the same position it visited last or the
+ * following one.
+ */
+ if (*pos == cursor->cached_pos) {
+ return (cursor->cached_node);
+ } else {
+ ASSERT3U(*pos, ==, cursor->cached_pos + 1);
+ return (procfs_list_next_node(cursor, pos));
+ }
+}
+
+static void *
+procfs_list_seq_next(struct seq_file *f, void *p, loff_t *pos)
+{
+ procfs_list_cursor_t *cursor = f->private;
+ ASSERT(MUTEX_HELD(&cursor->procfs_list->pl_lock));
+ return (procfs_list_next_node(cursor, pos));
+}
+
+static void
+procfs_list_seq_stop(struct seq_file *f, void *p)
+{
+ procfs_list_cursor_t *cursor = f->private;
+ procfs_list_t *procfs_list = cursor->procfs_list;
+ mutex_exit(&procfs_list->pl_lock);
+}
+
+static struct seq_operations procfs_list_seq_ops = {
+ .show = procfs_list_seq_show,
+ .start = procfs_list_seq_start,
+ .next = procfs_list_seq_next,
+ .stop = procfs_list_seq_stop,
+};
+
+static int
+procfs_list_open(struct inode *inode, struct file *filp)
+{
+ int rc = seq_open_private(filp, &procfs_list_seq_ops,
+ sizeof (procfs_list_cursor_t));
+ if (rc != 0)
+ return (rc);
+
+ struct seq_file *f = filp->private_data;
+ procfs_list_cursor_t *cursor = f->private;
+ cursor->procfs_list = PDE_DATA(inode);
+ cursor->cached_node = NULL;
+ cursor->cached_pos = 0;
+
+ return (0);
+}
+
+static ssize_t
+procfs_list_write(struct file *filp, const char __user *buf, size_t len,
+ loff_t *ppos)
+{
+ struct seq_file *f = filp->private_data;
+ procfs_list_cursor_t *cursor = f->private;
+ procfs_list_t *procfs_list = cursor->procfs_list;
+ int rc;
+
+ if (procfs_list->pl_clear != NULL &&
+ (rc = procfs_list->pl_clear(procfs_list)) != 0)
+ return (-rc);
+ return (len);
+}
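+
+/*
+ * Any write to the file clears the backing list when a pl_clear callback
+ * was registered. For example, for a list exposed through the kstat
+ * namespace (the dbgmsg path below is illustrative):
+ *
+ *	echo 0 > /proc/spl/kstat/zfs/dbgmsg
+ */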
+
+static struct file_operations procfs_list_operations = {
+ .owner = THIS_MODULE,
+ .open = procfs_list_open,
+ .write = procfs_list_write,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_private,
+};
+
+/*
+ * Initialize a procfs_list and create a file for it in the proc filesystem
+ * under the kstat namespace.
+ */
+void
+procfs_list_install(const char *module,
+ const char *name,
+ mode_t mode,
+ procfs_list_t *procfs_list,
+ int (*show)(struct seq_file *f, void *p),
+ int (*show_header)(struct seq_file *f),
+ int (*clear)(procfs_list_t *procfs_list),
+ size_t procfs_list_node_off)
+{
+ mutex_init(&procfs_list->pl_lock, NULL, MUTEX_DEFAULT, NULL);
+ list_create(&procfs_list->pl_list,
+ procfs_list_node_off + sizeof (procfs_list_node_t),
+ procfs_list_node_off + offsetof(procfs_list_node_t, pln_link));
+ procfs_list->pl_next_id = 1; /* Save id 0 for SEQ_START_TOKEN */
+ procfs_list->pl_show = show;
+ procfs_list->pl_show_header = show_header;
+ procfs_list->pl_clear = clear;
+ procfs_list->pl_node_offset = procfs_list_node_off;
+
+ kstat_proc_entry_init(&procfs_list->pl_kstat_entry, module, name);
+ kstat_proc_entry_install(&procfs_list->pl_kstat_entry, mode,
+ &procfs_list_operations, procfs_list);
+}
+EXPORT_SYMBOL(procfs_list_install);
+
+/* Remove the proc filesystem file corresponding to the given list */
+void
+procfs_list_uninstall(procfs_list_t *procfs_list)
+{
+ kstat_proc_entry_delete(&procfs_list->pl_kstat_entry);
+}
+EXPORT_SYMBOL(procfs_list_uninstall);
+
+void
+procfs_list_destroy(procfs_list_t *procfs_list)
+{
+ ASSERT(list_is_empty(&procfs_list->pl_list));
+ list_destroy(&procfs_list->pl_list);
+ mutex_destroy(&procfs_list->pl_lock);
+}
+EXPORT_SYMBOL(procfs_list_destroy);
+
+/*
+ * Add a new node to the tail of the list. While the standard list manipulation
+ * functions can be used for all other operations, adding elements to the list
+ * should only be done using this helper so that the id of the new node is set
+ * correctly.
+ */
+void
+procfs_list_add(procfs_list_t *procfs_list, void *p)
+{
+ ASSERT(MUTEX_HELD(&procfs_list->pl_lock));
+ NODE_ID(procfs_list, p) = procfs_list->pl_next_id++;
+ list_insert_tail(&procfs_list->pl_list, p);
+}
+EXPORT_SYMBOL(procfs_list_add);
diff --git a/module/os/linux/spl/spl-taskq.c b/module/os/linux/spl/spl-taskq.c
new file mode 100644
index 000000000..90e1d0a4d
--- /dev/null
+++ b/module/os/linux/spl/spl-taskq.c
@@ -0,0 +1,1292 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <[email protected]>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Solaris Porting Layer (SPL) Task Queue Implementation.
+ */
+
+#include <sys/timer.h>
+#include <sys/taskq.h>
+#include <sys/kmem.h>
+#include <sys/tsd.h>
+#include <sys/simd.h>
+
+int spl_taskq_thread_bind = 0;
+module_param(spl_taskq_thread_bind, int, 0644);
+MODULE_PARM_DESC(spl_taskq_thread_bind, "Bind taskq thread to CPU by default");
+
+int spl_taskq_thread_dynamic = 1;
+module_param(spl_taskq_thread_dynamic, int, 0644);
+MODULE_PARM_DESC(spl_taskq_thread_dynamic, "Allow dynamic taskq threads");
+
+int spl_taskq_thread_priority = 1;
+module_param(spl_taskq_thread_priority, int, 0644);
+MODULE_PARM_DESC(spl_taskq_thread_priority,
+ "Allow non-default priority for taskq threads");
+
+int spl_taskq_thread_sequential = 4;
+module_param(spl_taskq_thread_sequential, int, 0644);
+MODULE_PARM_DESC(spl_taskq_thread_sequential,
+ "Create new taskq threads after N sequential tasks");
+
+/* Global system-wide dynamic task queue available for all consumers */
+taskq_t *system_taskq;
+EXPORT_SYMBOL(system_taskq);
+/* Global dynamic task queue for long delay */
+taskq_t *system_delay_taskq;
+EXPORT_SYMBOL(system_delay_taskq);
+
+/* Private dedicated taskq for creating new taskq threads on demand. */
+static taskq_t *dynamic_taskq;
+static taskq_thread_t *taskq_thread_create(taskq_t *);
+
+/* List of all taskqs */
+LIST_HEAD(tq_list);
+struct rw_semaphore tq_list_sem;
+static uint_t taskq_tsd;
+
+static int
+task_km_flags(uint_t flags)
+{
+ if (flags & TQ_NOSLEEP)
+ return (KM_NOSLEEP);
+
+ if (flags & TQ_PUSHPAGE)
+ return (KM_PUSHPAGE);
+
+ return (KM_SLEEP);
+}
+
+/*
+ * taskq_find_by_name - Find the largest instance number of a named taskq.
+ */
+static int
+taskq_find_by_name(const char *name)
+{
+ struct list_head *tql;
+ taskq_t *tq;
+
+ list_for_each_prev(tql, &tq_list) {
+ tq = list_entry(tql, taskq_t, tq_taskqs);
+ if (strcmp(name, tq->tq_name) == 0)
+ return (tq->tq_instance);
+ }
+ return (-1);
+}
+
+/*
+ * NOTE: Must be called with tq->tq_lock held, returns a list_t which
+ * is not attached to the free, work, or pending taskq lists.
+ */
+static taskq_ent_t *
+task_alloc(taskq_t *tq, uint_t flags, unsigned long *irqflags)
+{
+ taskq_ent_t *t;
+ int count = 0;
+
+ ASSERT(tq);
+retry:
+ /* Acquire taskq_ent_t's from free list if available */
+ if (!list_empty(&tq->tq_free_list) && !(flags & TQ_NEW)) {
+ t = list_entry(tq->tq_free_list.next, taskq_ent_t, tqent_list);
+
+ ASSERT(!(t->tqent_flags & TQENT_FLAG_PREALLOC));
+ ASSERT(!(t->tqent_flags & TQENT_FLAG_CANCEL));
+ ASSERT(!timer_pending(&t->tqent_timer));
+
+ list_del_init(&t->tqent_list);
+ return (t);
+ }
+
+ /* Free list is empty and memory allocations are prohibited */
+ if (flags & TQ_NOALLOC)
+ return (NULL);
+
+ /* Hit maximum taskq_ent_t pool size */
+ if (tq->tq_nalloc >= tq->tq_maxalloc) {
+ if (flags & TQ_NOSLEEP)
+ return (NULL);
+
+ /*
+ * Sleep periodically polling the free list for an available
+ * taskq_ent_t. Dispatching with TQ_SLEEP should always succeed
+		 * but we cannot block forever waiting for a taskq_ent_t to
+ * show up in the free list, otherwise a deadlock can happen.
+ *
+ * Therefore, we need to allocate a new task even if the number
+ * of allocated tasks is above tq->tq_maxalloc, but we still
+ * end up delaying the task allocation by one second, thereby
+ * throttling the task dispatch rate.
+ */
+ spin_unlock_irqrestore(&tq->tq_lock, *irqflags);
+ schedule_timeout(HZ / 100);
+ spin_lock_irqsave_nested(&tq->tq_lock, *irqflags,
+ tq->tq_lock_class);
+ if (count < 100) {
+ count++;
+ goto retry;
+ }
+ }
+
+ spin_unlock_irqrestore(&tq->tq_lock, *irqflags);
+ t = kmem_alloc(sizeof (taskq_ent_t), task_km_flags(flags));
+ spin_lock_irqsave_nested(&tq->tq_lock, *irqflags, tq->tq_lock_class);
+
+ if (t) {
+ taskq_init_ent(t);
+ tq->tq_nalloc++;
+ }
+
+ return (t);
+}
+
+/*
+ * NOTE: Must be called with tq->tq_lock held, expects the taskq_ent_t
+ * to already be removed from the free, work, or pending taskq lists.
+ */
+static void
+task_free(taskq_t *tq, taskq_ent_t *t)
+{
+ ASSERT(tq);
+ ASSERT(t);
+ ASSERT(list_empty(&t->tqent_list));
+ ASSERT(!timer_pending(&t->tqent_timer));
+
+ kmem_free(t, sizeof (taskq_ent_t));
+ tq->tq_nalloc--;
+}
+
+/*
+ * NOTE: Must be called with tq->tq_lock held, either destroys the
+ * taskq_ent_t if too many exist or moves it to the free list for later use.
+ */
+static void
+task_done(taskq_t *tq, taskq_ent_t *t)
+{
+ ASSERT(tq);
+ ASSERT(t);
+
+ /* Wake tasks blocked in taskq_wait_id() */
+ wake_up_all(&t->tqent_waitq);
+
+ list_del_init(&t->tqent_list);
+
+ if (tq->tq_nalloc <= tq->tq_minalloc) {
+ t->tqent_id = TASKQID_INVALID;
+ t->tqent_func = NULL;
+ t->tqent_arg = NULL;
+ t->tqent_flags = 0;
+
+ list_add_tail(&t->tqent_list, &tq->tq_free_list);
+ } else {
+ task_free(tq, t);
+ }
+}
+
+/*
+ * When a delayed task timer expires remove it from the delay list and
+ * add it to the priority list in order for immediate processing.
+ */
+static void
+task_expire_impl(taskq_ent_t *t)
+{
+ taskq_ent_t *w;
+ taskq_t *tq = t->tqent_taskq;
+ struct list_head *l;
+ unsigned long flags;
+
+ spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class);
+
+ if (t->tqent_flags & TQENT_FLAG_CANCEL) {
+ ASSERT(list_empty(&t->tqent_list));
+ spin_unlock_irqrestore(&tq->tq_lock, flags);
+ return;
+ }
+
+ t->tqent_birth = jiffies;
+ /*
+ * The priority list must be maintained in strict task id order
+ * from lowest to highest for lowest_id to be easily calculable.
+ */
+ list_del(&t->tqent_list);
+ list_for_each_prev(l, &tq->tq_prio_list) {
+ w = list_entry(l, taskq_ent_t, tqent_list);
+ if (w->tqent_id < t->tqent_id) {
+ list_add(&t->tqent_list, l);
+ break;
+ }
+ }
+ if (l == &tq->tq_prio_list)
+ list_add(&t->tqent_list, &tq->tq_prio_list);
+
+ spin_unlock_irqrestore(&tq->tq_lock, flags);
+
+ wake_up(&tq->tq_work_waitq);
+}
+
+static void
+task_expire(spl_timer_list_t tl)
+{
+ struct timer_list *tmr = (struct timer_list *)tl;
+ taskq_ent_t *t = from_timer(t, tmr, tqent_timer);
+ task_expire_impl(t);
+}
+
+/*
+ * Returns the lowest incomplete taskqid_t. The taskqid_t may
+ * be queued on the pending list, on the priority list, on the
+ * delay list, or on the work list currently being handled, but
+ * it is not 100% complete yet.
+ */
+static taskqid_t
+taskq_lowest_id(taskq_t *tq)
+{
+ taskqid_t lowest_id = tq->tq_next_id;
+ taskq_ent_t *t;
+ taskq_thread_t *tqt;
+
+ ASSERT(tq);
+
+ if (!list_empty(&tq->tq_pend_list)) {
+ t = list_entry(tq->tq_pend_list.next, taskq_ent_t, tqent_list);
+ lowest_id = MIN(lowest_id, t->tqent_id);
+ }
+
+ if (!list_empty(&tq->tq_prio_list)) {
+ t = list_entry(tq->tq_prio_list.next, taskq_ent_t, tqent_list);
+ lowest_id = MIN(lowest_id, t->tqent_id);
+ }
+
+ if (!list_empty(&tq->tq_delay_list)) {
+ t = list_entry(tq->tq_delay_list.next, taskq_ent_t, tqent_list);
+ lowest_id = MIN(lowest_id, t->tqent_id);
+ }
+
+ if (!list_empty(&tq->tq_active_list)) {
+ tqt = list_entry(tq->tq_active_list.next, taskq_thread_t,
+ tqt_active_list);
+ ASSERT(tqt->tqt_id != TASKQID_INVALID);
+ lowest_id = MIN(lowest_id, tqt->tqt_id);
+ }
+
+ return (lowest_id);
+}
+
+/*
+ * Insert a task into a list keeping the list sorted by increasing taskqid.
+ */
+static void
+taskq_insert_in_order(taskq_t *tq, taskq_thread_t *tqt)
+{
+ taskq_thread_t *w;
+ struct list_head *l;
+
+ ASSERT(tq);
+ ASSERT(tqt);
+
+ list_for_each_prev(l, &tq->tq_active_list) {
+ w = list_entry(l, taskq_thread_t, tqt_active_list);
+ if (w->tqt_id < tqt->tqt_id) {
+ list_add(&tqt->tqt_active_list, l);
+ break;
+ }
+ }
+ if (l == &tq->tq_active_list)
+ list_add(&tqt->tqt_active_list, &tq->tq_active_list);
+}
+
+/*
+ * Find and return a task from the given list if it exists. The list
+ * must be in lowest to highest task id order.
+ */
+static taskq_ent_t *
+taskq_find_list(taskq_t *tq, struct list_head *lh, taskqid_t id)
+{
+ struct list_head *l;
+ taskq_ent_t *t;
+
+ list_for_each(l, lh) {
+ t = list_entry(l, taskq_ent_t, tqent_list);
+
+ if (t->tqent_id == id)
+ return (t);
+
+ if (t->tqent_id > id)
+ break;
+ }
+
+ return (NULL);
+}
+
+/*
+ * Find an already dispatched task given the task id regardless of what
+ * state it is in. If a task is still pending it will be returned.
+ * If a task is executing, then -EBUSY will be returned instead.
+ * If the task has already been run then NULL is returned.
+ */
+static taskq_ent_t *
+taskq_find(taskq_t *tq, taskqid_t id)
+{
+ taskq_thread_t *tqt;
+ struct list_head *l;
+ taskq_ent_t *t;
+
+ t = taskq_find_list(tq, &tq->tq_delay_list, id);
+ if (t)
+ return (t);
+
+ t = taskq_find_list(tq, &tq->tq_prio_list, id);
+ if (t)
+ return (t);
+
+ t = taskq_find_list(tq, &tq->tq_pend_list, id);
+ if (t)
+ return (t);
+
+ list_for_each(l, &tq->tq_active_list) {
+ tqt = list_entry(l, taskq_thread_t, tqt_active_list);
+ if (tqt->tqt_id == id) {
+ /*
+ * Instead of returning tqt_task, we just return a non
+ * NULL value to prevent misuse, since tqt_task only
+ * has two valid fields.
+ */
+ return (ERR_PTR(-EBUSY));
+ }
+ }
+
+ return (NULL);
+}
+
+/*
+ * Theory for the taskq_wait_id(), taskq_wait_outstanding(), and
+ * taskq_wait() functions below.
+ *
+ * Taskq waiting is accomplished by tracking the lowest outstanding task
+ * id and the next available task id. As tasks are dispatched they are
+ * added to the tail of the pending, priority, or delay lists. As worker
+ * threads become available the tasks are removed from the heads of these
+ * lists and linked to the worker threads. This ensures the lists are
+ * kept sorted by lowest to highest task id.
+ *
+ * Therefore the lowest outstanding task id can be quickly determined by
+ * checking the head item from all of these lists. This value is stored
+ * with the taskq as the lowest id. It only needs to be recalculated when
+ * either the task with the current lowest id completes or is canceled.
+ *
+ * By blocking until the lowest task id exceeds the passed task id the
+ * taskq_wait_outstanding() function can be easily implemented. Similarly,
+ * by blocking until the lowest task id matches the next task id taskq_wait()
+ * can be implemented.
+ *
+ * Callers should be aware that when there are multiple worker threads it
+ * is possible for larger task ids to complete before smaller ones. Also
+ * when the taskq contains delay tasks with small task ids callers may
+ * block for a considerable length of time waiting for them to expire and
+ * execute.
+ */
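+
+/*
+ * For example (my_func and my_arg are hypothetical):
+ *
+ *	taskqid_t id = taskq_dispatch(tq, my_func, my_arg, TQ_SLEEP);
+ *	if (id != TASKQID_INVALID) {
+ *		taskq_wait_id(tq, id);		(this task only)
+ *		taskq_wait_outstanding(tq, id);	(all earlier dispatches)
+ *		taskq_wait(tq);			(the entire taskq)
+ *	}
+ */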
+static int
+taskq_wait_id_check(taskq_t *tq, taskqid_t id)
+{
+ int rc;
+ unsigned long flags;
+
+ spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class);
+ rc = (taskq_find(tq, id) == NULL);
+ spin_unlock_irqrestore(&tq->tq_lock, flags);
+
+ return (rc);
+}
+
+/*
+ * The taskq_wait_id() function blocks until the passed task id completes.
+ * This does not guarantee that all lower task ids have completed.
+ */
+void
+taskq_wait_id(taskq_t *tq, taskqid_t id)
+{
+ wait_event(tq->tq_wait_waitq, taskq_wait_id_check(tq, id));
+}
+EXPORT_SYMBOL(taskq_wait_id);
+
+static int
+taskq_wait_outstanding_check(taskq_t *tq, taskqid_t id)
+{
+ int rc;
+ unsigned long flags;
+
+ spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class);
+ rc = (id < tq->tq_lowest_id);
+ spin_unlock_irqrestore(&tq->tq_lock, flags);
+
+ return (rc);
+}
+
+/*
+ * The taskq_wait_outstanding() function will block until all tasks with a
+ * lower taskqid than the passed 'id' have been completed. Note that all
+ * task ids are assigned monotonically at dispatch time. Zero may be
+ * passed for the id to indicate all tasks dispatched up to this point,
+ * but not after, should be waited for.
+ */
+void
+taskq_wait_outstanding(taskq_t *tq, taskqid_t id)
+{
+ id = id ? id : tq->tq_next_id - 1;
+ wait_event(tq->tq_wait_waitq, taskq_wait_outstanding_check(tq, id));
+}
+EXPORT_SYMBOL(taskq_wait_outstanding);
+
+static int
+taskq_wait_check(taskq_t *tq)
+{
+ int rc;
+ unsigned long flags;
+
+ spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class);
+ rc = (tq->tq_lowest_id == tq->tq_next_id);
+ spin_unlock_irqrestore(&tq->tq_lock, flags);
+
+ return (rc);
+}
+
+/*
+ * The taskq_wait() function will block until the taskq is empty.
+ * This means that if a taskq re-dispatches work to itself taskq_wait()
+ * callers will block indefinitely.
+ */
+void
+taskq_wait(taskq_t *tq)
+{
+ wait_event(tq->tq_wait_waitq, taskq_wait_check(tq));
+}
+EXPORT_SYMBOL(taskq_wait);
+
+int
+taskq_member(taskq_t *tq, kthread_t *t)
+{
+ return (tq == (taskq_t *)tsd_get_by_thread(taskq_tsd, t));
+}
+EXPORT_SYMBOL(taskq_member);
+
+/*
+ * Cancel an already dispatched task given the task id. Still pending tasks
+ * will be immediately canceled, and if the task is active the function will
+ * block until it completes. Preallocated tasks which are canceled must be
+ * freed by the caller.
+ */
+int
+taskq_cancel_id(taskq_t *tq, taskqid_t id)
+{
+ taskq_ent_t *t;
+ int rc = ENOENT;
+ unsigned long flags;
+
+ ASSERT(tq);
+
+ spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class);
+ t = taskq_find(tq, id);
+ if (t && t != ERR_PTR(-EBUSY)) {
+ list_del_init(&t->tqent_list);
+ t->tqent_flags |= TQENT_FLAG_CANCEL;
+
+ /*
+ * When canceling the lowest outstanding task id we
+ * must recalculate the new lowest outstanding id.
+ */
+ if (tq->tq_lowest_id == t->tqent_id) {
+ tq->tq_lowest_id = taskq_lowest_id(tq);
+ ASSERT3S(tq->tq_lowest_id, >, t->tqent_id);
+ }
+
+ /*
+		 * The task_expire() function takes the tq->tq_lock so drop
+		 * the lock before synchronously cancelling the timer.
+ */
+ if (timer_pending(&t->tqent_timer)) {
+ spin_unlock_irqrestore(&tq->tq_lock, flags);
+ del_timer_sync(&t->tqent_timer);
+ spin_lock_irqsave_nested(&tq->tq_lock, flags,
+ tq->tq_lock_class);
+ }
+
+ if (!(t->tqent_flags & TQENT_FLAG_PREALLOC))
+ task_done(tq, t);
+
+ rc = 0;
+ }
+ spin_unlock_irqrestore(&tq->tq_lock, flags);
+
+ if (t == ERR_PTR(-EBUSY)) {
+ taskq_wait_id(tq, id);
+ rc = EBUSY;
+ }
+
+ return (rc);
+}
+EXPORT_SYMBOL(taskq_cancel_id);
+
+static int taskq_thread_spawn(taskq_t *tq);
+
+taskqid_t
+taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t flags)
+{
+ taskq_ent_t *t;
+ taskqid_t rc = TASKQID_INVALID;
+ unsigned long irqflags;
+
+ ASSERT(tq);
+ ASSERT(func);
+
+ spin_lock_irqsave_nested(&tq->tq_lock, irqflags, tq->tq_lock_class);
+
+ /* Taskq being destroyed and all tasks drained */
+ if (!(tq->tq_flags & TASKQ_ACTIVE))
+ goto out;
+
+	/* Do not queue the task unless there is an idle thread for it */
+ ASSERT(tq->tq_nactive <= tq->tq_nthreads);
+ if ((flags & TQ_NOQUEUE) && (tq->tq_nactive == tq->tq_nthreads)) {
+ /* Dynamic taskq may be able to spawn another thread */
+ if (!(tq->tq_flags & TASKQ_DYNAMIC) ||
+ taskq_thread_spawn(tq) == 0)
+ goto out;
+ }
+
+ if ((t = task_alloc(tq, flags, &irqflags)) == NULL)
+ goto out;
+
+ spin_lock(&t->tqent_lock);
+
+ /* Queue to the front of the list to enforce TQ_NOQUEUE semantics */
+ if (flags & TQ_NOQUEUE)
+ list_add(&t->tqent_list, &tq->tq_prio_list);
+ /* Queue to the priority list instead of the pending list */
+ else if (flags & TQ_FRONT)
+ list_add_tail(&t->tqent_list, &tq->tq_prio_list);
+ else
+ list_add_tail(&t->tqent_list, &tq->tq_pend_list);
+
+ t->tqent_id = rc = tq->tq_next_id;
+ tq->tq_next_id++;
+ t->tqent_func = func;
+ t->tqent_arg = arg;
+ t->tqent_taskq = tq;
+ t->tqent_timer.function = NULL;
+ t->tqent_timer.expires = 0;
+ t->tqent_birth = jiffies;
+
+ ASSERT(!(t->tqent_flags & TQENT_FLAG_PREALLOC));
+
+ spin_unlock(&t->tqent_lock);
+
+ wake_up(&tq->tq_work_waitq);
+out:
+ /* Spawn additional taskq threads if required. */
+ if (!(flags & TQ_NOQUEUE) && tq->tq_nactive == tq->tq_nthreads)
+ (void) taskq_thread_spawn(tq);
+
+ spin_unlock_irqrestore(&tq->tq_lock, irqflags);
+ return (rc);
+}
+EXPORT_SYMBOL(taskq_dispatch);
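+
+/*
+ * A sketch of the flag semantics above (my_func and arg are
+ * hypothetical):
+ *
+ *	(void) taskq_dispatch(tq, my_func, arg, TQ_FRONT);
+ *	if (taskq_dispatch(tq, my_func, arg, TQ_NOQUEUE) ==
+ *	    TASKQID_INVALID)
+ *		my_func(arg);	(no idle thread; e.g. run it directly)
+ */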
+
+taskqid_t
+taskq_dispatch_delay(taskq_t *tq, task_func_t func, void *arg,
+ uint_t flags, clock_t expire_time)
+{
+ taskqid_t rc = TASKQID_INVALID;
+ taskq_ent_t *t;
+ unsigned long irqflags;
+
+ ASSERT(tq);
+ ASSERT(func);
+
+ spin_lock_irqsave_nested(&tq->tq_lock, irqflags, tq->tq_lock_class);
+
+ /* Taskq being destroyed and all tasks drained */
+ if (!(tq->tq_flags & TASKQ_ACTIVE))
+ goto out;
+
+ if ((t = task_alloc(tq, flags, &irqflags)) == NULL)
+ goto out;
+
+ spin_lock(&t->tqent_lock);
+
+ /* Queue to the delay list for subsequent execution */
+ list_add_tail(&t->tqent_list, &tq->tq_delay_list);
+
+ t->tqent_id = rc = tq->tq_next_id;
+ tq->tq_next_id++;
+ t->tqent_func = func;
+ t->tqent_arg = arg;
+ t->tqent_taskq = tq;
+ t->tqent_timer.function = task_expire;
+ t->tqent_timer.expires = (unsigned long)expire_time;
+ add_timer(&t->tqent_timer);
+
+ ASSERT(!(t->tqent_flags & TQENT_FLAG_PREALLOC));
+
+ spin_unlock(&t->tqent_lock);
+out:
+ /* Spawn additional taskq threads if required. */
+ if (tq->tq_nactive == tq->tq_nthreads)
+ (void) taskq_thread_spawn(tq);
+ spin_unlock_irqrestore(&tq->tq_lock, irqflags);
+ return (rc);
+}
+EXPORT_SYMBOL(taskq_dispatch_delay);
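+
+/*
+ * Example (my_func and arg are hypothetical): run my_func(arg) roughly
+ * five seconds from now; expire_time is an absolute time in jiffies:
+ *
+ *	(void) taskq_dispatch_delay(tq, my_func, arg, TQ_SLEEP,
+ *	    jiffies + 5 * HZ);
+ */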
+
+void
+taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint_t flags,
+ taskq_ent_t *t)
+{
+ unsigned long irqflags;
+ ASSERT(tq);
+ ASSERT(func);
+
+ spin_lock_irqsave_nested(&tq->tq_lock, irqflags,
+ tq->tq_lock_class);
+
+ /* Taskq being destroyed and all tasks drained */
+ if (!(tq->tq_flags & TASKQ_ACTIVE)) {
+ t->tqent_id = TASKQID_INVALID;
+ goto out;
+ }
+
+ if ((flags & TQ_NOQUEUE) && (tq->tq_nactive == tq->tq_nthreads)) {
+ /* Dynamic taskq may be able to spawn another thread */
+ if (!(tq->tq_flags & TASKQ_DYNAMIC) ||
+ taskq_thread_spawn(tq) == 0)
+ goto out2;
+ flags |= TQ_FRONT;
+ }
+
+ spin_lock(&t->tqent_lock);
+
+ /*
+ * Make sure the entry is not on some other taskq; it is important to
+ * ASSERT() under lock
+ */
+ ASSERT(taskq_empty_ent(t));
+
+ /*
+ * Mark it as a prealloc'd task. This is important
+ * to ensure that we don't free it later.
+ */
+ t->tqent_flags |= TQENT_FLAG_PREALLOC;
+
+ /* Queue to the priority list instead of the pending list */
+ if (flags & TQ_FRONT)
+ list_add_tail(&t->tqent_list, &tq->tq_prio_list);
+ else
+ list_add_tail(&t->tqent_list, &tq->tq_pend_list);
+
+ t->tqent_id = tq->tq_next_id;
+ tq->tq_next_id++;
+ t->tqent_func = func;
+ t->tqent_arg = arg;
+ t->tqent_taskq = tq;
+ t->tqent_birth = jiffies;
+
+ spin_unlock(&t->tqent_lock);
+
+ wake_up(&tq->tq_work_waitq);
+out:
+ /* Spawn additional taskq threads if required. */
+ if (tq->tq_nactive == tq->tq_nthreads)
+ (void) taskq_thread_spawn(tq);
+out2:
+ spin_unlock_irqrestore(&tq->tq_lock, irqflags);
+}
+EXPORT_SYMBOL(taskq_dispatch_ent);
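+
+/*
+ * Example (my_func and arg are hypothetical): dispatch using a
+ * caller-provided entry so the dispatch path performs no allocation.
+ * The entry must remain valid until the task completes:
+ *
+ *	taskq_ent_t ent;
+ *	taskq_init_ent(&ent);
+ *	taskq_dispatch_ent(tq, my_func, arg, TQ_SLEEP, &ent);
+ */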
+
+int
+taskq_empty_ent(taskq_ent_t *t)
+{
+ return (list_empty(&t->tqent_list));
+}
+EXPORT_SYMBOL(taskq_empty_ent);
+
+void
+taskq_init_ent(taskq_ent_t *t)
+{
+ spin_lock_init(&t->tqent_lock);
+ init_waitqueue_head(&t->tqent_waitq);
+ timer_setup(&t->tqent_timer, NULL, 0);
+ INIT_LIST_HEAD(&t->tqent_list);
+ t->tqent_id = 0;
+ t->tqent_func = NULL;
+ t->tqent_arg = NULL;
+ t->tqent_flags = 0;
+ t->tqent_taskq = NULL;
+}
+EXPORT_SYMBOL(taskq_init_ent);
+
+/*
+ * Return the next pending task, preference is given to tasks on the
+ * priority list which were dispatched with TQ_FRONT.
+ */
+static taskq_ent_t *
+taskq_next_ent(taskq_t *tq)
+{
+ struct list_head *list;
+
+ if (!list_empty(&tq->tq_prio_list))
+ list = &tq->tq_prio_list;
+ else if (!list_empty(&tq->tq_pend_list))
+ list = &tq->tq_pend_list;
+ else
+ return (NULL);
+
+ return (list_entry(list->next, taskq_ent_t, tqent_list));
+}
+
+/*
+ * Spawns a new thread for the specified taskq.
+ */
+static void
+taskq_thread_spawn_task(void *arg)
+{
+ taskq_t *tq = (taskq_t *)arg;
+ unsigned long flags;
+
+ if (taskq_thread_create(tq) == NULL) {
+ /* restore spawning count if failed */
+ spin_lock_irqsave_nested(&tq->tq_lock, flags,
+ tq->tq_lock_class);
+ tq->tq_nspawn--;
+ spin_unlock_irqrestore(&tq->tq_lock, flags);
+ }
+}
+
+/*
+ * Spawn additional threads for dynamic taskqs (TASKQ_DYNAMIC) when the
+ * current number of threads is insufficient to handle the pending tasks. These
+ * new threads must be created by the dedicated dynamic_taskq to avoid
+ * deadlocks between thread creation and memory reclaim. The system_taskq
+ * which is also a dynamic taskq cannot be safely used for this.
+ */
+static int
+taskq_thread_spawn(taskq_t *tq)
+{
+ int spawning = 0;
+
+ if (!(tq->tq_flags & TASKQ_DYNAMIC))
+ return (0);
+
+ if ((tq->tq_nthreads + tq->tq_nspawn < tq->tq_maxthreads) &&
+ (tq->tq_flags & TASKQ_ACTIVE)) {
+ spawning = (++tq->tq_nspawn);
+ taskq_dispatch(dynamic_taskq, taskq_thread_spawn_task,
+ tq, TQ_NOSLEEP);
+ }
+
+ return (spawning);
+}
+
+/*
+ * Threads in a dynamic taskq should only exit once it has been completely
+ * drained and no other threads are actively servicing tasks. This prevents
+ * threads from being created and destroyed more than is required.
+ *
+ * The first thread in the thread list is treated as the primary thread.
+ * There is nothing special about the primary thread, but in order to keep
+ * the taskq pids from changing we opt to make it long running.
+ */
+static int
+taskq_thread_should_stop(taskq_t *tq, taskq_thread_t *tqt)
+{
+ if (!(tq->tq_flags & TASKQ_DYNAMIC))
+ return (0);
+
+ if (list_first_entry(&(tq->tq_thread_list), taskq_thread_t,
+ tqt_thread_list) == tqt)
+ return (0);
+
+ return
+ ((tq->tq_nspawn == 0) && /* No threads are being spawned */
+ (tq->tq_nactive == 0) && /* No threads are handling tasks */
+ (tq->tq_nthreads > 1) && /* More than 1 thread is running */
+ (!taskq_next_ent(tq)) && /* There are no pending tasks */
+ (spl_taskq_thread_dynamic)); /* Dynamic taskqs are allowed */
+}
+
+static int
+taskq_thread(void *args)
+{
+ DECLARE_WAITQUEUE(wait, current);
+ sigset_t blocked;
+ taskq_thread_t *tqt = args;
+ taskq_t *tq;
+ taskq_ent_t *t;
+ int seq_tasks = 0;
+ unsigned long flags;
+ taskq_ent_t dup_task = {};
+
+ ASSERT(tqt);
+ ASSERT(tqt->tqt_tq);
+ tq = tqt->tqt_tq;
+ current->flags |= PF_NOFREEZE;
+
+ (void) spl_fstrans_mark();
+
+ sigfillset(&blocked);
+ sigprocmask(SIG_BLOCK, &blocked, NULL);
+ flush_signals(current);
+ kfpu_initialize();
+
+ tsd_set(taskq_tsd, tq);
+ spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class);
+ /*
+ * If we are dynamically spawned, decrease spawning count. Note that
+ * we could be created during taskq_create, in which case we shouldn't
+ * do the decrement. But it's fine because taskq_create will reset
+ * tq_nspawn later.
+ */
+ if (tq->tq_flags & TASKQ_DYNAMIC)
+ tq->tq_nspawn--;
+
+ /* Immediately exit if more threads than allowed were created. */
+ if (tq->tq_nthreads >= tq->tq_maxthreads)
+ goto error;
+
+ tq->tq_nthreads++;
+ list_add_tail(&tqt->tqt_thread_list, &tq->tq_thread_list);
+ wake_up(&tq->tq_wait_waitq);
+ set_current_state(TASK_INTERRUPTIBLE);
+
+ while (!kthread_should_stop()) {
+
+ if (list_empty(&tq->tq_pend_list) &&
+ list_empty(&tq->tq_prio_list)) {
+
+ if (taskq_thread_should_stop(tq, tqt)) {
+ wake_up_all(&tq->tq_wait_waitq);
+ break;
+ }
+
+ add_wait_queue_exclusive(&tq->tq_work_waitq, &wait);
+ spin_unlock_irqrestore(&tq->tq_lock, flags);
+
+ schedule();
+ seq_tasks = 0;
+
+ spin_lock_irqsave_nested(&tq->tq_lock, flags,
+ tq->tq_lock_class);
+ remove_wait_queue(&tq->tq_work_waitq, &wait);
+ } else {
+ __set_current_state(TASK_RUNNING);
+ }
+
+ if ((t = taskq_next_ent(tq)) != NULL) {
+ list_del_init(&t->tqent_list);
+
+ /*
+ * A TQENT_FLAG_PREALLOC task may be reused or freed
+ * during the task function call. Store tqent_id and
+ * tqent_flags here.
+ *
+ * Also use an on stack taskq_ent_t for tqt_task
+ * assignment in this case. We only populate the two
+ * fields used by the only user in taskq proc file.
+ */
+ tqt->tqt_id = t->tqent_id;
+ tqt->tqt_flags = t->tqent_flags;
+
+ if (t->tqent_flags & TQENT_FLAG_PREALLOC) {
+ dup_task.tqent_func = t->tqent_func;
+ dup_task.tqent_arg = t->tqent_arg;
+ t = &dup_task;
+ }
+ tqt->tqt_task = t;
+
+ taskq_insert_in_order(tq, tqt);
+ tq->tq_nactive++;
+ spin_unlock_irqrestore(&tq->tq_lock, flags);
+
+ /* Perform the requested task */
+ t->tqent_func(t->tqent_arg);
+
+ spin_lock_irqsave_nested(&tq->tq_lock, flags,
+ tq->tq_lock_class);
+ tq->tq_nactive--;
+ list_del_init(&tqt->tqt_active_list);
+ tqt->tqt_task = NULL;
+
+ /* For prealloc'd tasks, we don't free anything. */
+ if (!(tqt->tqt_flags & TQENT_FLAG_PREALLOC))
+ task_done(tq, t);
+
+ /*
+ * When the current lowest outstanding taskqid is
+ * done calculate the new lowest outstanding id
+ */
+ if (tq->tq_lowest_id == tqt->tqt_id) {
+ tq->tq_lowest_id = taskq_lowest_id(tq);
+ ASSERT3S(tq->tq_lowest_id, >, tqt->tqt_id);
+ }
+
+ /* Spawn additional taskq threads if required. */
+ if ((++seq_tasks) > spl_taskq_thread_sequential &&
+ taskq_thread_spawn(tq))
+ seq_tasks = 0;
+
+ tqt->tqt_id = TASKQID_INVALID;
+ tqt->tqt_flags = 0;
+ wake_up_all(&tq->tq_wait_waitq);
+ } else {
+ if (taskq_thread_should_stop(tq, tqt))
+ break;
+ }
+
+		set_current_state(TASK_INTERRUPTIBLE);
+ }
+
+ __set_current_state(TASK_RUNNING);
+ tq->tq_nthreads--;
+ list_del_init(&tqt->tqt_thread_list);
+error:
+ kmem_free(tqt, sizeof (taskq_thread_t));
+ spin_unlock_irqrestore(&tq->tq_lock, flags);
+
+ tsd_set(taskq_tsd, NULL);
+
+ return (0);
+}
+
+static taskq_thread_t *
+taskq_thread_create(taskq_t *tq)
+{
+ static int last_used_cpu = 0;
+ taskq_thread_t *tqt;
+
+ tqt = kmem_alloc(sizeof (*tqt), KM_PUSHPAGE);
+ INIT_LIST_HEAD(&tqt->tqt_thread_list);
+ INIT_LIST_HEAD(&tqt->tqt_active_list);
+ tqt->tqt_tq = tq;
+ tqt->tqt_id = TASKQID_INVALID;
+
+ tqt->tqt_thread = spl_kthread_create(taskq_thread, tqt,
+ "%s", tq->tq_name);
+ if (tqt->tqt_thread == NULL) {
+ kmem_free(tqt, sizeof (taskq_thread_t));
+ return (NULL);
+ }
+
+ if (spl_taskq_thread_bind) {
+ last_used_cpu = (last_used_cpu + 1) % num_online_cpus();
+ kthread_bind(tqt->tqt_thread, last_used_cpu);
+ }
+
+ if (spl_taskq_thread_priority)
+ set_user_nice(tqt->tqt_thread, PRIO_TO_NICE(tq->tq_pri));
+
+ wake_up_process(tqt->tqt_thread);
+
+ return (tqt);
+}
+
+taskq_t *
+taskq_create(const char *name, int nthreads, pri_t pri,
+ int minalloc, int maxalloc, uint_t flags)
+{
+ taskq_t *tq;
+ taskq_thread_t *tqt;
+ int count = 0, rc = 0, i;
+ unsigned long irqflags;
+
+ ASSERT(name != NULL);
+ ASSERT(minalloc >= 0);
+ ASSERT(maxalloc <= INT_MAX);
+ ASSERT(!(flags & (TASKQ_CPR_SAFE))); /* Unsupported */
+
+ /* Scale the number of threads using nthreads as a percentage */
+ if (flags & TASKQ_THREADS_CPU_PCT) {
+ ASSERT(nthreads <= 100);
+ ASSERT(nthreads >= 0);
+ nthreads = MIN(nthreads, 100);
+ nthreads = MAX(nthreads, 0);
+ nthreads = MAX((num_online_cpus() * nthreads) / 100, 1);
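+		/* e.g. nthreads = 75 on an 8-CPU system yields 6 threads */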
+ }
+
+ tq = kmem_alloc(sizeof (*tq), KM_PUSHPAGE);
+ if (tq == NULL)
+ return (NULL);
+
+ spin_lock_init(&tq->tq_lock);
+ INIT_LIST_HEAD(&tq->tq_thread_list);
+ INIT_LIST_HEAD(&tq->tq_active_list);
+ tq->tq_name = strdup(name);
+ tq->tq_nactive = 0;
+ tq->tq_nthreads = 0;
+ tq->tq_nspawn = 0;
+ tq->tq_maxthreads = nthreads;
+ tq->tq_pri = pri;
+ tq->tq_minalloc = minalloc;
+ tq->tq_maxalloc = maxalloc;
+ tq->tq_nalloc = 0;
+ tq->tq_flags = (flags | TASKQ_ACTIVE);
+ tq->tq_next_id = TASKQID_INITIAL;
+ tq->tq_lowest_id = TASKQID_INITIAL;
+ INIT_LIST_HEAD(&tq->tq_free_list);
+ INIT_LIST_HEAD(&tq->tq_pend_list);
+ INIT_LIST_HEAD(&tq->tq_prio_list);
+ INIT_LIST_HEAD(&tq->tq_delay_list);
+ init_waitqueue_head(&tq->tq_work_waitq);
+ init_waitqueue_head(&tq->tq_wait_waitq);
+ tq->tq_lock_class = TQ_LOCK_GENERAL;
+ INIT_LIST_HEAD(&tq->tq_taskqs);
+
+ if (flags & TASKQ_PREPOPULATE) {
+ spin_lock_irqsave_nested(&tq->tq_lock, irqflags,
+ tq->tq_lock_class);
+
+ for (i = 0; i < minalloc; i++)
+ task_done(tq, task_alloc(tq, TQ_PUSHPAGE | TQ_NEW,
+ &irqflags));
+
+ spin_unlock_irqrestore(&tq->tq_lock, irqflags);
+ }
+
+ if ((flags & TASKQ_DYNAMIC) && spl_taskq_thread_dynamic)
+ nthreads = 1;
+
+ for (i = 0; i < nthreads; i++) {
+ tqt = taskq_thread_create(tq);
+ if (tqt == NULL)
+ rc = 1;
+ else
+ count++;
+ }
+
+ /* Wait for all threads to be started before potential destroy */
+ wait_event(tq->tq_wait_waitq, tq->tq_nthreads == count);
+ /*
+	 * taskq_thread may have decremented tq_nspawn, but these initial
+	 * threads are not dynamically spawned, so reset it to 0.
+ */
+ tq->tq_nspawn = 0;
+
+ if (rc) {
+ taskq_destroy(tq);
+ tq = NULL;
+ } else {
+ down_write(&tq_list_sem);
+ tq->tq_instance = taskq_find_by_name(name) + 1;
+ list_add_tail(&tq->tq_taskqs, &tq_list);
+ up_write(&tq_list_sem);
+ }
+
+ return (tq);
+}
+EXPORT_SYMBOL(taskq_create);
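+
+/*
+ * For example, a dynamic taskq of up to 8 threads with 32 pre-allocated
+ * task entries might be created as (a sketch; the name is illustrative):
+ *
+ *	taskq_t *tq = taskq_create("my_taskq", 8, maxclsyspri, 32,
+ *	    INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
+ */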
+
+void
+taskq_destroy(taskq_t *tq)
+{
+ struct task_struct *thread;
+ taskq_thread_t *tqt;
+ taskq_ent_t *t;
+ unsigned long flags;
+
+ ASSERT(tq);
+ spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class);
+ tq->tq_flags &= ~TASKQ_ACTIVE;
+ spin_unlock_irqrestore(&tq->tq_lock, flags);
+
+ /*
+ * When TASKQ_ACTIVE is clear new tasks may not be added nor may
+ * new worker threads be spawned for dynamic taskq.
+ */
+ if (dynamic_taskq != NULL)
+ taskq_wait_outstanding(dynamic_taskq, 0);
+
+ taskq_wait(tq);
+
+ /* remove taskq from global list used by the kstats */
+ down_write(&tq_list_sem);
+ list_del(&tq->tq_taskqs);
+ up_write(&tq_list_sem);
+
+ spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class);
+ /* wait for spawning threads to insert themselves to the list */
+ while (tq->tq_nspawn) {
+ spin_unlock_irqrestore(&tq->tq_lock, flags);
+ schedule_timeout_interruptible(1);
+ spin_lock_irqsave_nested(&tq->tq_lock, flags,
+ tq->tq_lock_class);
+ }
+
+ /*
+ * Signal each thread to exit and block until it does. Each thread
+ * is responsible for removing itself from the list and freeing its
+ * taskq_thread_t. This allows for idle threads to opt to remove
+ * themselves from the taskq. They can be recreated as needed.
+ */
+ while (!list_empty(&tq->tq_thread_list)) {
+ tqt = list_entry(tq->tq_thread_list.next,
+ taskq_thread_t, tqt_thread_list);
+ thread = tqt->tqt_thread;
+ spin_unlock_irqrestore(&tq->tq_lock, flags);
+
+ kthread_stop(thread);
+
+ spin_lock_irqsave_nested(&tq->tq_lock, flags,
+ tq->tq_lock_class);
+ }
+
+ while (!list_empty(&tq->tq_free_list)) {
+ t = list_entry(tq->tq_free_list.next, taskq_ent_t, tqent_list);
+
+ ASSERT(!(t->tqent_flags & TQENT_FLAG_PREALLOC));
+
+ list_del_init(&t->tqent_list);
+ task_free(tq, t);
+ }
+
+ ASSERT0(tq->tq_nthreads);
+ ASSERT0(tq->tq_nalloc);
+ ASSERT0(tq->tq_nspawn);
+ ASSERT(list_empty(&tq->tq_thread_list));
+ ASSERT(list_empty(&tq->tq_active_list));
+ ASSERT(list_empty(&tq->tq_free_list));
+ ASSERT(list_empty(&tq->tq_pend_list));
+ ASSERT(list_empty(&tq->tq_prio_list));
+ ASSERT(list_empty(&tq->tq_delay_list));
+
+ spin_unlock_irqrestore(&tq->tq_lock, flags);
+
+ strfree(tq->tq_name);
+ kmem_free(tq, sizeof (taskq_t));
+}
+EXPORT_SYMBOL(taskq_destroy);
+
+static unsigned int spl_taskq_kick = 0;
+
+/*
+ * 2.6.36 API Change
+ * module_param_cb was introduced to take kernel_param_ops and
+ * module_param_call was marked as obsolete. Also set and get operations
+ * were changed to take a 'const struct kernel_param *'.
+ */
+static int
+#ifdef module_param_cb
+param_set_taskq_kick(const char *val, const struct kernel_param *kp)
+#else
+param_set_taskq_kick(const char *val, struct kernel_param *kp)
+#endif
+{
+ int ret;
+ taskq_t *tq;
+ taskq_ent_t *t;
+ unsigned long flags;
+
+ ret = param_set_uint(val, kp);
+ if (ret < 0 || !spl_taskq_kick)
+ return (ret);
+ /* reset value */
+ spl_taskq_kick = 0;
+
+ down_read(&tq_list_sem);
+ list_for_each_entry(tq, &tq_list, tq_taskqs) {
+ spin_lock_irqsave_nested(&tq->tq_lock, flags,
+ tq->tq_lock_class);
+ /* Check if the first pending is older than 5 seconds */
+ t = taskq_next_ent(tq);
+ if (t && time_after(jiffies, t->tqent_birth + 5*HZ)) {
+ (void) taskq_thread_spawn(tq);
+ printk(KERN_INFO "spl: Kicked taskq %s/%d\n",
+ tq->tq_name, tq->tq_instance);
+ }
+ spin_unlock_irqrestore(&tq->tq_lock, flags);
+ }
+ up_read(&tq_list_sem);
+ return (ret);
+}
+
+#ifdef module_param_cb
+static const struct kernel_param_ops param_ops_taskq_kick = {
+ .set = param_set_taskq_kick,
+ .get = param_get_uint,
+};
+module_param_cb(spl_taskq_kick, &param_ops_taskq_kick, &spl_taskq_kick, 0644);
+#else
+module_param_call(spl_taskq_kick, param_set_taskq_kick, param_get_uint,
+ &spl_taskq_kick, 0644);
+#endif
+MODULE_PARM_DESC(spl_taskq_kick,
+ "Write nonzero to kick stuck taskqs to spawn more threads");
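+
+/*
+ * For example, from user space (assuming the module is loaded as "spl"):
+ *
+ *	echo 1 > /sys/module/spl/parameters/spl_taskq_kick
+ */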
+
+int
+spl_taskq_init(void)
+{
+ init_rwsem(&tq_list_sem);
+ tsd_create(&taskq_tsd, NULL);
+
+ system_taskq = taskq_create("spl_system_taskq", MAX(boot_ncpus, 64),
+ maxclsyspri, boot_ncpus, INT_MAX, TASKQ_PREPOPULATE|TASKQ_DYNAMIC);
+ if (system_taskq == NULL)
+ return (1);
+
+ system_delay_taskq = taskq_create("spl_delay_taskq", MAX(boot_ncpus, 4),
+ maxclsyspri, boot_ncpus, INT_MAX, TASKQ_PREPOPULATE|TASKQ_DYNAMIC);
+ if (system_delay_taskq == NULL) {
+ taskq_destroy(system_taskq);
+ return (1);
+ }
+
+ dynamic_taskq = taskq_create("spl_dynamic_taskq", 1,
+ maxclsyspri, boot_ncpus, INT_MAX, TASKQ_PREPOPULATE);
+ if (dynamic_taskq == NULL) {
+ taskq_destroy(system_taskq);
+ taskq_destroy(system_delay_taskq);
+ return (1);
+ }
+
+ /*
+ * This is used to annotate tq_lock, so
+ * taskq_dispatch -> taskq_thread_spawn -> taskq_dispatch
+ * does not trigger a lockdep warning re: possible recursive locking
+ */
+ dynamic_taskq->tq_lock_class = TQ_LOCK_DYNAMIC;
+
+ return (0);
+}
+
+void
+spl_taskq_fini(void)
+{
+ taskq_destroy(dynamic_taskq);
+ dynamic_taskq = NULL;
+
+ taskq_destroy(system_delay_taskq);
+ system_delay_taskq = NULL;
+
+ taskq_destroy(system_taskq);
+ system_taskq = NULL;
+
+ tsd_destroy(&taskq_tsd);
+}
diff --git a/module/os/linux/spl/spl-thread.c b/module/os/linux/spl/spl-thread.c
new file mode 100644
index 000000000..29de9252a
--- /dev/null
+++ b/module/os/linux/spl/spl-thread.c
@@ -0,0 +1,163 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <[email protected]>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Solaris Porting Layer (SPL) Thread Implementation.
+ */
+
+#include <sys/thread.h>
+#include <sys/kmem.h>
+#include <sys/tsd.h>
+#include <sys/simd.h>
+
+/*
+ * Thread interfaces
+ */
+typedef struct thread_priv_s {
+ unsigned long tp_magic; /* Magic */
+ int tp_name_size; /* Name size */
+ char *tp_name; /* Name (without _thread suffix) */
+ void (*tp_func)(void *); /* Registered function */
+ void *tp_args; /* Args to be passed to function */
+ size_t tp_len; /* Len to be passed to function */
+ int tp_state; /* State to start thread at */
+	pri_t tp_pri; /* Priority to start thread at */
+} thread_priv_t;
+
+static int
+thread_generic_wrapper(void *arg)
+{
+ thread_priv_t *tp = (thread_priv_t *)arg;
+ void (*func)(void *);
+ void *args;
+
+ ASSERT(tp->tp_magic == TP_MAGIC);
+ func = tp->tp_func;
+ args = tp->tp_args;
+ set_current_state(tp->tp_state);
+ set_user_nice((kthread_t *)current, PRIO_TO_NICE(tp->tp_pri));
+ kfpu_initialize();
+ kmem_free(tp->tp_name, tp->tp_name_size);
+ kmem_free(tp, sizeof (thread_priv_t));
+
+ if (func)
+ func(args);
+
+ return (0);
+}
+
+void
+__thread_exit(void)
+{
+ tsd_exit();
+ complete_and_exit(NULL, 0);
+ /* Unreachable */
+}
+EXPORT_SYMBOL(__thread_exit);
+
+/*
+ * thread_create() may block forever if it cannot create a thread or
+ * allocate memory. This is preferable to returning a NULL which
+ * Solaris-style callers likely never check for... since it can't fail.
+ */
+kthread_t *
+__thread_create(caddr_t stk, size_t stksize, thread_func_t func,
+ const char *name, void *args, size_t len, proc_t *pp, int state, pri_t pri)
+{
+ thread_priv_t *tp;
+ struct task_struct *tsk;
+ char *p;
+
+ /* Option pp is simply ignored */
+ /* Variable stack size unsupported */
+ ASSERT(stk == NULL);
+
+ tp = kmem_alloc(sizeof (thread_priv_t), KM_PUSHPAGE);
+ if (tp == NULL)
+ return (NULL);
+
+ tp->tp_magic = TP_MAGIC;
+ tp->tp_name_size = strlen(name) + 1;
+
+ tp->tp_name = kmem_alloc(tp->tp_name_size, KM_PUSHPAGE);
+ if (tp->tp_name == NULL) {
+ kmem_free(tp, sizeof (thread_priv_t));
+ return (NULL);
+ }
+
+ strncpy(tp->tp_name, name, tp->tp_name_size);
+
+ /*
+ * Strip trailing "_thread" from passed name which will be the func
+ * name since the exposed API has no parameter for passing a name.
+ */
+ p = strstr(tp->tp_name, "_thread");
+ if (p)
+ p[0] = '\0';
+
+ tp->tp_func = func;
+ tp->tp_args = args;
+ tp->tp_len = len;
+ tp->tp_state = state;
+ tp->tp_pri = pri;
+
+ tsk = spl_kthread_create(thread_generic_wrapper, (void *)tp,
+ "%s", tp->tp_name);
+ if (IS_ERR(tsk))
+ return (NULL);
+
+ wake_up_process(tsk);
+ return ((kthread_t *)tsk);
+}
+EXPORT_SYMBOL(__thread_create);
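+
+/*
+ * Callers normally reach __thread_create() through the thread_create()
+ * macro from sys/thread.h. A sketch (my_func_thread and arg are
+ * hypothetical):
+ *
+ *	kthread_t *t = thread_create(NULL, 0, my_func_thread, arg, 0,
+ *	    &p0, TS_RUN, defclsyspri);
+ */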
+
+/*
+ * spl_kthread_create - Wrapper providing pre-3.13 semantics for
+ * kthread_create() in which it is not killable and less likely
+ * to return -ENOMEM.
+ */
+struct task_struct *
+spl_kthread_create(int (*func)(void *), void *data, const char namefmt[], ...)
+{
+ struct task_struct *tsk;
+ va_list args;
+ char name[TASK_COMM_LEN];
+
+ va_start(args, namefmt);
+ vsnprintf(name, sizeof (name), namefmt, args);
+ va_end(args);
+ do {
+ tsk = kthread_create(func, data, "%s", name);
+ if (IS_ERR(tsk)) {
+ if (signal_pending(current)) {
+ clear_thread_flag(TIF_SIGPENDING);
+ continue;
+ }
+ if (PTR_ERR(tsk) == -ENOMEM)
+ continue;
+ return (NULL);
+ } else {
+ return (tsk);
+ }
+ } while (1);
+}
+EXPORT_SYMBOL(spl_kthread_create);
diff --git a/module/os/linux/spl/spl-tsd.c b/module/os/linux/spl/spl-tsd.c
new file mode 100644
index 000000000..14342d5a6
--- /dev/null
+++ b/module/os/linux/spl/spl-tsd.c
@@ -0,0 +1,720 @@
+/*
+ * Copyright (C) 2010 Lawrence Livermore National Security, LLC.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <[email protected]>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Solaris Porting Layer (SPL) Thread Specific Data Implementation.
+ *
+ * Thread specific data is implemented using a hash table, which avoids
+ * the need to add a member to the task structure and allows maximum
+ * portability between kernels. This implementation has been optimized
+ * to keep the tsd_set() and tsd_get() times as small as possible.
+ *
+ * The majority of the entries in the hash table are for specific tsd
+ * entries. These entries are hashed by the product of their key and
+ * pid because by design the key and pid are guaranteed to be unique.
+ * Their product also has the desirable property that it will be uniformly
+ * distributed over the hash bins provided neither the pid nor key is zero.
+ * Under Linux the zero pid is reserved for the idle task and thus won't be
+ * used, and this implementation is careful never to assign a zero key.
+ * By default the hash table is sized to 512 bins which is expected to
+ * be sufficient for light to moderate usage of thread specific data.
+ *
+ * The hash table contains two additional types of entries. The first
+ * type is called a 'key' entry and it is added to the hash during
+ * tsd_create(). It is used to store the address of the destructor function
+ * and it is used as an anchor point. All tsd entries which use the same
+ * key will be linked to this entry. This is used during tsd_destroy() to
+ * quickly call the destructor function for all tsd associated with the key.
+ * The 'key' entry may be looked up with tsd_hash_search() by passing the
+ * key you wish to lookup and DTOR_PID constant as the pid.
+ *
+ * The second type of entry is called a 'pid' entry and it is added to the
+ * hash the first time a process sets a key. The 'pid' entry is also used
+ * as an anchor and all tsd for the process will be linked to it. This
+ * list is used during tsd_exit() to ensure all registered destructors
+ * are run for the process. The 'pid' entry may be looked up with
+ * tsd_hash_search() by passing the PID_KEY constant as the key, and
+ * the process pid. Note that tsd_exit() is called by thread_exit()
+ * so if you're using the Solaris thread API you should not need to call
+ * tsd_exit() directly.
+ *
+ */
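+
+/*
+ * Typical life cycle (a sketch; my_key and my_dtor are hypothetical):
+ *
+ *	static uint_t my_key;
+ *
+ *	tsd_create(&my_key, my_dtor);	(register key and destructor)
+ *	tsd_set(my_key, value);		(set this thread's value)
+ *	void *v = tsd_get(my_key);	(fetch this thread's value)
+ *	tsd_destroy(&my_key);		(run destructors, drop all entries)
+ */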
+
+#include <sys/kmem.h>
+#include <sys/thread.h>
+#include <sys/tsd.h>
+#include <linux/hash.h>
+
+typedef struct tsd_hash_bin {
+ spinlock_t hb_lock;
+ struct hlist_head hb_head;
+} tsd_hash_bin_t;
+
+typedef struct tsd_hash_table {
+ spinlock_t ht_lock;
+ uint_t ht_bits;
+ uint_t ht_key;
+ tsd_hash_bin_t *ht_bins;
+} tsd_hash_table_t;
+
+typedef struct tsd_hash_entry {
+ uint_t he_key;
+ pid_t he_pid;
+ dtor_func_t he_dtor;
+ void *he_value;
+ struct hlist_node he_list;
+ struct list_head he_key_list;
+ struct list_head he_pid_list;
+} tsd_hash_entry_t;
+
+static tsd_hash_table_t *tsd_hash_table = NULL;
+
+/*
+ * tsd_hash_search - searches hash table for tsd_hash_entry
+ * @table: hash table
+ * @key: search key
+ * @pid: search pid
+ */
+static tsd_hash_entry_t *
+tsd_hash_search(tsd_hash_table_t *table, uint_t key, pid_t pid)
+{
+ struct hlist_node *node;
+ tsd_hash_entry_t *entry;
+ tsd_hash_bin_t *bin;
+ ulong_t hash;
+
+ hash = hash_long((ulong_t)key * (ulong_t)pid, table->ht_bits);
+ bin = &table->ht_bins[hash];
+ spin_lock(&bin->hb_lock);
+ hlist_for_each(node, &bin->hb_head) {
+ entry = list_entry(node, tsd_hash_entry_t, he_list);
+ if ((entry->he_key == key) && (entry->he_pid == pid)) {
+ spin_unlock(&bin->hb_lock);
+ return (entry);
+ }
+ }
+
+ spin_unlock(&bin->hb_lock);
+ return (NULL);
+}
+
+/*
+ * tsd_hash_dtor - call the destructor and free all entries on the list
+ * @work: list of hash entries
+ *
+ * For a list of entries which have all already been removed from the
+ * hash call their registered destructor then free the associated memory.
+ */
+static void
+tsd_hash_dtor(struct hlist_head *work)
+{
+ tsd_hash_entry_t *entry;
+
+ while (!hlist_empty(work)) {
+ entry = hlist_entry(work->first, tsd_hash_entry_t, he_list);
+ hlist_del(&entry->he_list);
+
+ if (entry->he_dtor && entry->he_pid != DTOR_PID)
+ entry->he_dtor(entry->he_value);
+
+ kmem_free(entry, sizeof (tsd_hash_entry_t));
+ }
+}
+
+/*
+ * tsd_hash_add - adds an entry to hash table
+ * @table: hash table
+ * @key: search key
+ * @pid: search pid
+ *
+ * The caller is responsible for ensuring the unique key/pid do not
+ * already exist in the hash table. This is possible because all entries
+ * are thread specific, thus a concurrent thread will never attempt to
+ * add this key/pid. Because multiple bins must be checked to add
+ * links to the dtor and pid entries the entire table is locked.
+ */
+static int
+tsd_hash_add(tsd_hash_table_t *table, uint_t key, pid_t pid, void *value)
+{
+ tsd_hash_entry_t *entry, *dtor_entry, *pid_entry;
+ tsd_hash_bin_t *bin;
+ ulong_t hash;
+ int rc = 0;
+
+ ASSERT3P(tsd_hash_search(table, key, pid), ==, NULL);
+
+ /* New entry allocate structure, set value, and add to hash */
+ entry = kmem_alloc(sizeof (tsd_hash_entry_t), KM_PUSHPAGE);
+ if (entry == NULL)
+ return (ENOMEM);
+
+ entry->he_key = key;
+ entry->he_pid = pid;
+ entry->he_value = value;
+ INIT_HLIST_NODE(&entry->he_list);
+ INIT_LIST_HEAD(&entry->he_key_list);
+ INIT_LIST_HEAD(&entry->he_pid_list);
+
+ spin_lock(&table->ht_lock);
+
+ /* Destructor entry must exist for all valid keys */
+ dtor_entry = tsd_hash_search(table, entry->he_key, DTOR_PID);
+ ASSERT3P(dtor_entry, !=, NULL);
+ entry->he_dtor = dtor_entry->he_dtor;
+
+ /* Process entry must exist for all valid processes */
+ pid_entry = tsd_hash_search(table, PID_KEY, entry->he_pid);
+ ASSERT3P(pid_entry, !=, NULL);
+
+ hash = hash_long((ulong_t)key * (ulong_t)pid, table->ht_bits);
+ bin = &table->ht_bins[hash];
+ spin_lock(&bin->hb_lock);
+
+ /* Add to the hash, key, and pid lists */
+ hlist_add_head(&entry->he_list, &bin->hb_head);
+ list_add(&entry->he_key_list, &dtor_entry->he_key_list);
+ list_add(&entry->he_pid_list, &pid_entry->he_pid_list);
+
+ spin_unlock(&bin->hb_lock);
+ spin_unlock(&table->ht_lock);
+
+ return (rc);
+}
+
+/*
+ * tsd_hash_add_key - adds a destructor entry to the hash table
+ * @table: hash table
+ * @keyp: search key
+ * @dtor: key destructor
+ *
+ * For every unique key there is a single entry in the hash which is used
+ * as anchor. All other thread specific entries for this key are linked
+ * to this anchor via the 'he_key_list' list head. On return the keyp
+ * will be set to the next available key for the hash table.
+ */
+static int
+tsd_hash_add_key(tsd_hash_table_t *table, uint_t *keyp, dtor_func_t dtor)
+{
+ tsd_hash_entry_t *tmp_entry, *entry;
+ tsd_hash_bin_t *bin;
+ ulong_t hash;
+ int keys_checked = 0;
+
+ ASSERT3P(table, !=, NULL);
+
+ /* Allocate entry to be used as a destructor for this key */
+ entry = kmem_alloc(sizeof (tsd_hash_entry_t), KM_PUSHPAGE);
+ if (entry == NULL)
+ return (ENOMEM);
+
+ /* Determine next available key value */
+ spin_lock(&table->ht_lock);
+ do {
+ /* Limited to TSD_KEYS_MAX concurrent unique keys */
+ if (table->ht_key++ > TSD_KEYS_MAX)
+ table->ht_key = 1;
+
+ /* Ensure failure when all TSD_KEYS_MAX keys are in use */
+ if (keys_checked++ >= TSD_KEYS_MAX) {
+ spin_unlock(&table->ht_lock);
+ return (ENOENT);
+ }
+
+ tmp_entry = tsd_hash_search(table, table->ht_key, DTOR_PID);
+ } while (tmp_entry);
+
+ /* Add destructor entry in to hash table */
+ entry->he_key = *keyp = table->ht_key;
+ entry->he_pid = DTOR_PID;
+ entry->he_dtor = dtor;
+ entry->he_value = NULL;
+ INIT_HLIST_NODE(&entry->he_list);
+ INIT_LIST_HEAD(&entry->he_key_list);
+ INIT_LIST_HEAD(&entry->he_pid_list);
+
+ hash = hash_long((ulong_t)*keyp * (ulong_t)DTOR_PID, table->ht_bits);
+ bin = &table->ht_bins[hash];
+ spin_lock(&bin->hb_lock);
+
+ hlist_add_head(&entry->he_list, &bin->hb_head);
+
+ spin_unlock(&bin->hb_lock);
+ spin_unlock(&table->ht_lock);
+
+ return (0);
+}
+
+/*
+ * tsd_hash_add_pid - adds a process entry to the hash table
+ * @table: hash table
+ * @pid: search pid
+ *
+ * For every process there is a single entry in the hash which is used
+ * as anchor. All other thread specific entries for this process are
+ * linked to this anchor via the 'he_pid_list' list head.
+ */
+static int
+tsd_hash_add_pid(tsd_hash_table_t *table, pid_t pid)
+{
+ tsd_hash_entry_t *entry;
+ tsd_hash_bin_t *bin;
+ ulong_t hash;
+
+ /* Allocate entry to be used as the process reference */
+ entry = kmem_alloc(sizeof (tsd_hash_entry_t), KM_PUSHPAGE);
+ if (entry == NULL)
+ return (ENOMEM);
+
+ spin_lock(&table->ht_lock);
+ entry->he_key = PID_KEY;
+ entry->he_pid = pid;
+ entry->he_dtor = NULL;
+ entry->he_value = NULL;
+ INIT_HLIST_NODE(&entry->he_list);
+ INIT_LIST_HEAD(&entry->he_key_list);
+ INIT_LIST_HEAD(&entry->he_pid_list);
+
+ hash = hash_long((ulong_t)PID_KEY * (ulong_t)pid, table->ht_bits);
+ bin = &table->ht_bins[hash];
+ spin_lock(&bin->hb_lock);
+
+ hlist_add_head(&entry->he_list, &bin->hb_head);
+
+ spin_unlock(&bin->hb_lock);
+ spin_unlock(&table->ht_lock);
+
+ return (0);
+}
+
+/*
+ * tsd_hash_del - delete an entry from the hash table, key, and pid lists
+ * @table: hash table
+ * @entry: hash entry to remove
+ */
+static void
+tsd_hash_del(tsd_hash_table_t *table, tsd_hash_entry_t *entry)
+{
+ hlist_del(&entry->he_list);
+ list_del_init(&entry->he_key_list);
+ list_del_init(&entry->he_pid_list);
+}
+
+/*
+ * tsd_hash_table_init - allocate a hash table
+ * @bits: hash table size
+ *
+ * A hash table with 2^bits bins will be created. It may not be resized
+ * after the fact and must be freed with tsd_hash_table_fini().
+ */
+static tsd_hash_table_t *
+tsd_hash_table_init(uint_t bits)
+{
+ tsd_hash_table_t *table;
+ int hash, size = (1 << bits);
+
+ table = kmem_zalloc(sizeof (tsd_hash_table_t), KM_SLEEP);
+ if (table == NULL)
+ return (NULL);
+
+ table->ht_bins = kmem_zalloc(sizeof (tsd_hash_bin_t) * size, KM_SLEEP);
+ if (table->ht_bins == NULL) {
+ kmem_free(table, sizeof (tsd_hash_table_t));
+ return (NULL);
+ }
+
+ for (hash = 0; hash < size; hash++) {
+ spin_lock_init(&table->ht_bins[hash].hb_lock);
+ INIT_HLIST_HEAD(&table->ht_bins[hash].hb_head);
+ }
+
+ spin_lock_init(&table->ht_lock);
+ table->ht_bits = bits;
+ table->ht_key = 1;
+
+ return (table);
+}
+
+/*
+ * tsd_hash_table_fini - free a hash table
+ * @table: hash table
+ *
+ * Free a hash table allocated by tsd_hash_table_init(). If the hash
+ * table is not empty this function will call the proper destructor for
+ * all remaining entries before freeing the memory used by those entries.
+ */
+static void
+tsd_hash_table_fini(tsd_hash_table_t *table)
+{
+ HLIST_HEAD(work);
+ tsd_hash_bin_t *bin;
+ tsd_hash_entry_t *entry;
+ int size, i;
+
+ ASSERT3P(table, !=, NULL);
+ spin_lock(&table->ht_lock);
+ for (i = 0, size = (1 << table->ht_bits); i < size; i++) {
+ bin = &table->ht_bins[i];
+ spin_lock(&bin->hb_lock);
+ while (!hlist_empty(&bin->hb_head)) {
+ entry = hlist_entry(bin->hb_head.first,
+ tsd_hash_entry_t, he_list);
+ tsd_hash_del(table, entry);
+ hlist_add_head(&entry->he_list, &work);
+ }
+ spin_unlock(&bin->hb_lock);
+ }
+ spin_unlock(&table->ht_lock);
+
+ tsd_hash_dtor(&work);
+ kmem_free(table->ht_bins, sizeof (tsd_hash_bin_t)*(1<<table->ht_bits));
+ kmem_free(table, sizeof (tsd_hash_table_t));
+}
+
+/*
+ * tsd_remove_entry - remove a tsd entry for this thread
+ * @entry: entry to remove
+ *
+ * Remove the thread specific data @entry for this thread.
+ * If this is the last entry for this thread, also remove the PID entry.
+ */
+static void
+tsd_remove_entry(tsd_hash_entry_t *entry)
+{
+ HLIST_HEAD(work);
+ tsd_hash_table_t *table;
+ tsd_hash_entry_t *pid_entry;
+ tsd_hash_bin_t *pid_entry_bin, *entry_bin;
+ ulong_t hash;
+
+ table = tsd_hash_table;
+ ASSERT3P(table, !=, NULL);
+ ASSERT3P(entry, !=, NULL);
+
+ spin_lock(&table->ht_lock);
+
+ hash = hash_long((ulong_t)entry->he_key *
+ (ulong_t)entry->he_pid, table->ht_bits);
+ entry_bin = &table->ht_bins[hash];
+
+ /* save the possible pid_entry */
+ pid_entry = list_entry(entry->he_pid_list.next, tsd_hash_entry_t,
+ he_pid_list);
+
+ /* remove entry */
+ spin_lock(&entry_bin->hb_lock);
+ tsd_hash_del(table, entry);
+ hlist_add_head(&entry->he_list, &work);
+ spin_unlock(&entry_bin->hb_lock);
+
+	/* if the saved entry is the pid anchor and now empty, remove it too */
+ if (pid_entry->he_key == PID_KEY &&
+ list_empty(&pid_entry->he_pid_list)) {
+ hash = hash_long((ulong_t)pid_entry->he_key *
+ (ulong_t)pid_entry->he_pid, table->ht_bits);
+ pid_entry_bin = &table->ht_bins[hash];
+
+ spin_lock(&pid_entry_bin->hb_lock);
+ tsd_hash_del(table, pid_entry);
+ hlist_add_head(&pid_entry->he_list, &work);
+ spin_unlock(&pid_entry_bin->hb_lock);
+ }
+
+ spin_unlock(&table->ht_lock);
+
+ tsd_hash_dtor(&work);
+}
+
+/*
+ * tsd_set - set thread specific data
+ * @key: lookup key
+ * @value: value to set
+ *
+ * Caller must prevent racing tsd_create() or tsd_destroy(); it is protected
+ * from racing tsd_get() or tsd_set() because the data is thread specific.
+ * This function has been optimized to be fast for the update case.
+ * When setting the tsd initially it will be slower due to additional
+ * required locking and potential memory allocations.
+ */
+int
+tsd_set(uint_t key, void *value)
+{
+ tsd_hash_table_t *table;
+ tsd_hash_entry_t *entry;
+ pid_t pid;
+ int rc;
+ /* mark remove if value is NULL */
+ boolean_t remove = (value == NULL);
+
+ table = tsd_hash_table;
+ pid = curthread->pid;
+ ASSERT3P(table, !=, NULL);
+
+ if ((key == 0) || (key > TSD_KEYS_MAX))
+ return (EINVAL);
+
+	/* Entry already exists in the hash table; update its value */
+ entry = tsd_hash_search(table, key, pid);
+ if (entry) {
+ entry->he_value = value;
+ /* remove the entry */
+ if (remove)
+ tsd_remove_entry(entry);
+ return (0);
+ }
+
+ /* don't create entry if value is NULL */
+ if (remove)
+ return (0);
+
+	/* Add a process entry to the hash if one does not yet exist */
+ entry = tsd_hash_search(table, PID_KEY, pid);
+ if (entry == NULL) {
+ rc = tsd_hash_add_pid(table, pid);
+ if (rc)
+ return (rc);
+ }
+
+ rc = tsd_hash_add(table, key, pid, value);
+ return (rc);
+}
+EXPORT_SYMBOL(tsd_set);
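Because a NULL value removes the entry rather than storing NULL, clearing a key for the current thread is just another tsd_set() call. An illustrative sketch (key and ptr are hypothetical):

	VERIFY0(tsd_set(key, ptr));	/* create or update the entry */
	VERIFY0(tsd_set(key, NULL));	/* remove it (and the pid anchor if last) */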
+
+/*
+ * tsd_get - get thread specific data
+ * @key: lookup key
+ *
+ * Caller must prevent racing tsd_create() or tsd_destroy(). This
+ * implementation is designed to be fast and scalable; it does not
+ * lock the entire table, only a single hash bin.
+ */
+void *
+tsd_get(uint_t key)
+{
+ tsd_hash_entry_t *entry;
+
+ ASSERT3P(tsd_hash_table, !=, NULL);
+
+ if ((key == 0) || (key > TSD_KEYS_MAX))
+ return (NULL);
+
+ entry = tsd_hash_search(tsd_hash_table, key, curthread->pid);
+ if (entry == NULL)
+ return (NULL);
+
+ return (entry->he_value);
+}
+EXPORT_SYMBOL(tsd_get);
+
+/*
+ * tsd_get_by_thread - get thread specific data for specified thread
+ * @key: lookup key
+ * @thread: thread to lookup
+ *
+ * Caller must prevent racing tsd_create() or tsd_destroy(). This
+ * implementation is designed to be fast and scalable; it does not
+ * lock the entire table, only a single hash bin.
+ */
+void *
+tsd_get_by_thread(uint_t key, kthread_t *thread)
+{
+ tsd_hash_entry_t *entry;
+
+ ASSERT3P(tsd_hash_table, !=, NULL);
+
+ if ((key == 0) || (key > TSD_KEYS_MAX))
+ return (NULL);
+
+ entry = tsd_hash_search(tsd_hash_table, key, thread->pid);
+ if (entry == NULL)
+ return (NULL);
+
+ return (entry->he_value);
+}
+EXPORT_SYMBOL(tsd_get_by_thread);
+
+/*
+ * tsd_create - create thread specific data key
+ * @keyp: lookup key address
+ * @dtor: destructor called during tsd_destroy() or tsd_exit()
+ *
+ * The provided key must be set to 0 or it is assumed to be already in use.
+ * The dtor is allowed to be NULL in which case no additional cleanup
+ * for the data is performed during tsd_destroy() or tsd_exit().
+ *
+ * Caller must prevent racing tsd_set() or tsd_get(), this function is
+ * safe from racing tsd_create(), tsd_destroy(), and tsd_exit().
+ */
+void
+tsd_create(uint_t *keyp, dtor_func_t dtor)
+{
+ ASSERT3P(keyp, !=, NULL);
+ if (*keyp)
+ return;
+
+ (void) tsd_hash_add_key(tsd_hash_table, keyp, dtor);
+}
+EXPORT_SYMBOL(tsd_create);
+
+/*
+ * tsd_destroy - destroy thread specific data
+ * @keyp: lookup key address
+ *
+ * Destroys the thread specific data on all threads which use this key.
+ *
+ * Caller must prevent racing tsd_set() or tsd_get(), this function is
+ * safe from racing tsd_create(), tsd_destroy(), and tsd_exit().
+ */
+void
+tsd_destroy(uint_t *keyp)
+{
+ HLIST_HEAD(work);
+ tsd_hash_table_t *table;
+ tsd_hash_entry_t *dtor_entry, *entry;
+ tsd_hash_bin_t *dtor_entry_bin, *entry_bin;
+ ulong_t hash;
+
+ table = tsd_hash_table;
+ ASSERT3P(table, !=, NULL);
+
+ spin_lock(&table->ht_lock);
+ dtor_entry = tsd_hash_search(table, *keyp, DTOR_PID);
+ if (dtor_entry == NULL) {
+ spin_unlock(&table->ht_lock);
+ return;
+ }
+
+ /*
+ * All threads which use this key must be linked off of the
+ * DTOR_PID entry. They are removed from the hash table and
+ * linked in to a private working list to be destroyed.
+ */
+ while (!list_empty(&dtor_entry->he_key_list)) {
+ entry = list_entry(dtor_entry->he_key_list.next,
+ tsd_hash_entry_t, he_key_list);
+ ASSERT3U(dtor_entry->he_key, ==, entry->he_key);
+ ASSERT3P(dtor_entry->he_dtor, ==, entry->he_dtor);
+
+ hash = hash_long((ulong_t)entry->he_key *
+ (ulong_t)entry->he_pid, table->ht_bits);
+ entry_bin = &table->ht_bins[hash];
+
+ spin_lock(&entry_bin->hb_lock);
+ tsd_hash_del(table, entry);
+ hlist_add_head(&entry->he_list, &work);
+ spin_unlock(&entry_bin->hb_lock);
+ }
+
+ hash = hash_long((ulong_t)dtor_entry->he_key *
+ (ulong_t)dtor_entry->he_pid, table->ht_bits);
+ dtor_entry_bin = &table->ht_bins[hash];
+
+ spin_lock(&dtor_entry_bin->hb_lock);
+ tsd_hash_del(table, dtor_entry);
+ hlist_add_head(&dtor_entry->he_list, &work);
+ spin_unlock(&dtor_entry_bin->hb_lock);
+ spin_unlock(&table->ht_lock);
+
+ tsd_hash_dtor(&work);
+ *keyp = 0;
+}
+EXPORT_SYMBOL(tsd_destroy);
+
+/*
+ * tsd_exit - destroys all thread specific data for this thread
+ *
+ * Destroys all the thread specific data for this thread.
+ *
+ * Caller must prevent racing tsd_set() or tsd_get(), this function is
+ * safe from racing tsd_create(), tsd_destroy(), and tsd_exit().
+ */
+void
+tsd_exit(void)
+{
+ HLIST_HEAD(work);
+ tsd_hash_table_t *table;
+ tsd_hash_entry_t *pid_entry, *entry;
+ tsd_hash_bin_t *pid_entry_bin, *entry_bin;
+ ulong_t hash;
+
+ table = tsd_hash_table;
+ ASSERT3P(table, !=, NULL);
+
+ spin_lock(&table->ht_lock);
+ pid_entry = tsd_hash_search(table, PID_KEY, curthread->pid);
+ if (pid_entry == NULL) {
+ spin_unlock(&table->ht_lock);
+ return;
+ }
+
+ /*
+ * All keys associated with this pid must be linked off of the
+ * PID_KEY entry. They are removed from the hash table and
+ * linked in to a private working list to be destroyed.
+ */
+
+ while (!list_empty(&pid_entry->he_pid_list)) {
+ entry = list_entry(pid_entry->he_pid_list.next,
+ tsd_hash_entry_t, he_pid_list);
+ ASSERT3U(pid_entry->he_pid, ==, entry->he_pid);
+
+ hash = hash_long((ulong_t)entry->he_key *
+ (ulong_t)entry->he_pid, table->ht_bits);
+ entry_bin = &table->ht_bins[hash];
+
+ spin_lock(&entry_bin->hb_lock);
+ tsd_hash_del(table, entry);
+ hlist_add_head(&entry->he_list, &work);
+ spin_unlock(&entry_bin->hb_lock);
+ }
+
+ hash = hash_long((ulong_t)pid_entry->he_key *
+ (ulong_t)pid_entry->he_pid, table->ht_bits);
+ pid_entry_bin = &table->ht_bins[hash];
+
+ spin_lock(&pid_entry_bin->hb_lock);
+ tsd_hash_del(table, pid_entry);
+ hlist_add_head(&pid_entry->he_list, &work);
+ spin_unlock(&pid_entry_bin->hb_lock);
+ spin_unlock(&table->ht_lock);
+
+ tsd_hash_dtor(&work);
+}
+EXPORT_SYMBOL(tsd_exit);
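Taken together, the interfaces above follow the usual Solaris TSD pattern. The sketch below is an illustrative consumer, not part of this change; my_key, my_dtor, and the int value type are hypothetical:

	static uint_t my_key = 0;		/* hypothetical key variable */

	static void
	my_dtor(void *value)			/* hypothetical destructor */
	{
		kmem_free(value, sizeof (int));
	}

	/* Module init: allocate the key once; the dtor may be NULL. */
	tsd_create(&my_key, my_dtor);

	/* In any thread: attach a value, read it back, detach at exit. */
	int *val = kmem_alloc(sizeof (int), KM_SLEEP);
	VERIFY0(tsd_set(my_key, val));
	ASSERT3P(tsd_get(my_key), ==, val);
	tsd_exit();			/* runs my_dtor() on this thread's value */

	/* Module fini: run destructors for any remaining threads. */
	tsd_destroy(&my_key);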
+
+int
+spl_tsd_init(void)
+{
+ tsd_hash_table = tsd_hash_table_init(TSD_HASH_TABLE_BITS_DEFAULT);
+ if (tsd_hash_table == NULL)
+ return (1);
+
+ return (0);
+}
+
+void
+spl_tsd_fini(void)
+{
+ tsd_hash_table_fini(tsd_hash_table);
+ tsd_hash_table = NULL;
+}
diff --git a/module/os/linux/spl/spl-vmem.c b/module/os/linux/spl/spl-vmem.c
new file mode 100644
index 000000000..e1a84a911
--- /dev/null
+++ b/module/os/linux/spl/spl-vmem.c
@@ -0,0 +1,135 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <[email protected]>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <sys/debug.h>
+#include <sys/vmem.h>
+#include <sys/kmem_cache.h>
+#include <sys/shrinker.h>
+#include <linux/module.h>
+
+vmem_t *heap_arena = NULL;
+EXPORT_SYMBOL(heap_arena);
+
+vmem_t *zio_alloc_arena = NULL;
+EXPORT_SYMBOL(zio_alloc_arena);
+
+vmem_t *zio_arena = NULL;
+EXPORT_SYMBOL(zio_arena);
+
+#define VMEM_FLOOR_SIZE (4 * 1024 * 1024) /* 4MB floor */
+
+/*
+ * Return approximate virtual memory usage based on these assumptions:
+ *
+ * 1) The major SPL consumer of virtual memory is the kmem cache.
+ * 2) Memory allocated with vmem_alloc() is short lived and can be ignored.
+ * 3) Allow a 4MB floor as a generous pad given normal consumption.
+ * 4) The spl_kmem_cache_sem only contends with cache create/destroy.
+ */
+size_t
+vmem_size(vmem_t *vmp, int typemask)
+{
+ spl_kmem_cache_t *skc;
+ size_t alloc = VMEM_FLOOR_SIZE;
+
+ if ((typemask & VMEM_ALLOC) && (typemask & VMEM_FREE))
+ return (VMALLOC_TOTAL);
+
+ down_read(&spl_kmem_cache_sem);
+ list_for_each_entry(skc, &spl_kmem_cache_list, skc_list) {
+ if (skc->skc_flags & KMC_VMEM)
+ alloc += skc->skc_slab_size * skc->skc_slab_total;
+ }
+ up_read(&spl_kmem_cache_sem);
+
+ if (typemask & VMEM_ALLOC)
+ return (MIN(alloc, VMALLOC_TOTAL));
+ else if (typemask & VMEM_FREE)
+		return (VMALLOC_TOTAL > alloc ? VMALLOC_TOTAL - alloc : 0);
+ else
+ return (0);
+}
+EXPORT_SYMBOL(vmem_size);
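For example, a caller can approximate total, used, and free virtual memory as follows (a sketch; the vmp argument is ignored by this implementation, so NULL is fine):

	size_t total = vmem_size(NULL, VMEM_ALLOC | VMEM_FREE); /* VMALLOC_TOTAL */
	size_t used = vmem_size(NULL, VMEM_ALLOC);  /* 4MB floor + KMC_VMEM slabs */
	size_t free = vmem_size(NULL, VMEM_FREE);   /* total minus used */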
+
+/*
+ * Public vmem_alloc(), vmem_zalloc() and vmem_free() interfaces.
+ */
+void *
+spl_vmem_alloc(size_t size, int flags, const char *func, int line)
+{
+ ASSERT0(flags & ~KM_PUBLIC_MASK);
+
+ flags |= KM_VMEM;
+
+#if !defined(DEBUG_KMEM)
+ return (spl_kmem_alloc_impl(size, flags, NUMA_NO_NODE));
+#elif !defined(DEBUG_KMEM_TRACKING)
+ return (spl_kmem_alloc_debug(size, flags, NUMA_NO_NODE));
+#else
+ return (spl_kmem_alloc_track(size, flags, func, line, NUMA_NO_NODE));
+#endif
+}
+EXPORT_SYMBOL(spl_vmem_alloc);
+
+void *
+spl_vmem_zalloc(size_t size, int flags, const char *func, int line)
+{
+ ASSERT0(flags & ~KM_PUBLIC_MASK);
+
+ flags |= (KM_VMEM | KM_ZERO);
+
+#if !defined(DEBUG_KMEM)
+ return (spl_kmem_alloc_impl(size, flags, NUMA_NO_NODE));
+#elif !defined(DEBUG_KMEM_TRACKING)
+ return (spl_kmem_alloc_debug(size, flags, NUMA_NO_NODE));
+#else
+ return (spl_kmem_alloc_track(size, flags, func, line, NUMA_NO_NODE));
+#endif
+}
+EXPORT_SYMBOL(spl_vmem_zalloc);
+
+void
+spl_vmem_free(const void *buf, size_t size)
+{
+#if !defined(DEBUG_KMEM)
+ return (spl_kmem_free_impl(buf, size));
+#elif !defined(DEBUG_KMEM_TRACKING)
+ return (spl_kmem_free_debug(buf, size));
+#else
+ return (spl_kmem_free_track(buf, size));
+#endif
+}
+EXPORT_SYMBOL(spl_vmem_free);
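Consumers normally reach these through the vmem_alloc()/vmem_zalloc()/vmem_free() macros, which are assumed here to supply the __func__ and __LINE__ arguments; a hedged usage sketch:

	void *buf = vmem_zalloc(16 * PAGE_SIZE, KM_SLEEP);	/* zeroed, may sleep */
	/* ... use buf ... */
	vmem_free(buf, 16 * PAGE_SIZE);	/* size must match the allocation */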
+
+int
+spl_vmem_init(void)
+{
+ return (0);
+}
+
+void
+spl_vmem_fini(void)
+{
+}
diff --git a/module/os/linux/spl/spl-vnode.c b/module/os/linux/spl/spl-vnode.c
new file mode 100644
index 000000000..d9056c964
--- /dev/null
+++ b/module/os/linux/spl/spl-vnode.c
@@ -0,0 +1,719 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <[email protected]>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Solaris Porting Layer (SPL) Vnode Implementation.
+ */
+
+#include <sys/cred.h>
+#include <sys/vnode.h>
+#include <sys/kmem_cache.h>
+#include <linux/falloc.h>
+#include <linux/fs.h>
+#include <linux/uaccess.h>
+#ifdef HAVE_FDTABLE_HEADER
+#include <linux/fdtable.h>
+#endif
+
+vnode_t *rootdir = (vnode_t *)0xabcd1234;
+EXPORT_SYMBOL(rootdir);
+
+static spl_kmem_cache_t *vn_cache;
+static spl_kmem_cache_t *vn_file_cache;
+
+static spinlock_t vn_file_lock;
+static LIST_HEAD(vn_file_list);
+
+static int
+spl_filp_fallocate(struct file *fp, int mode, loff_t offset, loff_t len)
+{
+ int error = -EOPNOTSUPP;
+
+#ifdef HAVE_FILE_FALLOCATE
+ if (fp->f_op->fallocate)
+ error = fp->f_op->fallocate(fp, mode, offset, len);
+#else
+#ifdef HAVE_INODE_FALLOCATE
+ if (fp->f_dentry && fp->f_dentry->d_inode &&
+ fp->f_dentry->d_inode->i_op->fallocate)
+ error = fp->f_dentry->d_inode->i_op->fallocate(
+ fp->f_dentry->d_inode, mode, offset, len);
+#endif /* HAVE_INODE_FALLOCATE */
+#endif /* HAVE_FILE_FALLOCATE */
+
+ return (error);
+}
+
+static int
+spl_filp_fsync(struct file *fp, int sync)
+{
+#ifdef HAVE_2ARGS_VFS_FSYNC
+ return (vfs_fsync(fp, sync));
+#else
+ return (vfs_fsync(fp, (fp)->f_dentry, sync));
+#endif /* HAVE_2ARGS_VFS_FSYNC */
+}
+
+static ssize_t
+spl_kernel_write(struct file *file, const void *buf, size_t count, loff_t *pos)
+{
+#if defined(HAVE_KERNEL_WRITE_PPOS)
+ return (kernel_write(file, buf, count, pos));
+#else
+ mm_segment_t saved_fs;
+ ssize_t ret;
+
+ saved_fs = get_fs();
+ set_fs(KERNEL_DS);
+
+ ret = vfs_write(file, (__force const char __user *)buf, count, pos);
+
+ set_fs(saved_fs);
+
+ return (ret);
+#endif
+}
+
+static ssize_t
+spl_kernel_read(struct file *file, void *buf, size_t count, loff_t *pos)
+{
+#if defined(HAVE_KERNEL_READ_PPOS)
+ return (kernel_read(file, buf, count, pos));
+#else
+ mm_segment_t saved_fs;
+ ssize_t ret;
+
+ saved_fs = get_fs();
+ set_fs(KERNEL_DS);
+
+ ret = vfs_read(file, (void __user *)buf, count, pos);
+
+ set_fs(saved_fs);
+
+ return (ret);
+#endif
+}
+
+vtype_t
+vn_mode_to_vtype(mode_t mode)
+{
+ if (S_ISREG(mode))
+ return (VREG);
+
+ if (S_ISDIR(mode))
+ return (VDIR);
+
+ if (S_ISCHR(mode))
+ return (VCHR);
+
+ if (S_ISBLK(mode))
+ return (VBLK);
+
+ if (S_ISFIFO(mode))
+ return (VFIFO);
+
+ if (S_ISLNK(mode))
+ return (VLNK);
+
+ if (S_ISSOCK(mode))
+ return (VSOCK);
+
+ return (VNON);
+} /* vn_mode_to_vtype() */
+EXPORT_SYMBOL(vn_mode_to_vtype);
+
+mode_t
+vn_vtype_to_mode(vtype_t vtype)
+{
+ if (vtype == VREG)
+ return (S_IFREG);
+
+ if (vtype == VDIR)
+ return (S_IFDIR);
+
+ if (vtype == VCHR)
+ return (S_IFCHR);
+
+ if (vtype == VBLK)
+ return (S_IFBLK);
+
+ if (vtype == VFIFO)
+ return (S_IFIFO);
+
+ if (vtype == VLNK)
+ return (S_IFLNK);
+
+ if (vtype == VSOCK)
+ return (S_IFSOCK);
+
+ return (VNON);
+} /* vn_vtype_to_mode() */
+EXPORT_SYMBOL(vn_vtype_to_mode);
+
+vnode_t *
+vn_alloc(int flag)
+{
+ vnode_t *vp;
+
+ vp = kmem_cache_alloc(vn_cache, flag);
+ if (vp != NULL) {
+ vp->v_file = NULL;
+ vp->v_type = 0;
+ }
+
+ return (vp);
+} /* vn_alloc() */
+EXPORT_SYMBOL(vn_alloc);
+
+void
+vn_free(vnode_t *vp)
+{
+ kmem_cache_free(vn_cache, vp);
+} /* vn_free() */
+EXPORT_SYMBOL(vn_free);
+
+int
+vn_open(const char *path, uio_seg_t seg, int flags, int mode, vnode_t **vpp,
+ int x1, void *x2)
+{
+ struct file *fp;
+ struct kstat stat;
+ int rc, saved_umask = 0;
+ gfp_t saved_gfp;
+ vnode_t *vp;
+
+ ASSERT(flags & (FWRITE | FREAD));
+ ASSERT(seg == UIO_SYSSPACE);
+ ASSERT(vpp);
+ *vpp = NULL;
+
+ if (!(flags & FCREAT) && (flags & FWRITE))
+ flags |= FEXCL;
+
+ /*
+ * Note for filp_open() the two low bits must be remapped to mean:
+ * 01 - read-only -> 00 read-only
+ * 10 - write-only -> 01 write-only
+ * 11 - read-write -> 10 read-write
+ */
+ flags--;
+
+ if (flags & FCREAT)
+ saved_umask = xchg(&current->fs->umask, 0);
+
+ fp = filp_open(path, flags, mode);
+
+ if (flags & FCREAT)
+ (void) xchg(&current->fs->umask, saved_umask);
+
+ if (IS_ERR(fp))
+ return (-PTR_ERR(fp));
+
+#if defined(HAVE_4ARGS_VFS_GETATTR)
+ rc = vfs_getattr(&fp->f_path, &stat, STATX_TYPE, AT_STATX_SYNC_AS_STAT);
+#elif defined(HAVE_2ARGS_VFS_GETATTR)
+ rc = vfs_getattr(&fp->f_path, &stat);
+#else
+ rc = vfs_getattr(fp->f_path.mnt, fp->f_dentry, &stat);
+#endif
+ if (rc) {
+ filp_close(fp, 0);
+ return (-rc);
+ }
+
+ vp = vn_alloc(KM_SLEEP);
+ if (!vp) {
+ filp_close(fp, 0);
+ return (ENOMEM);
+ }
+
+ saved_gfp = mapping_gfp_mask(fp->f_mapping);
+ mapping_set_gfp_mask(fp->f_mapping, saved_gfp & ~(__GFP_IO|__GFP_FS));
+
+ mutex_enter(&vp->v_lock);
+ vp->v_type = vn_mode_to_vtype(stat.mode);
+ vp->v_file = fp;
+ vp->v_gfp_mask = saved_gfp;
+ *vpp = vp;
+ mutex_exit(&vp->v_lock);
+
+ return (0);
+} /* vn_open() */
+EXPORT_SYMBOL(vn_open);
+
+int
+vn_openat(const char *path, uio_seg_t seg, int flags, int mode,
+ vnode_t **vpp, int x1, void *x2, vnode_t *vp, int fd)
+{
+ char *realpath;
+ int len, rc;
+
+ ASSERT(vp == rootdir);
+
+ len = strlen(path) + 2;
+ realpath = kmalloc(len, kmem_flags_convert(KM_SLEEP));
+ if (!realpath)
+ return (ENOMEM);
+
+ (void) snprintf(realpath, len, "/%s", path);
+ rc = vn_open(realpath, seg, flags, mode, vpp, x1, x2);
+ kfree(realpath);
+
+ return (rc);
+} /* vn_openat() */
+EXPORT_SYMBOL(vn_openat);
+
+int
+vn_rdwr(uio_rw_t uio, vnode_t *vp, void *addr, ssize_t len, offset_t off,
+ uio_seg_t seg, int ioflag, rlim64_t x2, void *x3, ssize_t *residp)
+{
+ struct file *fp = vp->v_file;
+ loff_t offset = off;
+ int rc;
+
+ ASSERT(uio == UIO_WRITE || uio == UIO_READ);
+ ASSERT(seg == UIO_SYSSPACE);
+ ASSERT((ioflag & ~FAPPEND) == 0);
+
+ if (ioflag & FAPPEND)
+ offset = fp->f_pos;
+
+ if (uio & UIO_WRITE)
+ rc = spl_kernel_write(fp, addr, len, &offset);
+ else
+ rc = spl_kernel_read(fp, addr, len, &offset);
+
+ fp->f_pos = offset;
+
+ if (rc < 0)
+ return (-rc);
+
+ if (residp) {
+ *residp = len - rc;
+ } else {
+ if (rc != len)
+ return (EIO);
+ }
+
+ return (0);
+} /* vn_rdwr() */
+EXPORT_SYMBOL(vn_rdwr);
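vn_open(), vn_rdwr(), and vn_close() combine into the usual SPL pattern for kernel file I/O. A hedged sketch, with a hypothetical path and buffer size:

	vnode_t *vp;
	char buf[512];
	ssize_t resid;
	int rc;

	rc = vn_open("/etc/hostid", UIO_SYSSPACE, FREAD, 0644, &vp, 0, NULL);
	if (rc)
		return (rc);

	rc = vn_rdwr(UIO_READ, vp, buf, sizeof (buf), 0, UIO_SYSSPACE,
	    0, RLIM64_INFINITY, NULL, &resid);	/* resid = bytes not read */

	(void) vn_close(vp, FREAD, 0, 0, NULL, NULL);
	return (rc);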
+
+int
+vn_close(vnode_t *vp, int flags, int x1, int x2, void *x3, void *x4)
+{
+ int rc;
+
+ ASSERT(vp);
+ ASSERT(vp->v_file);
+
+ mapping_set_gfp_mask(vp->v_file->f_mapping, vp->v_gfp_mask);
+ rc = filp_close(vp->v_file, 0);
+ vn_free(vp);
+
+ return (-rc);
+} /* vn_close() */
+EXPORT_SYMBOL(vn_close);
+
+/*
+ * vn_seek() does not actually seek; it only performs bounds checking on the
+ * proposed seek. We perform minimal checking and allow vn_rdwr() to catch
+ * anything more serious.
+ */
+int
+vn_seek(vnode_t *vp, offset_t ooff, offset_t *noffp, void *ct)
+{
+ return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0);
+}
+EXPORT_SYMBOL(vn_seek);
+
+int
+vn_getattr(vnode_t *vp, vattr_t *vap, int flags, void *x3, void *x4)
+{
+ struct file *fp;
+ struct kstat stat;
+ int rc;
+
+ ASSERT(vp);
+ ASSERT(vp->v_file);
+ ASSERT(vap);
+
+ fp = vp->v_file;
+
+#if defined(HAVE_4ARGS_VFS_GETATTR)
+ rc = vfs_getattr(&fp->f_path, &stat, STATX_BASIC_STATS,
+ AT_STATX_SYNC_AS_STAT);
+#elif defined(HAVE_2ARGS_VFS_GETATTR)
+ rc = vfs_getattr(&fp->f_path, &stat);
+#else
+ rc = vfs_getattr(fp->f_path.mnt, fp->f_dentry, &stat);
+#endif
+ if (rc)
+ return (-rc);
+
+ vap->va_type = vn_mode_to_vtype(stat.mode);
+ vap->va_mode = stat.mode;
+ vap->va_uid = KUID_TO_SUID(stat.uid);
+ vap->va_gid = KGID_TO_SGID(stat.gid);
+ vap->va_fsid = 0;
+ vap->va_nodeid = stat.ino;
+ vap->va_nlink = stat.nlink;
+ vap->va_size = stat.size;
+ vap->va_blksize = stat.blksize;
+ vap->va_atime = stat.atime;
+ vap->va_mtime = stat.mtime;
+ vap->va_ctime = stat.ctime;
+ vap->va_rdev = stat.rdev;
+ vap->va_nblocks = stat.blocks;
+
+ return (0);
+}
+EXPORT_SYMBOL(vn_getattr);
+
+int
+vn_fsync(vnode_t *vp, int flags, void *x3, void *x4)
+{
+ int datasync = 0;
+ int error;
+ int fstrans;
+
+ ASSERT(vp);
+ ASSERT(vp->v_file);
+
+ if (flags & FDSYNC)
+ datasync = 1;
+
+ /*
+ * May enter XFS which generates a warning when PF_FSTRANS is set.
+	 * To avoid this the flag is cleared over the vfs_fsync() call and reset.
+ */
+ fstrans = __spl_pf_fstrans_check();
+ if (fstrans)
+ current->flags &= ~(__SPL_PF_FSTRANS);
+
+ error = -spl_filp_fsync(vp->v_file, datasync);
+ if (fstrans)
+ current->flags |= __SPL_PF_FSTRANS;
+
+ return (error);
+} /* vn_fsync() */
+EXPORT_SYMBOL(vn_fsync);
+
+int
+vn_space(vnode_t *vp, int cmd, struct flock *bfp, int flag,
+ offset_t offset, void *x6, void *x7)
+{
+ int error = EOPNOTSUPP;
+#ifdef FALLOC_FL_PUNCH_HOLE
+ int fstrans;
+#endif
+
+ if (cmd != F_FREESP || bfp->l_whence != SEEK_SET)
+ return (EOPNOTSUPP);
+
+ ASSERT(vp);
+ ASSERT(vp->v_file);
+ ASSERT(bfp->l_start >= 0 && bfp->l_len > 0);
+
+#ifdef FALLOC_FL_PUNCH_HOLE
+ /*
+ * May enter XFS which generates a warning when PF_FSTRANS is set.
+	 * To avoid this the flag is cleared over the fallocate() call and reset.
+ */
+ fstrans = __spl_pf_fstrans_check();
+ if (fstrans)
+ current->flags &= ~(__SPL_PF_FSTRANS);
+
+ /*
+	 * When supported by the underlying file system, preferentially
+	 * use the fallocate() callback to punch a hole and free the space.
+ */
+ error = -spl_filp_fallocate(vp->v_file,
+ FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE,
+ bfp->l_start, bfp->l_len);
+
+ if (fstrans)
+ current->flags |= __SPL_PF_FSTRANS;
+
+ if (error == 0)
+ return (0);
+#endif
+
+#ifdef HAVE_INODE_TRUNCATE_RANGE
+ if (vp->v_file->f_dentry && vp->v_file->f_dentry->d_inode &&
+ vp->v_file->f_dentry->d_inode->i_op &&
+ vp->v_file->f_dentry->d_inode->i_op->truncate_range) {
+ off_t end = bfp->l_start + bfp->l_len;
+ /*
+ * Judging from the code in shmem_truncate_range(),
+ * it seems the kernel expects the end offset to be
+ * inclusive and aligned to the end of a page.
+ */
+ if (end % PAGE_SIZE != 0) {
+ end &= ~(off_t)(PAGE_SIZE - 1);
+ if (end <= bfp->l_start)
+ return (0);
+ }
+ --end;
+
+ vp->v_file->f_dentry->d_inode->i_op->truncate_range(
+ vp->v_file->f_dentry->d_inode, bfp->l_start, end);
+
+ return (0);
+ }
+#endif
+
+ return (error);
+}
+EXPORT_SYMBOL(vn_space);
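A caller frees a byte range by issuing F_FREESP with a SEEK_SET flock; an illustrative sketch where vp, off, and len are hypothetical:

	struct flock bf = { 0 };

	bf.l_whence = SEEK_SET;
	bf.l_start = off;	/* start of the range to free */
	bf.l_len = len;		/* length of the range, must be > 0 */
	error = vn_space(vp, F_FREESP, &bf, FWRITE, 0, NULL, NULL);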
+
+/* Function must be called while holding the vn_file_lock */
+static file_t *
+file_find(int fd, struct task_struct *task)
+{
+ file_t *fp;
+
+ list_for_each_entry(fp, &vn_file_list, f_list) {
+ if (fd == fp->f_fd && fp->f_task == task) {
+ ASSERT(atomic_read(&fp->f_ref) != 0);
+ return (fp);
+ }
+ }
+
+ return (NULL);
+} /* file_find() */
+
+file_t *
+vn_getf(int fd)
+{
+ struct kstat stat;
+ struct file *lfp;
+ file_t *fp;
+ vnode_t *vp;
+ int rc = 0;
+
+ if (fd < 0)
+ return (NULL);
+
+	/* Already open; just take an extra reference */
+ spin_lock(&vn_file_lock);
+
+ fp = file_find(fd, current);
+ if (fp) {
+ lfp = fget(fd);
+ fput(fp->f_file);
+ /*
+ * areleasef() can cause us to see a stale reference when
+ * userspace has reused a file descriptor before areleasef()
+ * has run. fput() the stale reference and replace it. We
+ * retain the original reference count such that the concurrent
+ * areleasef() will decrement its reference and terminate.
+ */
+ if (lfp != fp->f_file) {
+ fp->f_file = lfp;
+ fp->f_vnode->v_file = lfp;
+ }
+ atomic_inc(&fp->f_ref);
+ spin_unlock(&vn_file_lock);
+ return (fp);
+ }
+
+ spin_unlock(&vn_file_lock);
+
+	/* File was not yet opened; create the object and set it up */
+ fp = kmem_cache_alloc(vn_file_cache, KM_SLEEP);
+ if (fp == NULL)
+ goto out;
+
+ mutex_enter(&fp->f_lock);
+
+ fp->f_fd = fd;
+ fp->f_task = current;
+ fp->f_offset = 0;
+ atomic_inc(&fp->f_ref);
+
+ lfp = fget(fd);
+ if (lfp == NULL)
+ goto out_mutex;
+
+ vp = vn_alloc(KM_SLEEP);
+ if (vp == NULL)
+ goto out_fget;
+
+#if defined(HAVE_4ARGS_VFS_GETATTR)
+ rc = vfs_getattr(&lfp->f_path, &stat, STATX_TYPE,
+ AT_STATX_SYNC_AS_STAT);
+#elif defined(HAVE_2ARGS_VFS_GETATTR)
+ rc = vfs_getattr(&lfp->f_path, &stat);
+#else
+ rc = vfs_getattr(lfp->f_path.mnt, lfp->f_dentry, &stat);
+#endif
+ if (rc)
+ goto out_vnode;
+
+ mutex_enter(&vp->v_lock);
+ vp->v_type = vn_mode_to_vtype(stat.mode);
+ vp->v_file = lfp;
+ mutex_exit(&vp->v_lock);
+
+ fp->f_vnode = vp;
+ fp->f_file = lfp;
+
+ /* Put it on the tracking list */
+ spin_lock(&vn_file_lock);
+ list_add(&fp->f_list, &vn_file_list);
+ spin_unlock(&vn_file_lock);
+
+ mutex_exit(&fp->f_lock);
+ return (fp);
+
+out_vnode:
+ vn_free(vp);
+out_fget:
+ fput(lfp);
+out_mutex:
+ mutex_exit(&fp->f_lock);
+ kmem_cache_free(vn_file_cache, fp);
+out:
+ return (NULL);
+} /* getf() */
+EXPORT_SYMBOL(getf);
+
+static void
+releasef_locked(file_t *fp)
+{
+ ASSERT(fp->f_file);
+ ASSERT(fp->f_vnode);
+
+ /* Unlinked from list, no refs, safe to free outside mutex */
+ fput(fp->f_file);
+ vn_free(fp->f_vnode);
+
+ kmem_cache_free(vn_file_cache, fp);
+}
+
+void
+vn_releasef(int fd)
+{
+ areleasef(fd, P_FINFO(current));
+}
+EXPORT_SYMBOL(releasef);
+
+void
+vn_areleasef(int fd, uf_info_t *fip)
+{
+ file_t *fp;
+ struct task_struct *task = (struct task_struct *)fip;
+
+ if (fd < 0)
+ return;
+
+ spin_lock(&vn_file_lock);
+ fp = file_find(fd, task);
+ if (fp) {
+ atomic_dec(&fp->f_ref);
+ if (atomic_read(&fp->f_ref) > 0) {
+ spin_unlock(&vn_file_lock);
+ return;
+ }
+
+ list_del(&fp->f_list);
+ releasef_locked(fp);
+ }
+ spin_unlock(&vn_file_lock);
+} /* releasef() */
+EXPORT_SYMBOL(areleasef);
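Through the getf()/releasef() macros (assumed in sys/vnode.h to map onto vn_getf()/vn_releasef() above), a user-supplied file descriptor is pinned and released like so:

	file_t *fp = getf(fd);
	if (fp == NULL)
		return (EBADF);

	/* ... use fp->f_vnode or fp->f_file while the reference is held ... */

	releasef(fd);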
+
+static int
+vn_cache_constructor(void *buf, void *cdrarg, int kmflags)
+{
+ struct vnode *vp = buf;
+
+ mutex_init(&vp->v_lock, NULL, MUTEX_DEFAULT, NULL);
+
+ return (0);
+} /* vn_cache_constructor() */
+
+static void
+vn_cache_destructor(void *buf, void *cdrarg)
+{
+ struct vnode *vp = buf;
+
+ mutex_destroy(&vp->v_lock);
+} /* vn_cache_destructor() */
+
+static int
+vn_file_cache_constructor(void *buf, void *cdrarg, int kmflags)
+{
+ file_t *fp = buf;
+
+ atomic_set(&fp->f_ref, 0);
+ mutex_init(&fp->f_lock, NULL, MUTEX_DEFAULT, NULL);
+ INIT_LIST_HEAD(&fp->f_list);
+
+ return (0);
+} /* vn_file_cache_constructor() */
+
+static void
+vn_file_cache_destructor(void *buf, void *cdrarg)
+{
+ file_t *fp = buf;
+
+ mutex_destroy(&fp->f_lock);
+} /* vn_file_cache_destructor() */
+
+int
+spl_vn_init(void)
+{
+ spin_lock_init(&vn_file_lock);
+
+ vn_cache = kmem_cache_create("spl_vn_cache",
+ sizeof (struct vnode), 64, vn_cache_constructor,
+ vn_cache_destructor, NULL, NULL, NULL, 0);
+
+ vn_file_cache = kmem_cache_create("spl_vn_file_cache",
+ sizeof (file_t), 64, vn_file_cache_constructor,
+ vn_file_cache_destructor, NULL, NULL, NULL, 0);
+
+ return (0);
+} /* spl_vn_init() */
+
+void
+spl_vn_fini(void)
+{
+ file_t *fp, *next_fp;
+ int leaked = 0;
+
+ spin_lock(&vn_file_lock);
+
+ list_for_each_entry_safe(fp, next_fp, &vn_file_list, f_list) {
+ list_del(&fp->f_list);
+ releasef_locked(fp);
+ leaked++;
+ }
+
+ spin_unlock(&vn_file_lock);
+
+ if (leaked > 0)
+ printk(KERN_WARNING "WARNING: %d vnode files leaked\n", leaked);
+
+ kmem_cache_destroy(vn_file_cache);
+ kmem_cache_destroy(vn_cache);
+} /* spl_vn_fini() */
diff --git a/module/os/linux/spl/spl-xdr.c b/module/os/linux/spl/spl-xdr.c
new file mode 100644
index 000000000..1dd31ffc1
--- /dev/null
+++ b/module/os/linux/spl/spl-xdr.c
@@ -0,0 +1,513 @@
+/*
+ * Copyright (c) 2008-2010 Sun Microsystems, Inc.
+ * Written by Ricardo Correia <[email protected]>
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Solaris Porting Layer (SPL) XDR Implementation.
+ */
+
+#include <linux/string.h>
+#include <sys/kmem.h>
+#include <sys/debug.h>
+#include <sys/types.h>
+#include <sys/sysmacros.h>
+#include <rpc/xdr.h>
+
+/*
+ * SPL's XDR mem implementation.
+ *
+ * This is used by libnvpair to serialize/deserialize the name-value pair data
+ * structures into byte arrays in a well-defined and portable manner.
+ *
+ * These data structures are used by the DMU/ZFS to flexibly manipulate various
+ * information in memory and later serialize it/deserialize it to disk.
+ * Examples of usages include the pool configuration, lists of pool and dataset
+ * properties, etc.
+ *
+ * Reference documentation for the XDR representation and XDR operations can be
+ * found in RFC 1832 and xdr(3), respectively.
+ *
+ * === Implementation shortcomings ===
+ *
+ * It is assumed that the following C types have the following sizes:
+ *
+ * char/unsigned char: 1 byte
+ * short/unsigned short: 2 bytes
+ * int/unsigned int: 4 bytes
+ * longlong_t/u_longlong_t: 8 bytes
+ *
+ * The C standard allows these types to be larger (and in the case of ints,
+ * shorter), so if that is the case on some compiler/architecture, the build
+ * will fail (on purpose).
+ *
+ * If someone wants to fix the code to work properly on such environments, then:
+ *
+ * 1) Preconditions should be added to xdrmem_enc functions to make sure the
+ * caller doesn't pass arguments which exceed the expected range.
+ * 2) Functions which take signed integers should be changed to properly do
+ * sign extension.
+ * 3) For ints with less than 32 bits, well.. I suspect you'll have bigger
+ * problems than this implementation.
+ *
+ * It is also assumed that:
+ *
+ * 1) Chars have 8 bits.
+ * 2) We can always do 32-bit-aligned int memory accesses and byte-aligned
+ * memcpy, memset and memcmp.
+ * 3) Arrays passed to xdr_array() are packed and the compiler/architecture
+ * supports element-sized-aligned memory accesses.
+ * 4) Negative integers are natively stored in two's complement binary
+ * representation.
+ *
+ * No checks are done for the 4 assumptions above, though.
+ *
+ * === Caller expectations ===
+ *
+ * Existing documentation does not describe the semantics of XDR operations very
+ * well. Therefore, some assumptions about failure semantics will be made and
+ * will be described below:
+ *
+ * 1) If any encoding operation fails (e.g., due to lack of buffer space),
+ * the stream should be considered valid only up to the encoding operation
+ * previous to the one that first failed. However, the stream size as returned
+ * by xdr_control() cannot be considered to be strictly correct (it may be
+ * bigger).
+ *
+ * Putting it another way, if there is an encoding failure it's undefined
+ * whether anything is added to the stream in that operation and therefore
+ * neither xdr_control() nor future encoding operations on the same stream can
+ * be relied upon to produce correct results.
+ *
+ * 2) If a decoding operation fails, it's undefined whether anything will be
+ * decoded into passed buffers/pointers during that operation, or what the
+ * values on those buffers will look like.
+ *
+ * Future decoding operations on the same stream will also have similar
+ * undefined behavior.
+ *
+ * 3) When the first decoding operation fails it is OK to trust the results of
+ * previous decoding operations on the same stream, as long as the caller
+ * expects a failure to be possible (e.g. due to end-of-stream).
+ *
+ * However, this is highly discouraged because the caller should know the
+ * stream size and should be coded to expect any decoding failure to be data
+ * corruption due to hardware, accidental or even malicious causes, which should
+ * be handled gracefully in all cases.
+ *
+ * In very rare situations where there are strong reasons to believe the data
+ * can be trusted to be valid and non-tampered with, then the caller may assume
+ * a decoding failure to be a bug (e.g. due to mismatched data types) and may
+ * fail non-gracefully.
+ *
+ * 4) Non-zero padding bytes will cause the decoding operation to fail.
+ *
+ * 5) Zero bytes on string types will also cause the decoding operation to fail.
+ *
+ * 6) It is assumed that either the pointer to the stream buffer given by the
+ * caller is 32-bit aligned or the architecture supports non-32-bit-aligned int
+ * memory accesses.
+ *
+ * 7) The stream buffer and encoding/decoding buffers/ptrs should not overlap.
+ *
+ * 8) If a caller passes pointers to non-kernel memory (e.g., pointers to user
+ * space or MMIO space), the computer may explode.
+ */
+
+static struct xdr_ops xdrmem_encode_ops;
+static struct xdr_ops xdrmem_decode_ops;
+
+void
+xdrmem_create(XDR *xdrs, const caddr_t addr, const uint_t size,
+ const enum xdr_op op)
+{
+ switch (op) {
+ case XDR_ENCODE:
+ xdrs->x_ops = &xdrmem_encode_ops;
+ break;
+ case XDR_DECODE:
+ xdrs->x_ops = &xdrmem_decode_ops;
+ break;
+ default:
+ xdrs->x_ops = NULL; /* Let the caller know we failed */
+ return;
+ }
+
+ xdrs->x_op = op;
+ xdrs->x_addr = addr;
+ xdrs->x_addr_end = addr + size;
+
+ if (xdrs->x_addr_end < xdrs->x_addr) {
+ xdrs->x_ops = NULL;
+ }
+}
+EXPORT_SYMBOL(xdrmem_create);
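A stream is created over a caller-supplied buffer and then driven through the operations table; a minimal encode sketch that dispatches through x_ops directly (wrapper macros are assumed to exist in rpc/xdr.h):

	char buf[64];
	XDR xdrs;
	unsigned int v = 42;

	xdrmem_create(&xdrs, buf, sizeof (buf), XDR_ENCODE);
	if (xdrs.x_ops == NULL || !xdrs.x_ops->xdr_u_int(&xdrs, &v))
		return (EFAULT);	/* create failed or out of buffer space */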
+
+static bool_t
+xdrmem_control(XDR *xdrs, int req, void *info)
+{
+ struct xdr_bytesrec *rec = (struct xdr_bytesrec *)info;
+
+ if (req != XDR_GET_BYTES_AVAIL)
+ return (FALSE);
+
+ rec->xc_is_last_record = TRUE; /* always TRUE in xdrmem streams */
+ rec->xc_num_avail = xdrs->x_addr_end - xdrs->x_addr;
+
+ return (TRUE);
+}
+
+static bool_t
+xdrmem_enc_bytes(XDR *xdrs, caddr_t cp, const uint_t cnt)
+{
+ uint_t size = roundup(cnt, 4);
+ uint_t pad;
+
+ if (size < cnt)
+ return (FALSE); /* Integer overflow */
+
+ if (xdrs->x_addr > xdrs->x_addr_end)
+ return (FALSE);
+
+ if (xdrs->x_addr_end - xdrs->x_addr < size)
+ return (FALSE);
+
+ memcpy(xdrs->x_addr, cp, cnt);
+
+ xdrs->x_addr += cnt;
+
+ pad = size - cnt;
+ if (pad > 0) {
+ memset(xdrs->x_addr, 0, pad);
+ xdrs->x_addr += pad;
+ }
+
+ return (TRUE);
+}
+
+static bool_t
+xdrmem_dec_bytes(XDR *xdrs, caddr_t cp, const uint_t cnt)
+{
+ static uint32_t zero = 0;
+ uint_t size = roundup(cnt, 4);
+ uint_t pad;
+
+ if (size < cnt)
+ return (FALSE); /* Integer overflow */
+
+ if (xdrs->x_addr > xdrs->x_addr_end)
+ return (FALSE);
+
+ if (xdrs->x_addr_end - xdrs->x_addr < size)
+ return (FALSE);
+
+ memcpy(cp, xdrs->x_addr, cnt);
+ xdrs->x_addr += cnt;
+
+ pad = size - cnt;
+ if (pad > 0) {
+ /* An inverted memchr() would be useful here... */
+ if (memcmp(&zero, xdrs->x_addr, pad) != 0)
+ return (FALSE);
+
+ xdrs->x_addr += pad;
+ }
+
+ return (TRUE);
+}
+
+static bool_t
+xdrmem_enc_uint32(XDR *xdrs, uint32_t val)
+{
+ if (xdrs->x_addr + sizeof (uint32_t) > xdrs->x_addr_end)
+ return (FALSE);
+
+ *((uint32_t *)xdrs->x_addr) = cpu_to_be32(val);
+
+ xdrs->x_addr += sizeof (uint32_t);
+
+ return (TRUE);
+}
+
+static bool_t
+xdrmem_dec_uint32(XDR *xdrs, uint32_t *val)
+{
+ if (xdrs->x_addr + sizeof (uint32_t) > xdrs->x_addr_end)
+ return (FALSE);
+
+ *val = be32_to_cpu(*((uint32_t *)xdrs->x_addr));
+
+ xdrs->x_addr += sizeof (uint32_t);
+
+ return (TRUE);
+}
+
+static bool_t
+xdrmem_enc_char(XDR *xdrs, char *cp)
+{
+ uint32_t val;
+
+ BUILD_BUG_ON(sizeof (char) != 1);
+ val = *((unsigned char *) cp);
+
+ return (xdrmem_enc_uint32(xdrs, val));
+}
+
+static bool_t
+xdrmem_dec_char(XDR *xdrs, char *cp)
+{
+ uint32_t val;
+
+ BUILD_BUG_ON(sizeof (char) != 1);
+
+ if (!xdrmem_dec_uint32(xdrs, &val))
+ return (FALSE);
+
+ /*
+ * If any of the 3 other bytes are non-zero then val will be greater
+ * than 0xff and we fail because according to the RFC, this block does
+ * not have a char encoded in it.
+ */
+ if (val > 0xff)
+ return (FALSE);
+
+ *((unsigned char *) cp) = val;
+
+ return (TRUE);
+}
+
+static bool_t
+xdrmem_enc_ushort(XDR *xdrs, unsigned short *usp)
+{
+ BUILD_BUG_ON(sizeof (unsigned short) != 2);
+
+ return (xdrmem_enc_uint32(xdrs, *usp));
+}
+
+static bool_t
+xdrmem_dec_ushort(XDR *xdrs, unsigned short *usp)
+{
+ uint32_t val;
+
+ BUILD_BUG_ON(sizeof (unsigned short) != 2);
+
+ if (!xdrmem_dec_uint32(xdrs, &val))
+ return (FALSE);
+
+ /*
+ * Short ints are not in the RFC, but we assume similar logic as in
+ * xdrmem_dec_char().
+ */
+ if (val > 0xffff)
+ return (FALSE);
+
+ *usp = val;
+
+ return (TRUE);
+}
+
+static bool_t
+xdrmem_enc_uint(XDR *xdrs, unsigned *up)
+{
+ BUILD_BUG_ON(sizeof (unsigned) != 4);
+
+ return (xdrmem_enc_uint32(xdrs, *up));
+}
+
+static bool_t
+xdrmem_dec_uint(XDR *xdrs, unsigned *up)
+{
+ BUILD_BUG_ON(sizeof (unsigned) != 4);
+
+ return (xdrmem_dec_uint32(xdrs, (uint32_t *)up));
+}
+
+static bool_t
+xdrmem_enc_ulonglong(XDR *xdrs, u_longlong_t *ullp)
+{
+ BUILD_BUG_ON(sizeof (u_longlong_t) != 8);
+
+ if (!xdrmem_enc_uint32(xdrs, *ullp >> 32))
+ return (FALSE);
+
+ return (xdrmem_enc_uint32(xdrs, *ullp & 0xffffffff));
+}
+
+static bool_t
+xdrmem_dec_ulonglong(XDR *xdrs, u_longlong_t *ullp)
+{
+ uint32_t low, high;
+
+ BUILD_BUG_ON(sizeof (u_longlong_t) != 8);
+
+ if (!xdrmem_dec_uint32(xdrs, &high))
+ return (FALSE);
+ if (!xdrmem_dec_uint32(xdrs, &low))
+ return (FALSE);
+
+ *ullp = ((u_longlong_t)high << 32) | low;
+
+ return (TRUE);
+}
+
+static bool_t
+xdr_enc_array(XDR *xdrs, caddr_t *arrp, uint_t *sizep, const uint_t maxsize,
+ const uint_t elsize, const xdrproc_t elproc)
+{
+ uint_t i;
+ caddr_t addr = *arrp;
+
+ if (*sizep > maxsize || *sizep > UINT_MAX / elsize)
+ return (FALSE);
+
+ if (!xdrmem_enc_uint(xdrs, sizep))
+ return (FALSE);
+
+ for (i = 0; i < *sizep; i++) {
+ if (!elproc(xdrs, addr))
+ return (FALSE);
+ addr += elsize;
+ }
+
+ return (TRUE);
+}
+
+static bool_t
+xdr_dec_array(XDR *xdrs, caddr_t *arrp, uint_t *sizep, const uint_t maxsize,
+ const uint_t elsize, const xdrproc_t elproc)
+{
+ uint_t i, size;
+ bool_t alloc = FALSE;
+ caddr_t addr;
+
+ if (!xdrmem_dec_uint(xdrs, sizep))
+ return (FALSE);
+
+ size = *sizep;
+
+ if (size > maxsize || size > UINT_MAX / elsize)
+ return (FALSE);
+
+ /*
+ * The Solaris man page says: "If *arrp is NULL when decoding,
+ * xdr_array() allocates memory and *arrp points to it".
+ */
+ if (*arrp == NULL) {
+ BUILD_BUG_ON(sizeof (uint_t) > sizeof (size_t));
+
+ *arrp = kmem_alloc(size * elsize, KM_NOSLEEP);
+ if (*arrp == NULL)
+ return (FALSE);
+
+ alloc = TRUE;
+ }
+
+ addr = *arrp;
+
+ for (i = 0; i < size; i++) {
+ if (!elproc(xdrs, addr)) {
+ if (alloc)
+ kmem_free(*arrp, size * elsize);
+ return (FALSE);
+ }
+ addr += elsize;
+ }
+
+ return (TRUE);
+}
+
+static bool_t
+xdr_enc_string(XDR *xdrs, char **sp, const uint_t maxsize)
+{
+ size_t slen = strlen(*sp);
+ uint_t len;
+
+ if (slen > maxsize)
+ return (FALSE);
+
+ len = slen;
+
+ if (!xdrmem_enc_uint(xdrs, &len))
+ return (FALSE);
+
+ return (xdrmem_enc_bytes(xdrs, *sp, len));
+}
+
+static bool_t
+xdr_dec_string(XDR *xdrs, char **sp, const uint_t maxsize)
+{
+ uint_t size;
+ bool_t alloc = FALSE;
+
+ if (!xdrmem_dec_uint(xdrs, &size))
+ return (FALSE);
+
+ if (size > maxsize || size > UINT_MAX - 1)
+ return (FALSE);
+
+ /*
+ * Solaris man page: "If *sp is NULL when decoding, xdr_string()
+ * allocates memory and *sp points to it".
+ */
+ if (*sp == NULL) {
+ BUILD_BUG_ON(sizeof (uint_t) > sizeof (size_t));
+
+ *sp = kmem_alloc(size + 1, KM_NOSLEEP);
+ if (*sp == NULL)
+ return (FALSE);
+
+ alloc = TRUE;
+ }
+
+ if (!xdrmem_dec_bytes(xdrs, *sp, size))
+ goto fail;
+
+ if (memchr(*sp, 0, size) != NULL)
+ goto fail;
+
+ (*sp)[size] = '\0';
+
+ return (TRUE);
+
+fail:
+ if (alloc)
+ kmem_free(*sp, size + 1);
+
+ return (FALSE);
+}
+
+static struct xdr_ops xdrmem_encode_ops = {
+ .xdr_control = xdrmem_control,
+ .xdr_char = xdrmem_enc_char,
+ .xdr_u_short = xdrmem_enc_ushort,
+ .xdr_u_int = xdrmem_enc_uint,
+ .xdr_u_longlong_t = xdrmem_enc_ulonglong,
+ .xdr_opaque = xdrmem_enc_bytes,
+ .xdr_string = xdr_enc_string,
+ .xdr_array = xdr_enc_array
+};
+
+static struct xdr_ops xdrmem_decode_ops = {
+ .xdr_control = xdrmem_control,
+ .xdr_char = xdrmem_dec_char,
+ .xdr_u_short = xdrmem_dec_ushort,
+ .xdr_u_int = xdrmem_dec_uint,
+ .xdr_u_longlong_t = xdrmem_dec_ulonglong,
+ .xdr_opaque = xdrmem_dec_bytes,
+ .xdr_string = xdr_dec_string,
+ .xdr_array = xdr_dec_array
+};
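Decoding mirrors encoding with XDR_DECODE over the same buffer. An illustrative round trip through the two ops tables above:

	XDR enc, dec;
	char buf[16];
	unsigned int in = 0xdeadbeef, out = 0;

	xdrmem_create(&enc, buf, sizeof (buf), XDR_ENCODE);
	VERIFY(enc.x_ops->xdr_u_int(&enc, &in));	/* stored big-endian */

	xdrmem_create(&dec, buf, sizeof (buf), XDR_DECODE);
	VERIFY(dec.x_ops->xdr_u_int(&dec, &out));
	ASSERT3U(in, ==, out);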
diff --git a/module/os/linux/spl/spl-zlib.c b/module/os/linux/spl/spl-zlib.c
new file mode 100644
index 000000000..62423343c
--- /dev/null
+++ b/module/os/linux/spl/spl-zlib.c
@@ -0,0 +1,217 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <[email protected]>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ *
+ *
+ * z_compress_level/z_uncompress are nearly identical copies of the
+ * compress2/uncompress functions provided by the official zlib package
+ * available at http://zlib.net/. The only changes made were to slightly
+ * adapt the functions called to match the linux kernel implementation
+ * of zlib. The full zlib license follows:
+ *
+ * zlib.h -- interface of the 'zlib' general purpose compression library
+ * version 1.2.5, April 19th, 2010
+ *
+ * Copyright (C) 1995-2010 Jean-loup Gailly and Mark Adler
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ *
+ * Jean-loup Gailly
+ * Mark Adler
+ */
+
+
+#include <sys/kmem.h>
+#include <sys/kmem_cache.h>
+#include <sys/zmod.h>
+
+static spl_kmem_cache_t *zlib_workspace_cache;
+
+/*
+ * A kmem_cache is used for the zlib workspaces to avoid having to vmalloc
+ * and vfree for every call. Using a kmem_cache also has the advantage
+ * of improving the odds that the memory used will be local to this cpu.
+ * To further improve things it might be wise to create a dedicated per-cpu
+ * workspace for use. This would take some additional care because we then
+ * must disable preemption around the critical section, and verify that
+ * zlib_deflate* and zlib_inflate* never internally call schedule().
+ */
+static void *
+zlib_workspace_alloc(int flags)
+{
+ return (kmem_cache_alloc(zlib_workspace_cache, flags & ~(__GFP_FS)));
+}
+
+static void
+zlib_workspace_free(void *workspace)
+{
+ kmem_cache_free(zlib_workspace_cache, workspace);
+}
+
+/*
+ * Compresses the source buffer into the destination buffer. The level
+ * parameter has the same meaning as in deflateInit. sourceLen is the byte
+ * length of the source buffer. Upon entry, destLen is the total size of the
+ * destination buffer, which must be at least 0.1% larger than sourceLen plus
+ * 12 bytes. Upon exit, destLen is the actual size of the compressed buffer.
+ *
+ * compress2 returns Z_OK if success, Z_MEM_ERROR if there was not enough
+ * memory, Z_BUF_ERROR if there was not enough room in the output buffer,
+ * Z_STREAM_ERROR if the level parameter is invalid.
+ */
+int
+z_compress_level(void *dest, size_t *destLen, const void *source,
+ size_t sourceLen, int level)
+{
+ z_stream stream;
+ int err;
+
+ stream.next_in = (Byte *)source;
+ stream.avail_in = (uInt)sourceLen;
+ stream.next_out = dest;
+ stream.avail_out = (uInt)*destLen;
+
+ if ((size_t)stream.avail_out != *destLen)
+ return (Z_BUF_ERROR);
+
+ stream.workspace = zlib_workspace_alloc(KM_SLEEP);
+ if (!stream.workspace)
+ return (Z_MEM_ERROR);
+
+ err = zlib_deflateInit(&stream, level);
+ if (err != Z_OK) {
+ zlib_workspace_free(stream.workspace);
+ return (err);
+ }
+
+ err = zlib_deflate(&stream, Z_FINISH);
+ if (err != Z_STREAM_END) {
+ zlib_deflateEnd(&stream);
+ zlib_workspace_free(stream.workspace);
+ return (err == Z_OK ? Z_BUF_ERROR : err);
+ }
+ *destLen = stream.total_out;
+
+ err = zlib_deflateEnd(&stream);
+ zlib_workspace_free(stream.workspace);
+
+ return (err);
+}
+EXPORT_SYMBOL(z_compress_level);
+
+/*
+ * Decompresses the source buffer into the destination buffer. sourceLen is
+ * the byte length of the source buffer. Upon entry, destLen is the total
+ * size of the destination buffer, which must be large enough to hold the
+ * entire uncompressed data. (The size of the uncompressed data must have
+ * been saved previously by the compressor and transmitted to the decompressor
+ * by some mechanism outside the scope of this compression library.)
+ * Upon exit, destLen is the actual size of the uncompressed data.
+ * This function can be used to decompress a whole file at once if the
+ * input file is mmap'ed.
+ *
+ * uncompress returns Z_OK if success, Z_MEM_ERROR if there was not
+ * enough memory, Z_BUF_ERROR if there was not enough room in the output
+ * buffer, or Z_DATA_ERROR if the input data was corrupted.
+ */
+int
+z_uncompress(void *dest, size_t *destLen, const void *source, size_t sourceLen)
+{
+ z_stream stream;
+ int err;
+
+ stream.next_in = (Byte *)source;
+ stream.avail_in = (uInt)sourceLen;
+ stream.next_out = dest;
+ stream.avail_out = (uInt)*destLen;
+
+ if ((size_t)stream.avail_out != *destLen)
+ return (Z_BUF_ERROR);
+
+ stream.workspace = zlib_workspace_alloc(KM_SLEEP);
+ if (!stream.workspace)
+ return (Z_MEM_ERROR);
+
+ err = zlib_inflateInit(&stream);
+ if (err != Z_OK) {
+ zlib_workspace_free(stream.workspace);
+ return (err);
+ }
+
+ err = zlib_inflate(&stream, Z_FINISH);
+ if (err != Z_STREAM_END) {
+ zlib_inflateEnd(&stream);
+ zlib_workspace_free(stream.workspace);
+
+ if (err == Z_NEED_DICT ||
+ (err == Z_BUF_ERROR && stream.avail_in == 0))
+ return (Z_DATA_ERROR);
+
+ return (err);
+ }
+ *destLen = stream.total_out;
+
+ err = zlib_inflateEnd(&stream);
+ zlib_workspace_free(stream.workspace);
+
+ return (err);
+}
+EXPORT_SYMBOL(z_uncompress);
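Together the two functions form a simple round trip. An illustrative sketch where src, src_len, and the out buffer are hypothetical, and the destination is sized by the 0.1% plus 12 bytes rule documented above:

	size_t dsize = src_len + src_len / 1000 + 12;	/* worst-case growth */
	size_t dlen = dsize, ulen = src_len;
	void *dst = vmem_alloc(dsize, KM_SLEEP);
	int err;

	err = z_compress_level(dst, &dlen, src, src_len, Z_DEFAULT_COMPRESSION);
	if (err == Z_OK)
		err = z_uncompress(out, &ulen, dst, dlen);	/* ulen in: src_len */

	vmem_free(dst, dsize);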
+
+int
+spl_zlib_init(void)
+{
+ int size;
+
+ size = MAX(spl_zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL),
+ zlib_inflate_workspacesize());
+
+ zlib_workspace_cache = kmem_cache_create(
+ "spl_zlib_workspace_cache",
+ size, 0, NULL, NULL, NULL, NULL, NULL,
+ KMC_VMEM);
+ if (!zlib_workspace_cache)
+ return (1);
+
+ return (0);
+}
+
+void
+spl_zlib_fini(void)
+{
+ kmem_cache_destroy(zlib_workspace_cache);
+ zlib_workspace_cache = NULL;
+}
diff --git a/module/os/linux/zfs/Makefile.in b/module/os/linux/zfs/Makefile.in
new file mode 100644
index 000000000..84900bd2c
--- /dev/null
+++ b/module/os/linux/zfs/Makefile.in
@@ -0,0 +1,34 @@
+#
+# Linux specific sources included from module/zfs/Makefile.in
+#
+
+# Suppress unused-value warnings in sparc64 architecture headers
+ifeq ($(target_cpu),sparc64)
+ccflags-y += -Wno-unused-value
+endif
+
+ccflags-y += -I@abs_top_srcdir@/module/os/linux/zfs
+
+$(MODULE)-objs += ../os/linux/zfs/abd.o
+$(MODULE)-objs += ../os/linux/zfs/policy.o
+$(MODULE)-objs += ../os/linux/zfs/qat.o
+$(MODULE)-objs += ../os/linux/zfs/qat_compress.o
+$(MODULE)-objs += ../os/linux/zfs/qat_crypt.o
+$(MODULE)-objs += ../os/linux/zfs/spa_stats.o
+$(MODULE)-objs += ../os/linux/zfs/vdev_disk.o
+$(MODULE)-objs += ../os/linux/zfs/vdev_file.o
+$(MODULE)-objs += ../os/linux/zfs/zfs_acl.o
+$(MODULE)-objs += ../os/linux/zfs/zfs_ctldir.o
+$(MODULE)-objs += ../os/linux/zfs/zfs_debug.o
+$(MODULE)-objs += ../os/linux/zfs/zfs_dir.o
+$(MODULE)-objs += ../os/linux/zfs/zfs_sysfs.o
+$(MODULE)-objs += ../os/linux/zfs/zfs_vfsops.o
+$(MODULE)-objs += ../os/linux/zfs/zfs_vnops.o
+$(MODULE)-objs += ../os/linux/zfs/zfs_znode.o
+$(MODULE)-objs += ../os/linux/zfs/zio_crypt.o
+$(MODULE)-objs += ../os/linux/zfs/zpl_ctldir.o
+$(MODULE)-objs += ../os/linux/zfs/zpl_export.o
+$(MODULE)-objs += ../os/linux/zfs/zpl_file.o
+$(MODULE)-objs += ../os/linux/zfs/zpl_inode.o
+$(MODULE)-objs += ../os/linux/zfs/zpl_super.o
+$(MODULE)-objs += ../os/linux/zfs/zpl_xattr.o
diff --git a/module/os/linux/zfs/abd.c b/module/os/linux/zfs/abd.c
new file mode 100644
index 000000000..ac6b0b742
--- /dev/null
+++ b/module/os/linux/zfs/abd.c
@@ -0,0 +1,1638 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2014 by Chunwei Chen. All rights reserved.
+ * Copyright (c) 2019 by Delphix. All rights reserved.
+ */
+
+/*
+ * ARC buffer data (ABD).
+ *
+ * ABDs are an abstract data structure for the ARC which can use two
+ * different ways of storing the underlying data:
+ *
+ * (a) Linear buffer. In this case, all the data in the ABD is stored in one
+ * contiguous buffer in memory (from a zio_[data_]buf_* kmem cache).
+ *
+ * +-------------------+
+ * | ABD (linear) |
+ * | abd_flags = ... |
+ * | abd_size = ... | +--------------------------------+
+ * | abd_buf ------------->| raw buffer of size abd_size |
+ * +-------------------+ +--------------------------------+
+ * no abd_chunks
+ *
+ * (b) Scattered buffer. In this case, the data in the ABD is split into
+ * equal-sized chunks (from the abd_chunk_cache kmem_cache), with pointers
+ * to the chunks recorded in an array at the end of the ABD structure.
+ *
+ * +-------------------+
+ * | ABD (scattered) |
+ * | abd_flags = ... |
+ * | abd_size = ... |
+ * | abd_offset = 0 | +-----------+
+ * | abd_chunks[0] ----------------------------->| chunk 0 |
+ * | abd_chunks[1] ---------------------+ +-----------+
+ * | ... | | +-----------+
+ * | abd_chunks[N-1] ---------+ +------->| chunk 1 |
+ * +-------------------+ | +-----------+
+ * | ...
+ * | +-----------+
+ * +----------------->| chunk N-1 |
+ * +-----------+
+ *
+ * Linear buffers act exactly like normal buffers and are always mapped into the
+ * kernel's virtual memory space, while scattered ABD data chunks are allocated
+ * as physical pages and then mapped in only while they are actually being
+ * accessed through one of the abd_* library functions. Using scattered ABDs
+ * provides several benefits:
+ *
+ * (1) They avoid use of kmem_*, preventing performance problems where running
+ * kmem_reap on very large memory systems never finishes and causes
+ * constant TLB shootdowns.
+ *
+ * (2) Fragmentation is less of an issue since when we are at the limit of
+ * allocatable space, we won't have to search around for a long free
+ * hole in the VA space for large ARC allocations. Each chunk is mapped in
+ * individually, so even if we are using HIGHMEM (see next point) we
+ * wouldn't need to worry about finding a contiguous address range.
+ *
+ * (3) If we are not using HIGHMEM, then all physical memory is always
+ * mapped into the kernel's address space, so we also avoid the map /
+ * unmap costs on each ABD access.
+ *
+ * If we are not using HIGHMEM, scattered buffers which have only one chunk
+ * can be treated as linear buffers, because they are contiguous in the
+ * kernel's virtual address space. See abd_alloc_pages() for details.
+ *
+ * It is possible to make all ABDs linear by setting zfs_abd_scatter_enabled to
+ * B_FALSE.
+ *
+ * In addition to directly allocating a linear or scattered ABD, it is also
+ * possible to create an ABD by requesting the "sub-ABD" starting at an offset
+ * within an existing ABD. In linear buffers this is simple (set abd_buf of
+ * the new ABD to the starting point within the original raw buffer), but
+ * scattered ABDs are a little more complex. The new ABD makes a copy of the
+ * relevant abd_chunks pointers (but not the underlying data). However, to
+ * provide arbitrary rather than only chunk-aligned starting offsets, it also
+ * tracks an abd_offset field which represents the starting point of the data
+ * within the first chunk in abd_chunks. For both linear and scattered ABDs,
+ * creating an offset ABD marks the original ABD as the offset's parent, and the
+ * original ABD's abd_children refcount is incremented. This data allows us to
+ * ensure the root ABD isn't deleted before its children.
+ *
+ * Most consumers should never need to know what type of ABD they're using --
+ * the ABD public API ensures that it's possible to transparently switch from
+ * using a linear ABD to a scattered one when doing so would be beneficial.
+ *
+ * If you need to use the data within an ABD directly and you know it's linear
+ * (because you allocated it), you can use abd_to_buf() to access the underlying
+ * raw buffer. Otherwise, you should use one of the abd_borrow_buf* functions
+ * which will allocate a raw buffer if necessary. Use the abd_return_buf*
+ * functions to return any raw buffers that are no longer necessary when you're
+ * done using them.
+ *
+ * There are a variety of ABD APIs that implement basic buffer operations:
+ * compare, copy, read, write, and fill with zeroes. If you need a custom
+ * function which progressively accesses the whole ABD, use the abd_iterate_*
+ * functions.
+ */
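+
+/*
+ * A minimal usage sketch (illustrative only; my_process_buf() is a
+ * hypothetical helper, not part of this API): a consumer that needs direct
+ * access to the data of an ABD of unknown type can use the borrow/return
+ * functions described above.
+ *
+ *	abd_t *abd = abd_alloc(size, B_FALSE);
+ *	void *buf = abd_borrow_buf_copy(abd, size);
+ *	my_process_buf(buf, size);
+ *	abd_return_buf_copy(abd, buf, size);
+ *	abd_free(abd);
+ */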
+
+#include <sys/abd.h>
+#include <sys/param.h>
+#include <sys/zio.h>
+#include <sys/zfs_context.h>
+#include <sys/zfs_znode.h>
+#ifdef _KERNEL
+#include <linux/scatterlist.h>
+#include <linux/kmap_compat.h>
+#else
+#define MAX_ORDER 1
+#endif
+
+typedef struct abd_stats {
+ kstat_named_t abdstat_struct_size;
+ kstat_named_t abdstat_linear_cnt;
+ kstat_named_t abdstat_linear_data_size;
+ kstat_named_t abdstat_scatter_cnt;
+ kstat_named_t abdstat_scatter_data_size;
+ kstat_named_t abdstat_scatter_chunk_waste;
+ kstat_named_t abdstat_scatter_orders[MAX_ORDER];
+ kstat_named_t abdstat_scatter_page_multi_chunk;
+ kstat_named_t abdstat_scatter_page_multi_zone;
+ kstat_named_t abdstat_scatter_page_alloc_retry;
+ kstat_named_t abdstat_scatter_sg_table_retry;
+} abd_stats_t;
+
+static abd_stats_t abd_stats = {
+ /* Amount of memory occupied by all of the abd_t struct allocations */
+ { "struct_size", KSTAT_DATA_UINT64 },
+ /*
+ * The number of linear ABDs which are currently allocated, excluding
+ * ABDs which don't own their data (for instance the ones which were
+ * allocated through abd_get_offset() and abd_get_from_buf()). If an
+ * ABD takes ownership of its buf then it will become tracked.
+ */
+ { "linear_cnt", KSTAT_DATA_UINT64 },
+ /* Amount of data stored in all linear ABDs tracked by linear_cnt */
+ { "linear_data_size", KSTAT_DATA_UINT64 },
+ /*
+ * The number of scatter ABDs which are currently allocated, excluding
+ * ABDs which don't own their data (for instance the ones which were
+ * allocated through abd_get_offset()).
+ */
+ { "scatter_cnt", KSTAT_DATA_UINT64 },
+ /* Amount of data stored in all scatter ABDs tracked by scatter_cnt */
+ { "scatter_data_size", KSTAT_DATA_UINT64 },
+ /*
+ * The amount of space wasted at the end of the last chunk across all
+ * scatter ABDs tracked by scatter_cnt.
+ */
+ { "scatter_chunk_waste", KSTAT_DATA_UINT64 },
+ /*
+ * The number of compound allocations of a given order. These
+ * allocations are spread over all currently allocated ABDs, and
+ * act as a measure of memory fragmentation.
+ */
+ { { "scatter_order_N", KSTAT_DATA_UINT64 } },
+ /*
+ * The number of scatter ABDs which contain multiple chunks.
+ * ABDs are preferentially allocated from the minimum number of
+ * contiguous multi-page chunks; a single chunk is optimal.
+ */
+ { "scatter_page_multi_chunk", KSTAT_DATA_UINT64 },
+ /*
+ * The number of scatter ABDs which are split across memory zones.
+ * ABDs are preferentially allocated using pages from a single zone.
+ */
+ { "scatter_page_multi_zone", KSTAT_DATA_UINT64 },
+ /*
+ * The total number of retries encountered when attempting to
+ * allocate the pages to populate the scatter ABD.
+ */
+ { "scatter_page_alloc_retry", KSTAT_DATA_UINT64 },
+ /*
+ * The total number of retries encountered when attempting to
+ * allocate the sg table for an ABD.
+ */
+ { "scatter_sg_table_retry", KSTAT_DATA_UINT64 },
+};
+
+#define ABDSTAT(stat) (abd_stats.stat.value.ui64)
+#define ABDSTAT_INCR(stat, val) \
+ atomic_add_64(&abd_stats.stat.value.ui64, (val))
+#define ABDSTAT_BUMP(stat) ABDSTAT_INCR(stat, 1)
+#define ABDSTAT_BUMPDOWN(stat) ABDSTAT_INCR(stat, -1)
+
+#define ABD_SCATTER(abd) (abd->abd_u.abd_scatter)
+#define ABD_BUF(abd) (abd->abd_u.abd_linear.abd_buf)
+#define abd_for_each_sg(abd, sg, n, i) \
+ for_each_sg(ABD_SCATTER(abd).abd_sgl, sg, n, i)
+
+/* see block comment above for description */
+int zfs_abd_scatter_enabled = B_TRUE;
+unsigned zfs_abd_scatter_max_order = MAX_ORDER - 1;
+
+/*
+ * zfs_abd_scatter_min_size is the minimum allocation size to use scatter
+ * ABD's. Smaller allocations will use linear ABD's, which use
+ * zio_[data_]buf_alloc().
+ *
+ * Scatter ABD's use at least one page each, so sub-page allocations waste
+ * some space when allocated as scatter (e.g. 2KB scatter allocation wastes
+ * half of each page). Using linear ABD's for small allocations means that
+ * they will be put on slabs which contain many allocations. This can
+ * improve memory efficiency, but it also makes it much harder for ARC
+ * evictions to actually free pages, because all the buffers on one slab need
+ * to be freed in order for the slab (and underlying pages) to be freed.
+ * Typically, 512B and 1KB kmem caches have 16 buffers per slab, so it's
+ * possible for them to actually waste more memory than scatter (one page per
+ * buf = wasting 3/4 or 7/8th; one buf per slab = wasting 15/16th).
+ *
+ * Spill blocks are typically 512B and are heavily used on systems running
+ * selinux with the default dnode size and the `xattr=sa` property set.
+ *
+ * By default we use linear allocations for 512B and 1KB, and scatter
+ * allocations for larger (1.5KB and up).
+ */
+int zfs_abd_scatter_min_size = 512 * 3;
+
+static kmem_cache_t *abd_cache = NULL;
+static kstat_t *abd_ksp;
+
+static inline size_t
+abd_chunkcnt_for_bytes(size_t size)
+{
+ return (P2ROUNDUP(size, PAGESIZE) / PAGESIZE);
+}
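+
+/*
+ * Worked example (assuming PAGESIZE == 4096): abd_chunkcnt_for_bytes(1)
+ * and abd_chunkcnt_for_bytes(4096) both return 1, while
+ * abd_chunkcnt_for_bytes(6000) rounds up to 8192 bytes and returns 2.
+ */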
+
+#ifdef _KERNEL
+/*
+ * Mark zfs data pages so they can be excluded from kernel crash dumps
+ */
+#ifdef _LP64
+#define ABD_FILE_CACHE_PAGE 0x2F5ABDF11ECAC4E
+
+static inline void
+abd_mark_zfs_page(struct page *page)
+{
+ get_page(page);
+ SetPagePrivate(page);
+ set_page_private(page, ABD_FILE_CACHE_PAGE);
+}
+
+static inline void
+abd_unmark_zfs_page(struct page *page)
+{
+ set_page_private(page, 0UL);
+ ClearPagePrivate(page);
+ put_page(page);
+}
+#else
+#define abd_mark_zfs_page(page)
+#define abd_unmark_zfs_page(page)
+#endif /* _LP64 */
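+
+/*
+ * A crash-dump filter can then recognize and skip these pages by testing
+ * the private field, along the lines of this hypothetical sketch (the
+ * filter itself lives outside this file; exclude_page_from_dump() is
+ * illustrative):
+ *
+ *	if (PagePrivate(page) &&
+ *	    page_private(page) == ABD_FILE_CACHE_PAGE)
+ *		exclude_page_from_dump(page);
+ */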
+
+#ifndef CONFIG_HIGHMEM
+
+#ifndef __GFP_RECLAIM
+#define __GFP_RECLAIM __GFP_WAIT
+#endif
+
+/*
+ * The goal is to minimize fragmentation by preferentially populating ABDs
+ * with higher order compound pages from a single zone. Allocation size is
+ * progressively decreased until it can be satisfied without performing
+ * reclaim or compaction. When necessary this function will degenerate to
+ * allocating individual pages and allowing reclaim to satisfy allocations.
+ */
+static void
+abd_alloc_pages(abd_t *abd, size_t size)
+{
+ struct list_head pages;
+ struct sg_table table;
+ struct scatterlist *sg;
+ struct page *page, *tmp_page = NULL;
+ gfp_t gfp = __GFP_NOWARN | GFP_NOIO;
+ gfp_t gfp_comp = (gfp | __GFP_NORETRY | __GFP_COMP) & ~__GFP_RECLAIM;
+ int max_order = MIN(zfs_abd_scatter_max_order, MAX_ORDER - 1);
+ int nr_pages = abd_chunkcnt_for_bytes(size);
+ int chunks = 0, zones = 0;
+ size_t remaining_size;
+ int nid = NUMA_NO_NODE;
+ int alloc_pages = 0;
+
+ INIT_LIST_HEAD(&pages);
+
+ while (alloc_pages < nr_pages) {
+ unsigned chunk_pages;
+ int order;
+
+ order = MIN(highbit64(nr_pages - alloc_pages) - 1, max_order);
+ chunk_pages = (1U << order);
+
+ page = alloc_pages_node(nid, order ? gfp_comp : gfp, order);
+ if (page == NULL) {
+ if (order == 0) {
+ ABDSTAT_BUMP(abdstat_scatter_page_alloc_retry);
+ schedule_timeout_interruptible(1);
+ } else {
+ max_order = MAX(0, order - 1);
+ }
+ continue;
+ }
+
+ list_add_tail(&page->lru, &pages);
+
+ if ((nid != NUMA_NO_NODE) && (page_to_nid(page) != nid))
+ zones++;
+
+ nid = page_to_nid(page);
+ ABDSTAT_BUMP(abdstat_scatter_orders[order]);
+ chunks++;
+ alloc_pages += chunk_pages;
+ }
+
+ ASSERT3S(alloc_pages, ==, nr_pages);
+
+ while (sg_alloc_table(&table, chunks, gfp)) {
+ ABDSTAT_BUMP(abdstat_scatter_sg_table_retry);
+ schedule_timeout_interruptible(1);
+ }
+
+ sg = table.sgl;
+ remaining_size = size;
+ list_for_each_entry_safe(page, tmp_page, &pages, lru) {
+ size_t sg_size = MIN(PAGESIZE << compound_order(page),
+ remaining_size);
+ sg_set_page(sg, page, sg_size, 0);
+ abd_mark_zfs_page(page);
+ remaining_size -= sg_size;
+
+ sg = sg_next(sg);
+ list_del(&page->lru);
+ }
+
+ /*
+ * These conditions ensure that a possible transformation to a linear
+ * ABD would be valid.
+ */
+ ASSERT(!PageHighMem(sg_page(table.sgl)));
+ ASSERT0(ABD_SCATTER(abd).abd_offset);
+
+ if (table.nents == 1) {
+ /*
+ * Since there is only one entry, this ABD can be represented
+ * as a linear buffer. All single-page (4K) ABD's can be
+ * represented this way. Some multi-page ABD's can also be
+ * represented this way, if we were able to allocate a single
+ * "chunk" (higher-order "page" which represents a power-of-2
+ * series of physically-contiguous pages). This is often the
+ * case for 2-page (8K) ABD's.
+ *
+ * Representing a single-entry scatter ABD as a linear ABD
+ * has the performance advantage of avoiding the copy (and
+ * allocation) in abd_borrow_buf_copy / abd_return_buf_copy.
+ * A performance increase of around 5% has been observed for
+ * ARC-cached reads (of small blocks which can take advantage
+ * of this).
+ *
+ * Note that this optimization is only possible because the
+ * pages are always mapped into the kernel's address space.
+ * This is not the case for highmem pages, so the
+ * optimization can not be made there.
+ */
+ abd->abd_flags |= ABD_FLAG_LINEAR;
+ abd->abd_flags |= ABD_FLAG_LINEAR_PAGE;
+ abd->abd_u.abd_linear.abd_sgl = table.sgl;
+ abd->abd_u.abd_linear.abd_buf =
+ page_address(sg_page(table.sgl));
+ } else if (table.nents > 1) {
+ ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk);
+ abd->abd_flags |= ABD_FLAG_MULTI_CHUNK;
+
+ if (zones) {
+ ABDSTAT_BUMP(abdstat_scatter_page_multi_zone);
+ abd->abd_flags |= ABD_FLAG_MULTI_ZONE;
+ }
+
+ ABD_SCATTER(abd).abd_sgl = table.sgl;
+ ABD_SCATTER(abd).abd_nents = table.nents;
+ }
+}
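+
+/*
+ * Worked example of the allocation loop above (illustrative, assuming 4K
+ * pages and a permissive max_order): a 28K request gives nr_pages == 7, so
+ * the loop first tries order = highbit64(7) - 1 = 2 (one 4-page chunk),
+ * then order = highbit64(3) - 1 = 1 (one 2-page chunk), and finally a
+ * single order-0 page. If a compound allocation fails, max_order decays
+ * until the request can be satisfied by individual pages.
+ */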
+#else
+/*
+ * Allocate N individual pages to construct a scatter ABD. This function
+ * makes no attempt to request contiguous pages and requires the minimal
+ * number of kernel interfaces. It's designed for maximum compatibility.
+ */
+static void
+abd_alloc_pages(abd_t *abd, size_t size)
+{
+ struct scatterlist *sg = NULL;
+ struct sg_table table;
+ struct page *page;
+ gfp_t gfp = __GFP_NOWARN | GFP_NOIO;
+ int nr_pages = abd_chunkcnt_for_bytes(size);
+ int i = 0;
+
+ while (sg_alloc_table(&table, nr_pages, gfp)) {
+ ABDSTAT_BUMP(abdstat_scatter_sg_table_retry);
+ schedule_timeout_interruptible(1);
+ }
+
+ ASSERT3U(table.nents, ==, nr_pages);
+ ABD_SCATTER(abd).abd_sgl = table.sgl;
+ ABD_SCATTER(abd).abd_nents = nr_pages;
+
+ abd_for_each_sg(abd, sg, nr_pages, i) {
+ while ((page = __page_cache_alloc(gfp)) == NULL) {
+ ABDSTAT_BUMP(abdstat_scatter_page_alloc_retry);
+ schedule_timeout_interruptible(1);
+ }
+
+ ABDSTAT_BUMP(abdstat_scatter_orders[0]);
+ sg_set_page(sg, page, PAGESIZE, 0);
+ abd_mark_zfs_page(page);
+ }
+
+ if (nr_pages > 1) {
+ ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk);
+ abd->abd_flags |= ABD_FLAG_MULTI_CHUNK;
+ }
+}
+#endif /* !CONFIG_HIGHMEM */
+
+static void
+abd_free_pages(abd_t *abd)
+{
+ struct scatterlist *sg = NULL;
+ struct sg_table table;
+ struct page *page;
+ int nr_pages = ABD_SCATTER(abd).abd_nents;
+ int order, i = 0;
+
+ if (abd->abd_flags & ABD_FLAG_MULTI_ZONE)
+ ABDSTAT_BUMPDOWN(abdstat_scatter_page_multi_zone);
+
+ if (abd->abd_flags & ABD_FLAG_MULTI_CHUNK)
+ ABDSTAT_BUMPDOWN(abdstat_scatter_page_multi_chunk);
+
+ abd_for_each_sg(abd, sg, nr_pages, i) {
+ page = sg_page(sg);
+ abd_unmark_zfs_page(page);
+ order = compound_order(page);
+ __free_pages(page, order);
+ ASSERT3U(sg->length, <=, PAGE_SIZE << order);
+ ABDSTAT_BUMPDOWN(abdstat_scatter_orders[order]);
+ }
+
+ table.sgl = ABD_SCATTER(abd).abd_sgl;
+ table.nents = table.orig_nents = nr_pages;
+ sg_free_table(&table);
+}
+
+#else /* _KERNEL */
+
+#ifndef PAGE_SHIFT
+#define PAGE_SHIFT (highbit64(PAGESIZE)-1)
+#endif
+
+struct page;
+
+#define zfs_kmap_atomic(chunk, km) ((void *)chunk)
+#define zfs_kunmap_atomic(addr, km) do { (void)(addr); } while (0)
+#define local_irq_save(flags) do { (void)(flags); } while (0)
+#define local_irq_restore(flags) do { (void)(flags); } while (0)
+#define nth_page(pg, i) \
+ ((struct page *)((void *)(pg) + (i) * PAGESIZE))
+
+struct scatterlist {
+ struct page *page;
+ int length;
+ int end;
+};
+
+static void
+sg_init_table(struct scatterlist *sg, int nr)
+{
+ memset(sg, 0, nr * sizeof (struct scatterlist));
+ sg[nr - 1].end = 1;
+}
+
+#define for_each_sg(sgl, sg, nr, i) \
+ for ((i) = 0, (sg) = (sgl); (i) < (nr); (i)++, (sg) = sg_next(sg))
+
+static inline void
+sg_set_page(struct scatterlist *sg, struct page *page, unsigned int len,
+ unsigned int offset)
+{
+ /* currently we don't use offset */
+ ASSERT(offset == 0);
+ sg->page = page;
+ sg->length = len;
+}
+
+static inline struct page *
+sg_page(struct scatterlist *sg)
+{
+ return (sg->page);
+}
+
+static inline struct scatterlist *
+sg_next(struct scatterlist *sg)
+{
+ if (sg->end)
+ return (NULL);
+
+ return (sg + 1);
+}
+
+static void
+abd_alloc_pages(abd_t *abd, size_t size)
+{
+ unsigned nr_pages = abd_chunkcnt_for_bytes(size);
+ struct scatterlist *sg;
+ int i;
+
+ ABD_SCATTER(abd).abd_sgl = vmem_alloc(nr_pages *
+ sizeof (struct scatterlist), KM_SLEEP);
+ sg_init_table(ABD_SCATTER(abd).abd_sgl, nr_pages);
+
+ abd_for_each_sg(abd, sg, nr_pages, i) {
+ struct page *p = umem_alloc_aligned(PAGESIZE, 64, KM_SLEEP);
+ sg_set_page(sg, p, PAGESIZE, 0);
+ }
+ ABD_SCATTER(abd).abd_nents = nr_pages;
+}
+
+static void
+abd_free_pages(abd_t *abd)
+{
+ int i, n = ABD_SCATTER(abd).abd_nents;
+ struct scatterlist *sg;
+
+ abd_for_each_sg(abd, sg, n, i) {
+ for (int j = 0; j < sg->length; j += PAGESIZE) {
+ struct page *p = nth_page(sg_page(sg), j >> PAGE_SHIFT);
+ umem_free(p, PAGESIZE);
+ }
+ }
+
+ vmem_free(ABD_SCATTER(abd).abd_sgl, n * sizeof (struct scatterlist));
+}
+
+#endif /* _KERNEL */
+
+void
+abd_init(void)
+{
+ int i;
+
+ abd_cache = kmem_cache_create("abd_t", sizeof (abd_t),
+ 0, NULL, NULL, NULL, NULL, NULL, 0);
+
+ abd_ksp = kstat_create("zfs", 0, "abdstats", "misc", KSTAT_TYPE_NAMED,
+ sizeof (abd_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
+ if (abd_ksp != NULL) {
+ abd_ksp->ks_data = &abd_stats;
+ kstat_install(abd_ksp);
+
+ for (i = 0; i < MAX_ORDER; i++) {
+ snprintf(abd_stats.abdstat_scatter_orders[i].name,
+ KSTAT_STRLEN, "scatter_order_%d", i);
+ abd_stats.abdstat_scatter_orders[i].data_type =
+ KSTAT_DATA_UINT64;
+ }
+ }
+}
+
+void
+abd_fini(void)
+{
+ if (abd_ksp != NULL) {
+ kstat_delete(abd_ksp);
+ abd_ksp = NULL;
+ }
+
+ if (abd_cache) {
+ kmem_cache_destroy(abd_cache);
+ abd_cache = NULL;
+ }
+}
+
+static inline void
+abd_verify(abd_t *abd)
+{
+ ASSERT3U(abd->abd_size, >, 0);
+ ASSERT3U(abd->abd_size, <=, SPA_MAXBLOCKSIZE);
+ ASSERT3U(abd->abd_flags, ==, abd->abd_flags & (ABD_FLAG_LINEAR |
+ ABD_FLAG_OWNER | ABD_FLAG_META | ABD_FLAG_MULTI_ZONE |
+ ABD_FLAG_MULTI_CHUNK | ABD_FLAG_LINEAR_PAGE));
+ IMPLY(abd->abd_parent != NULL, !(abd->abd_flags & ABD_FLAG_OWNER));
+ IMPLY(abd->abd_flags & ABD_FLAG_META, abd->abd_flags & ABD_FLAG_OWNER);
+ if (abd_is_linear(abd)) {
+ ASSERT3P(abd->abd_u.abd_linear.abd_buf, !=, NULL);
+ } else {
+ size_t n;
+ int i = 0;
+ struct scatterlist *sg = NULL;
+
+ ASSERT3U(ABD_SCATTER(abd).abd_nents, >, 0);
+ ASSERT3U(ABD_SCATTER(abd).abd_offset, <,
+ ABD_SCATTER(abd).abd_sgl->length);
+ n = ABD_SCATTER(abd).abd_nents;
+ abd_for_each_sg(abd, sg, n, i) {
+ ASSERT3P(sg_page(sg), !=, NULL);
+ }
+ }
+}
+
+static inline abd_t *
+abd_alloc_struct(void)
+{
+ abd_t *abd = kmem_cache_alloc(abd_cache, KM_PUSHPAGE);
+
+ ASSERT3P(abd, !=, NULL);
+ ABDSTAT_INCR(abdstat_struct_size, sizeof (abd_t));
+
+ return (abd);
+}
+
+static inline void
+abd_free_struct(abd_t *abd)
+{
+ kmem_cache_free(abd_cache, abd);
+ ABDSTAT_INCR(abdstat_struct_size, -(int)sizeof (abd_t));
+}
+
+/*
+ * Allocate an ABD, along with its own underlying data buffers. Use this if you
+ * don't care whether the ABD is linear or not.
+ */
+abd_t *
+abd_alloc(size_t size, boolean_t is_metadata)
+{
+ /* see the comment above zfs_abd_scatter_min_size */
+ if (!zfs_abd_scatter_enabled || size < zfs_abd_scatter_min_size)
+ return (abd_alloc_linear(size, is_metadata));
+
+ VERIFY3U(size, <=, SPA_MAXBLOCKSIZE);
+
+ abd_t *abd = abd_alloc_struct();
+ abd->abd_flags = ABD_FLAG_OWNER;
+ abd->abd_u.abd_scatter.abd_offset = 0;
+ abd_alloc_pages(abd, size);
+
+ if (is_metadata) {
+ abd->abd_flags |= ABD_FLAG_META;
+ }
+ abd->abd_size = size;
+ abd->abd_parent = NULL;
+ zfs_refcount_create(&abd->abd_children);
+
+ ABDSTAT_BUMP(abdstat_scatter_cnt);
+ ABDSTAT_INCR(abdstat_scatter_data_size, size);
+ ABDSTAT_INCR(abdstat_scatter_chunk_waste,
+ P2ROUNDUP(size, PAGESIZE) - size);
+
+ return (abd);
+}
+
+static void
+abd_free_scatter(abd_t *abd)
+{
+ abd_free_pages(abd);
+
+ zfs_refcount_destroy(&abd->abd_children);
+ ABDSTAT_BUMPDOWN(abdstat_scatter_cnt);
+ ABDSTAT_INCR(abdstat_scatter_data_size, -(int)abd->abd_size);
+ ABDSTAT_INCR(abdstat_scatter_chunk_waste,
+ (int)abd->abd_size - (int)P2ROUNDUP(abd->abd_size, PAGESIZE));
+
+ abd_free_struct(abd);
+}
+
+/*
+ * Allocate an ABD that must be linear, along with its own underlying data
+ * buffer. Only use this when it would be very annoying to write your ABD
+ * consumer with a scattered ABD.
+ */
+abd_t *
+abd_alloc_linear(size_t size, boolean_t is_metadata)
+{
+ abd_t *abd = abd_alloc_struct();
+
+ VERIFY3U(size, <=, SPA_MAXBLOCKSIZE);
+
+ abd->abd_flags = ABD_FLAG_LINEAR | ABD_FLAG_OWNER;
+ if (is_metadata) {
+ abd->abd_flags |= ABD_FLAG_META;
+ }
+ abd->abd_size = size;
+ abd->abd_parent = NULL;
+ zfs_refcount_create(&abd->abd_children);
+
+ if (is_metadata) {
+ abd->abd_u.abd_linear.abd_buf = zio_buf_alloc(size);
+ } else {
+ abd->abd_u.abd_linear.abd_buf = zio_data_buf_alloc(size);
+ }
+
+ ABDSTAT_BUMP(abdstat_linear_cnt);
+ ABDSTAT_INCR(abdstat_linear_data_size, size);
+
+ return (abd);
+}
+
+static void
+abd_free_linear(abd_t *abd)
+{
+ if (abd_is_linear_page(abd)) {
+ /* Transform it back into a scatter ABD for freeing */
+ struct scatterlist *sg = abd->abd_u.abd_linear.abd_sgl;
+ abd->abd_flags &= ~ABD_FLAG_LINEAR;
+ abd->abd_flags &= ~ABD_FLAG_LINEAR_PAGE;
+ ABD_SCATTER(abd).abd_nents = 1;
+ ABD_SCATTER(abd).abd_offset = 0;
+ ABD_SCATTER(abd).abd_sgl = sg;
+ abd_free_scatter(abd);
+ return;
+ }
+ if (abd->abd_flags & ABD_FLAG_META) {
+ zio_buf_free(abd->abd_u.abd_linear.abd_buf, abd->abd_size);
+ } else {
+ zio_data_buf_free(abd->abd_u.abd_linear.abd_buf, abd->abd_size);
+ }
+
+ zfs_refcount_destroy(&abd->abd_children);
+ ABDSTAT_BUMPDOWN(abdstat_linear_cnt);
+ ABDSTAT_INCR(abdstat_linear_data_size, -(int)abd->abd_size);
+
+ abd_free_struct(abd);
+}
+
+/*
+ * Free an ABD. Only use this on ABDs allocated with abd_alloc() or
+ * abd_alloc_linear().
+ */
+void
+abd_free(abd_t *abd)
+{
+ abd_verify(abd);
+ ASSERT3P(abd->abd_parent, ==, NULL);
+ ASSERT(abd->abd_flags & ABD_FLAG_OWNER);
+ if (abd_is_linear(abd))
+ abd_free_linear(abd);
+ else
+ abd_free_scatter(abd);
+}
+
+/*
+ * Allocate an ABD of the same format (same metadata flag, same scatterize
+ * setting) as another ABD.
+ */
+abd_t *
+abd_alloc_sametype(abd_t *sabd, size_t size)
+{
+ boolean_t is_metadata = (sabd->abd_flags & ABD_FLAG_META) != 0;
+ if (abd_is_linear(sabd) &&
+ !abd_is_linear_page(sabd)) {
+ return (abd_alloc_linear(size, is_metadata));
+ } else {
+ return (abd_alloc(size, is_metadata));
+ }
+}
+
+/*
+ * If we're going to use this ABD for doing I/O using the block layer, the
+ * consumer of the ABD data doesn't care if it's scattered or not, and we don't
+ * plan to store this ABD in memory for a long period of time, then we should
+ * allocate the ABD type that requires the least data copying to do the I/O.
+ *
+ * On Illumos this is linear ABDs; however, if ldi_strategy() can ever issue I/Os
+ * using a scatter/gather list we should switch to that and replace this call
+ * with vanilla abd_alloc().
+ *
+ * On Linux the optimal thing to do would be to use abd_get_offset() and
+ * construct a new ABD which shares the original pages thereby eliminating
+ * the copy. But for the moment a new linear ABD is allocated until this
+ * performance optimization can be implemented.
+ */
+abd_t *
+abd_alloc_for_io(size_t size, boolean_t is_metadata)
+{
+ return (abd_alloc(size, is_metadata));
+}
+
+/*
+ * Allocate a new ABD to point to offset off of sabd. It shares the underlying
+ * buffer data with sabd. Use abd_put() to free. sabd must not be freed while
+ * any derived ABDs exist.
+ */
+static inline abd_t *
+abd_get_offset_impl(abd_t *sabd, size_t off, size_t size)
+{
+ abd_t *abd;
+
+ abd_verify(sabd);
+ ASSERT3U(off, <=, sabd->abd_size);
+
+ if (abd_is_linear(sabd)) {
+ abd = abd_alloc_struct();
+
+ /*
+ * Even if this buf is filesystem metadata, we only track that
+ * if we own the underlying data buffer, which is not true in
+ * this case. Therefore, we don't ever use ABD_FLAG_META here.
+ */
+ abd->abd_flags = ABD_FLAG_LINEAR;
+
+ abd->abd_u.abd_linear.abd_buf =
+ (char *)sabd->abd_u.abd_linear.abd_buf + off;
+ } else {
+ int i = 0;
+ struct scatterlist *sg = NULL;
+ size_t new_offset = sabd->abd_u.abd_scatter.abd_offset + off;
+
+ abd = abd_alloc_struct();
+
+ /*
+ * Even if this buf is filesystem metadata, we only track that
+ * if we own the underlying data buffer, which is not true in
+ * this case. Therefore, we don't ever use ABD_FLAG_META here.
+ */
+ abd->abd_flags = 0;
+
+ abd_for_each_sg(sabd, sg, ABD_SCATTER(sabd).abd_nents, i) {
+ if (new_offset < sg->length)
+ break;
+ new_offset -= sg->length;
+ }
+
+ ABD_SCATTER(abd).abd_sgl = sg;
+ ABD_SCATTER(abd).abd_offset = new_offset;
+ ABD_SCATTER(abd).abd_nents = ABD_SCATTER(sabd).abd_nents - i;
+ }
+
+ abd->abd_size = size;
+ abd->abd_parent = sabd;
+ zfs_refcount_create(&abd->abd_children);
+ (void) zfs_refcount_add_many(&sabd->abd_children, abd->abd_size, abd);
+
+ return (abd);
+}
+
+abd_t *
+abd_get_offset(abd_t *sabd, size_t off)
+{
+ size_t size = sabd->abd_size > off ? sabd->abd_size - off : 0;
+
+ VERIFY3U(size, >, 0);
+
+ return (abd_get_offset_impl(sabd, off, size));
+}
+
+abd_t *
+abd_get_offset_size(abd_t *sabd, size_t off, size_t size)
+{
+ ASSERT3U(off + size, <=, sabd->abd_size);
+
+ return (abd_get_offset_impl(sabd, off, size));
+}
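+
+/*
+ * A short sub-ABD sketch (illustrative): carve a 512-byte view out of a
+ * larger ABD starting at offset 4096, zero it through the normal abd_* API,
+ * then release the view with abd_put(). The data buffer is shared, so the
+ * zeroing is visible through sabd, and sabd must outlive the view.
+ *
+ *	abd_t *sub = abd_get_offset_size(sabd, 4096, 512);
+ *	abd_zero_off(sub, 0, 512);
+ *	abd_put(sub);
+ */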
+
+/*
+ * Allocate a linear ABD structure for buf. You must free this with abd_put()
+ * since the resulting ABD doesn't own its own buffer.
+ */
+abd_t *
+abd_get_from_buf(void *buf, size_t size)
+{
+ abd_t *abd = abd_alloc_struct();
+
+ VERIFY3U(size, <=, SPA_MAXBLOCKSIZE);
+
+ /*
+ * Even if this buf is filesystem metadata, we only track that if we
+ * own the underlying data buffer, which is not true in this case.
+ * Therefore, we don't ever use ABD_FLAG_META here.
+ */
+ abd->abd_flags = ABD_FLAG_LINEAR;
+ abd->abd_size = size;
+ abd->abd_parent = NULL;
+ zfs_refcount_create(&abd->abd_children);
+
+ abd->abd_u.abd_linear.abd_buf = buf;
+
+ return (abd);
+}
+
+/*
+ * Free an ABD allocated from abd_get_offset() or abd_get_from_buf(). Will not
+ * free the underlying scatterlist or buffer.
+ */
+void
+abd_put(abd_t *abd)
+{
+ abd_verify(abd);
+ ASSERT(!(abd->abd_flags & ABD_FLAG_OWNER));
+
+ if (abd->abd_parent != NULL) {
+ (void) zfs_refcount_remove_many(&abd->abd_parent->abd_children,
+ abd->abd_size, abd);
+ }
+
+ zfs_refcount_destroy(&abd->abd_children);
+ abd_free_struct(abd);
+}
+
+/*
+ * Get the raw buffer associated with a linear ABD.
+ */
+void *
+abd_to_buf(abd_t *abd)
+{
+ ASSERT(abd_is_linear(abd));
+ abd_verify(abd);
+ return (abd->abd_u.abd_linear.abd_buf);
+}
+
+/*
+ * Borrow a raw buffer from an ABD without copying the contents of the ABD
+ * into the buffer. If the ABD is scattered, this will allocate a raw buffer
+ * whose contents are undefined. To copy over the existing data in the ABD, use
+ * abd_borrow_buf_copy() instead.
+ */
+void *
+abd_borrow_buf(abd_t *abd, size_t n)
+{
+ void *buf;
+ abd_verify(abd);
+ ASSERT3U(abd->abd_size, >=, n);
+ if (abd_is_linear(abd)) {
+ buf = abd_to_buf(abd);
+ } else {
+ buf = zio_buf_alloc(n);
+ }
+ (void) zfs_refcount_add_many(&abd->abd_children, n, buf);
+
+ return (buf);
+}
+
+void *
+abd_borrow_buf_copy(abd_t *abd, size_t n)
+{
+ void *buf = abd_borrow_buf(abd, n);
+ if (!abd_is_linear(abd)) {
+ abd_copy_to_buf(buf, abd, n);
+ }
+ return (buf);
+}
+
+/*
+ * Return a borrowed raw buffer to an ABD. If the ABD is scattered, this will
+ * not change the contents of the ABD and will ASSERT that you didn't modify
+ * the buffer since it was borrowed. If you want any changes you made to buf to
+ * be copied back to abd, use abd_return_buf_copy() instead.
+ */
+void
+abd_return_buf(abd_t *abd, void *buf, size_t n)
+{
+ abd_verify(abd);
+ ASSERT3U(abd->abd_size, >=, n);
+ if (abd_is_linear(abd)) {
+ ASSERT3P(buf, ==, abd_to_buf(abd));
+ } else {
+ ASSERT0(abd_cmp_buf(abd, buf, n));
+ zio_buf_free(buf, n);
+ }
+ (void) zfs_refcount_remove_many(&abd->abd_children, n, buf);
+}
+
+void
+abd_return_buf_copy(abd_t *abd, void *buf, size_t n)
+{
+ if (!abd_is_linear(abd)) {
+ abd_copy_from_buf(abd, buf, n);
+ }
+ abd_return_buf(abd, buf, n);
+}
+
+/*
+ * Give this ABD ownership of the buffer that it's storing. Can only be used on
+ * linear ABDs which were allocated via abd_get_from_buf(), or ones allocated
+ * with abd_alloc_linear() which subsequently released ownership of their buf
+ * with abd_release_ownership_of_buf().
+ */
+void
+abd_take_ownership_of_buf(abd_t *abd, boolean_t is_metadata)
+{
+ ASSERT(abd_is_linear(abd));
+ ASSERT(!(abd->abd_flags & ABD_FLAG_OWNER));
+ abd_verify(abd);
+
+ abd->abd_flags |= ABD_FLAG_OWNER;
+ if (is_metadata) {
+ abd->abd_flags |= ABD_FLAG_META;
+ }
+
+ ABDSTAT_BUMP(abdstat_linear_cnt);
+ ABDSTAT_INCR(abdstat_linear_data_size, abd->abd_size);
+}
+
+void
+abd_release_ownership_of_buf(abd_t *abd)
+{
+ ASSERT(abd_is_linear(abd));
+ ASSERT(abd->abd_flags & ABD_FLAG_OWNER);
+
+ /*
+ * abd_free() needs to handle LINEAR_PAGE ABD's specially.
+ * Since that flag does not survive the
+ * abd_release_ownership_of_buf() -> abd_get_from_buf() ->
+ * abd_take_ownership_of_buf() sequence, we don't allow releasing
+ * these "linear but not zio_[data_]buf_alloc()'ed" ABD's.
+ */
+ ASSERT(!abd_is_linear_page(abd));
+
+ abd_verify(abd);
+
+ abd->abd_flags &= ~ABD_FLAG_OWNER;
+ /* Disable this flag since we no longer own the data buffer */
+ abd->abd_flags &= ~ABD_FLAG_META;
+
+ ABDSTAT_BUMPDOWN(abdstat_linear_cnt);
+ ABDSTAT_INCR(abdstat_linear_data_size, -(int)abd->abd_size);
+}
+
+#ifndef HAVE_1ARG_KMAP_ATOMIC
+#define NR_KM_TYPE (6)
+#ifdef _KERNEL
+int km_table[NR_KM_TYPE] = {
+ KM_USER0,
+ KM_USER1,
+ KM_BIO_SRC_IRQ,
+ KM_BIO_DST_IRQ,
+ KM_PTE0,
+ KM_PTE1,
+};
+#endif
+#endif
+
+struct abd_iter {
+ /* public interface */
+ void *iter_mapaddr; /* addr corresponding to iter_pos */
+ size_t iter_mapsize; /* length of data valid at mapaddr */
+
+ /* private */
+ abd_t *iter_abd; /* ABD being iterated through */
+ size_t iter_pos;
+ size_t iter_offset; /* offset in current sg/abd_buf, */
+ /* abd_offset included */
+ struct scatterlist *iter_sg; /* current sg */
+#ifndef HAVE_1ARG_KMAP_ATOMIC
+ int iter_km; /* KM_* for kmap_atomic */
+#endif
+};
+
+/*
+ * Initialize the abd_iter.
+ */
+static void
+abd_iter_init(struct abd_iter *aiter, abd_t *abd, int km_type)
+{
+ abd_verify(abd);
+ aiter->iter_abd = abd;
+ aiter->iter_mapaddr = NULL;
+ aiter->iter_mapsize = 0;
+ aiter->iter_pos = 0;
+ if (abd_is_linear(abd)) {
+ aiter->iter_offset = 0;
+ aiter->iter_sg = NULL;
+ } else {
+ aiter->iter_offset = ABD_SCATTER(abd).abd_offset;
+ aiter->iter_sg = ABD_SCATTER(abd).abd_sgl;
+ }
+#ifndef HAVE_1ARG_KMAP_ATOMIC
+ ASSERT3U(km_type, <, NR_KM_TYPE);
+ aiter->iter_km = km_type;
+#endif
+}
+
+/*
+ * Advance the iterator by a certain amount. Cannot be called when a chunk is
+ * in use. This can be safely called when the aiter has already been
+ * exhausted, in which case this does nothing.
+ */
+static void
+abd_iter_advance(struct abd_iter *aiter, size_t amount)
+{
+ ASSERT3P(aiter->iter_mapaddr, ==, NULL);
+ ASSERT0(aiter->iter_mapsize);
+
+ /* There's nothing left to advance to, so do nothing */
+ if (aiter->iter_pos == aiter->iter_abd->abd_size)
+ return;
+
+ aiter->iter_pos += amount;
+ aiter->iter_offset += amount;
+ if (!abd_is_linear(aiter->iter_abd)) {
+ while (aiter->iter_offset >= aiter->iter_sg->length) {
+ aiter->iter_offset -= aiter->iter_sg->length;
+ aiter->iter_sg = sg_next(aiter->iter_sg);
+ if (aiter->iter_sg == NULL) {
+ ASSERT0(aiter->iter_offset);
+ break;
+ }
+ }
+ }
+}
+
+/*
+ * Map the current chunk into aiter. This can be safely called when the aiter
+ * has already been exhausted, in which case this does nothing.
+ */
+static void
+abd_iter_map(struct abd_iter *aiter)
+{
+ void *paddr;
+ size_t offset = 0;
+
+ ASSERT3P(aiter->iter_mapaddr, ==, NULL);
+ ASSERT0(aiter->iter_mapsize);
+
+ /* There's nothing left to iterate over, so do nothing */
+ if (aiter->iter_pos == aiter->iter_abd->abd_size)
+ return;
+
+ if (abd_is_linear(aiter->iter_abd)) {
+ ASSERT3U(aiter->iter_pos, ==, aiter->iter_offset);
+ offset = aiter->iter_offset;
+ aiter->iter_mapsize = aiter->iter_abd->abd_size - offset;
+ paddr = aiter->iter_abd->abd_u.abd_linear.abd_buf;
+ } else {
+ offset = aiter->iter_offset;
+ aiter->iter_mapsize = MIN(aiter->iter_sg->length - offset,
+ aiter->iter_abd->abd_size - aiter->iter_pos);
+
+ paddr = zfs_kmap_atomic(sg_page(aiter->iter_sg),
+ km_table[aiter->iter_km]);
+ }
+
+ aiter->iter_mapaddr = (char *)paddr + offset;
+}
+
+/*
+ * Unmap the current chunk from aiter. This can be safely called when the aiter
+ * has already been exhausted, in which case this does nothing.
+ */
+static void
+abd_iter_unmap(struct abd_iter *aiter)
+{
+ /* There's nothing left to unmap, so do nothing */
+ if (aiter->iter_pos == aiter->iter_abd->abd_size)
+ return;
+
+ if (!abd_is_linear(aiter->iter_abd)) {
+ /* LINTED E_FUNC_SET_NOT_USED */
+ zfs_kunmap_atomic(aiter->iter_mapaddr - aiter->iter_offset,
+ km_table[aiter->iter_km]);
+ }
+
+ ASSERT3P(aiter->iter_mapaddr, !=, NULL);
+ ASSERT3U(aiter->iter_mapsize, >, 0);
+
+ aiter->iter_mapaddr = NULL;
+ aiter->iter_mapsize = 0;
+}
+
+int
+abd_iterate_func(abd_t *abd, size_t off, size_t size,
+ abd_iter_func_t *func, void *private)
+{
+ int ret = 0;
+ struct abd_iter aiter;
+
+ abd_verify(abd);
+ ASSERT3U(off + size, <=, abd->abd_size);
+
+ abd_iter_init(&aiter, abd, 0);
+ abd_iter_advance(&aiter, off);
+
+ while (size > 0) {
+ abd_iter_map(&aiter);
+
+ size_t len = MIN(aiter.iter_mapsize, size);
+ ASSERT3U(len, >, 0);
+
+ ret = func(aiter.iter_mapaddr, len, private);
+
+ abd_iter_unmap(&aiter);
+
+ if (ret != 0)
+ break;
+
+ size -= len;
+ abd_iter_advance(&aiter, len);
+ }
+
+ return (ret);
+}
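+
+/*
+ * Example consumer (illustrative sketch; count_nonzero_cb() is
+ * hypothetical): count the non-zero bytes of an ABD with
+ * abd_iterate_func(). A non-zero return from the callback stops the
+ * iteration early, as implemented above.
+ *
+ *	static int
+ *	count_nonzero_cb(void *buf, size_t size, void *private)
+ *	{
+ *		uint64_t *count = private;
+ *		for (size_t i = 0; i < size; i++)
+ *			if (((const char *)buf)[i] != 0)
+ *				(*count)++;
+ *		return (0);
+ *	}
+ *
+ *	uint64_t count = 0;
+ *	(void) abd_iterate_func(abd, 0, abd->abd_size,
+ *	    count_nonzero_cb, &count);
+ */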
+
+struct buf_arg {
+ void *arg_buf;
+};
+
+static int
+abd_copy_to_buf_off_cb(void *buf, size_t size, void *private)
+{
+ struct buf_arg *ba_ptr = private;
+
+ (void) memcpy(ba_ptr->arg_buf, buf, size);
+ ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size;
+
+ return (0);
+}
+
+/*
+ * Copy abd to buf. (off is the offset in abd.)
+ */
+void
+abd_copy_to_buf_off(void *buf, abd_t *abd, size_t off, size_t size)
+{
+ struct buf_arg ba_ptr = { buf };
+
+ (void) abd_iterate_func(abd, off, size, abd_copy_to_buf_off_cb,
+ &ba_ptr);
+}
+
+static int
+abd_cmp_buf_off_cb(void *buf, size_t size, void *private)
+{
+ int ret;
+ struct buf_arg *ba_ptr = private;
+
+ ret = memcmp(buf, ba_ptr->arg_buf, size);
+ ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size;
+
+ return (ret);
+}
+
+/*
+ * Compare the contents of abd to buf. (off is the offset in abd.)
+ */
+int
+abd_cmp_buf_off(abd_t *abd, const void *buf, size_t off, size_t size)
+{
+ struct buf_arg ba_ptr = { (void *) buf };
+
+ return (abd_iterate_func(abd, off, size, abd_cmp_buf_off_cb, &ba_ptr));
+}
+
+static int
+abd_copy_from_buf_off_cb(void *buf, size_t size, void *private)
+{
+ struct buf_arg *ba_ptr = private;
+
+ (void) memcpy(buf, ba_ptr->arg_buf, size);
+ ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size;
+
+ return (0);
+}
+
+/*
+ * Copy from buf to abd. (off is the offset in abd.)
+ */
+void
+abd_copy_from_buf_off(abd_t *abd, const void *buf, size_t off, size_t size)
+{
+ struct buf_arg ba_ptr = { (void *) buf };
+
+ (void) abd_iterate_func(abd, off, size, abd_copy_from_buf_off_cb,
+ &ba_ptr);
+}
+
+/*ARGSUSED*/
+static int
+abd_zero_off_cb(void *buf, size_t size, void *private)
+{
+ (void) memset(buf, 0, size);
+ return (0);
+}
+
+/*
+ * Zero out the abd from a particular offset to the end.
+ */
+void
+abd_zero_off(abd_t *abd, size_t off, size_t size)
+{
+ (void) abd_iterate_func(abd, off, size, abd_zero_off_cb, NULL);
+}
+
+/*
+ * Iterate over two ABDs and call func incrementally on the two ABDs' data in
+ * equal-sized chunks (passed to func as raw buffers). func could be called many
+ * times during this iteration.
+ */
+int
+abd_iterate_func2(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff,
+ size_t size, abd_iter_func2_t *func, void *private)
+{
+ int ret = 0;
+ struct abd_iter daiter, saiter;
+
+ abd_verify(dabd);
+ abd_verify(sabd);
+
+ ASSERT3U(doff + size, <=, dabd->abd_size);
+ ASSERT3U(soff + size, <=, sabd->abd_size);
+
+ abd_iter_init(&daiter, dabd, 0);
+ abd_iter_init(&saiter, sabd, 1);
+ abd_iter_advance(&daiter, doff);
+ abd_iter_advance(&saiter, soff);
+
+ while (size > 0) {
+ abd_iter_map(&daiter);
+ abd_iter_map(&saiter);
+
+ size_t dlen = MIN(daiter.iter_mapsize, size);
+ size_t slen = MIN(saiter.iter_mapsize, size);
+ size_t len = MIN(dlen, slen);
+ ASSERT(dlen > 0 || slen > 0);
+
+ ret = func(daiter.iter_mapaddr, saiter.iter_mapaddr, len,
+ private);
+
+ abd_iter_unmap(&saiter);
+ abd_iter_unmap(&daiter);
+
+ if (ret != 0)
+ break;
+
+ size -= len;
+ abd_iter_advance(&daiter, len);
+ abd_iter_advance(&saiter, len);
+ }
+
+ return (ret);
+}
+
+/*ARGSUSED*/
+static int
+abd_copy_off_cb(void *dbuf, void *sbuf, size_t size, void *private)
+{
+ (void) memcpy(dbuf, sbuf, size);
+ return (0);
+}
+
+/*
+ * Copy from sabd to dabd starting from soff and doff.
+ */
+void
+abd_copy_off(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff, size_t size)
+{
+ (void) abd_iterate_func2(dabd, sabd, doff, soff, size,
+ abd_copy_off_cb, NULL);
+}
+
+/*ARGSUSED*/
+static int
+abd_cmp_cb(void *bufa, void *bufb, size_t size, void *private)
+{
+ return (memcmp(bufa, bufb, size));
+}
+
+/*
+ * Compares the contents of two ABDs.
+ */
+int
+abd_cmp(abd_t *dabd, abd_t *sabd)
+{
+ ASSERT3U(dabd->abd_size, ==, sabd->abd_size);
+ return (abd_iterate_func2(dabd, sabd, 0, 0, dabd->abd_size,
+ abd_cmp_cb, NULL));
+}
+
+/*
+ * Iterate over code ABDs and a data ABD and call @func_raidz_gen.
+ *
+ * @cabds parity ABDs, must have equal size
+ * @dabd data ABD. Can be NULL (in this case @dsize = 0)
+ * @func_raidz_gen	should be implemented so that it behaves the same
+ *			whether it is given linear or scatter buffers
+ */
+void
+abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd,
+ ssize_t csize, ssize_t dsize, const unsigned parity,
+ void (*func_raidz_gen)(void **, const void *, size_t, size_t))
+{
+ int i;
+ ssize_t len, dlen;
+ struct abd_iter caiters[3];
+ struct abd_iter daiter = {0};
+ void *caddrs[3];
+ unsigned long flags;
+
+ ASSERT3U(parity, <=, 3);
+
+ for (i = 0; i < parity; i++)
+ abd_iter_init(&caiters[i], cabds[i], i);
+
+ if (dabd)
+ abd_iter_init(&daiter, dabd, i);
+
+ ASSERT3S(dsize, >=, 0);
+
+ local_irq_save(flags);
+ while (csize > 0) {
+ len = csize;
+
+ if (dabd && dsize > 0)
+ abd_iter_map(&daiter);
+
+ for (i = 0; i < parity; i++) {
+ abd_iter_map(&caiters[i]);
+ caddrs[i] = caiters[i].iter_mapaddr;
+ }
+
+ switch (parity) {
+ case 3:
+ len = MIN(caiters[2].iter_mapsize, len);
+ /* falls through */
+ case 2:
+ len = MIN(caiters[1].iter_mapsize, len);
+ /* falls through */
+ case 1:
+ len = MIN(caiters[0].iter_mapsize, len);
+ }
+
+ /* must be progressive */
+ ASSERT3S(len, >, 0);
+
+ if (dabd && dsize > 0) {
+ /* this needs precise iter.length */
+ len = MIN(daiter.iter_mapsize, len);
+ dlen = len;
+ } else
+ dlen = 0;
+
+ /* must be progressive */
+ ASSERT3S(len, >, 0);
+ /*
+ * The iterated function likely will not do well if each
+ * segment except the last one is not a multiple of 512 (raidz).
+ */
+ ASSERT3U(((uint64_t)len & 511ULL), ==, 0);
+
+ func_raidz_gen(caddrs, daiter.iter_mapaddr, len, dlen);
+
+ for (i = parity-1; i >= 0; i--) {
+ abd_iter_unmap(&caiters[i]);
+ abd_iter_advance(&caiters[i], len);
+ }
+
+ if (dabd && dsize > 0) {
+ abd_iter_unmap(&daiter);
+ abd_iter_advance(&daiter, dlen);
+ dsize -= dlen;
+ }
+
+ csize -= len;
+
+ ASSERT3S(dsize, >=, 0);
+ ASSERT3S(csize, >=, 0);
+ }
+ local_irq_restore(flags);
+}
+
+/*
+ * Iterate over code ABDs and data reconstruction target ABDs and call
+ * @func_raidz_rec. Function maps at most 6 pages atomically.
+ *
+ * @cabds parity ABDs, must have equal size
+ * @tabds rec target ABDs, at most 3
+ * @tsize size of data target columns
+ * @func_raidz_rec expects syndrome data in target columns. Function
+ * reconstructs data and overwrites target columns.
+ */
+void
+abd_raidz_rec_iterate(abd_t **cabds, abd_t **tabds,
+ ssize_t tsize, const unsigned parity,
+ void (*func_raidz_rec)(void **t, const size_t tsize, void **c,
+ const unsigned *mul),
+ const unsigned *mul)
+{
+ int i;
+ ssize_t len;
+ struct abd_iter citers[3];
+ struct abd_iter xiters[3];
+ void *caddrs[3], *xaddrs[3];
+ unsigned long flags;
+
+ ASSERT3U(parity, <=, 3);
+
+ for (i = 0; i < parity; i++) {
+ abd_iter_init(&citers[i], cabds[i], 2*i);
+ abd_iter_init(&xiters[i], tabds[i], 2*i+1);
+ }
+
+ local_irq_save(flags);
+ while (tsize > 0) {
+
+ for (i = 0; i < parity; i++) {
+ abd_iter_map(&citers[i]);
+ abd_iter_map(&xiters[i]);
+ caddrs[i] = citers[i].iter_mapaddr;
+ xaddrs[i] = xiters[i].iter_mapaddr;
+ }
+
+ len = tsize;
+ switch (parity) {
+ case 3:
+ len = MIN(xiters[2].iter_mapsize, len);
+ len = MIN(citers[2].iter_mapsize, len);
+ /* falls through */
+ case 2:
+ len = MIN(xiters[1].iter_mapsize, len);
+ len = MIN(citers[1].iter_mapsize, len);
+ /* falls through */
+ case 1:
+ len = MIN(xiters[0].iter_mapsize, len);
+ len = MIN(citers[0].iter_mapsize, len);
+ }
+ /* must be progressive */
+ ASSERT3S(len, >, 0);
+ /*
+ * The iterated function likely will not do well if each
+ * segment except the last one is not a multiple of 512 (raidz).
+ */
+ ASSERT3U(((uint64_t)len & 511ULL), ==, 0);
+
+ func_raidz_rec(xaddrs, len, caddrs, mul);
+
+ for (i = parity-1; i >= 0; i--) {
+ abd_iter_unmap(&xiters[i]);
+ abd_iter_unmap(&citers[i]);
+ abd_iter_advance(&xiters[i], len);
+ abd_iter_advance(&citers[i], len);
+ }
+
+ tsize -= len;
+ ASSERT3S(tsize, >=, 0);
+ }
+ local_irq_restore(flags);
+}
+
+#if defined(_KERNEL)
+/*
+ * bio_nr_pages for ABD.
+ * @off is the offset in @abd
+ */
+unsigned long
+abd_nr_pages_off(abd_t *abd, unsigned int size, size_t off)
+{
+ unsigned long pos;
+
+ if (abd_is_linear(abd))
+ pos = (unsigned long)abd_to_buf(abd) + off;
+ else
+ pos = abd->abd_u.abd_scatter.abd_offset + off;
+
+ return ((pos + size + PAGESIZE - 1) >> PAGE_SHIFT) -
+ (pos >> PAGE_SHIFT);
+}
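+
+/*
+ * Worked example (assuming 4K pages and abd_offset == 0):
+ * abd_nr_pages_off(abd, 6000, 1000) computes pos == 1000, so the span
+ * [1000, 7000) touches two pages and the function returns 2.
+ */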
+
+/*
+ * bio_map for scatter ABD.
+ * @off is the offset in @abd
+ * Remaining IO size is returned
+ */
+unsigned int
+abd_scatter_bio_map_off(struct bio *bio, abd_t *abd,
+ unsigned int io_size, size_t off)
+{
+ int i;
+ struct abd_iter aiter;
+
+ ASSERT(!abd_is_linear(abd));
+ ASSERT3U(io_size, <=, abd->abd_size - off);
+
+ abd_iter_init(&aiter, abd, 0);
+ abd_iter_advance(&aiter, off);
+
+ for (i = 0; i < bio->bi_max_vecs; i++) {
+ struct page *pg;
+ size_t len, sgoff, pgoff;
+ struct scatterlist *sg;
+
+ if (io_size <= 0)
+ break;
+
+ sg = aiter.iter_sg;
+ sgoff = aiter.iter_offset;
+ pgoff = sgoff & (PAGESIZE - 1);
+ len = MIN(io_size, PAGESIZE - pgoff);
+ ASSERT(len > 0);
+
+ pg = nth_page(sg_page(sg), sgoff >> PAGE_SHIFT);
+ if (bio_add_page(bio, pg, len, pgoff) != len)
+ break;
+
+ io_size -= len;
+ abd_iter_advance(&aiter, len);
+ }
+
+ return (io_size);
+}
+
+/* Tunable Parameters */
+module_param(zfs_abd_scatter_enabled, int, 0644);
+MODULE_PARM_DESC(zfs_abd_scatter_enabled,
+ "Toggle whether ABD allocations must be linear.");
+module_param(zfs_abd_scatter_min_size, int, 0644);
+MODULE_PARM_DESC(zfs_abd_scatter_min_size,
+ "Minimum size of scatter allocations.");
+/* CSTYLED */
+module_param(zfs_abd_scatter_max_order, uint, 0644);
+MODULE_PARM_DESC(zfs_abd_scatter_max_order,
+ "Maximum order allocation used for a scatter ABD.");
+#endif
diff --git a/module/os/linux/zfs/policy.c b/module/os/linux/zfs/policy.c
new file mode 100644
index 000000000..7f9456a67
--- /dev/null
+++ b/module/os/linux/zfs/policy.c
@@ -0,0 +1,355 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2013, Joyent, Inc. All rights reserved.
+ * Copyright (C) 2016 Lawrence Livermore National Security, LLC.
+ *
+ * For Linux the vast majority of this enforcement is already handled via
+ * the standard Linux VFS permission checks. However certain administrative
+ * commands which bypass the standard mechanisms may need to make use of
+ * this functionality.
+ */
+
+#include <sys/policy.h>
+#include <linux/security.h>
+#include <linux/vfs_compat.h>
+
+/*
+ * The passed credentials cannot be directly verified because Linux only
+ * provides an interface to check the *current* process credentials. To
+ * handle this, the capable() test is only run when the passed
+ * credentials match the current process credentials or the kcred. In
+ * all other cases this function must fail and return the passed err.
+ */
+static int
+priv_policy_ns(const cred_t *cr, int capability, boolean_t all, int err,
+ struct user_namespace *ns)
+{
+ ASSERT3S(all, ==, B_FALSE);
+
+ if (cr != CRED() && (cr != kcred))
+ return (err);
+
+#if defined(CONFIG_USER_NS) && defined(HAVE_NS_CAPABLE)
+ if (!(ns ? ns_capable(ns, capability) : capable(capability)))
+#else
+ if (!capable(capability))
+#endif
+ return (err);
+
+ return (0);
+}
+
+static int
+priv_policy(const cred_t *cr, int capability, boolean_t all, int err)
+{
+ return (priv_policy_ns(cr, capability, all, err, NULL));
+}
+
+static int
+priv_policy_user(const cred_t *cr, int capability, boolean_t all, int err)
+{
+ /*
+ * All priv_policy_user checks are preceded by kuid/kgid_has_mapping()
+ * checks. If we cannot do them, we shouldn't be using ns_capable()
+ * since we don't know whether the affected files are valid in our
+ * namespace. Note that kuid_has_mapping() came after cred->user_ns, so
+ * we shouldn't need to re-check for HAVE_CRED_USER_NS.
+ */
+#if defined(CONFIG_USER_NS) && defined(HAVE_KUID_HAS_MAPPING)
+ return (priv_policy_ns(cr, capability, all, err, cr->user_ns));
+#else
+ return (priv_policy_ns(cr, capability, all, err, NULL));
+#endif
+}
+
+/*
+ * Checks for operations that are either client-only or are used by
+ * both clients and servers.
+ */
+int
+secpolicy_nfs(const cred_t *cr)
+{
+ return (priv_policy(cr, CAP_SYS_ADMIN, B_FALSE, EPERM));
+}
+
+/*
+ * Catch all system configuration.
+ */
+int
+secpolicy_sys_config(const cred_t *cr, boolean_t checkonly)
+{
+ return (priv_policy(cr, CAP_SYS_ADMIN, B_FALSE, EPERM));
+}
+
+/*
+ * Like secpolicy_vnode_access() but we get the actual wanted mode and the
+ * current mode of the file, not the missing bits.
+ *
+ * Enforced in the Linux VFS.
+ */
+int
+secpolicy_vnode_access2(const cred_t *cr, struct inode *ip, uid_t owner,
+ mode_t curmode, mode_t wantmode)
+{
+ return (0);
+}
+
+/*
+ * This is a special routine for ZFS; it is used to determine whether
+ * any of the privileges in effect allow any form of access to the
+ * file. There's no reason to audit this or any reason to record
+ * this. More work is needed to do the "KPLD" stuff.
+ */
+int
+secpolicy_vnode_any_access(const cred_t *cr, struct inode *ip, uid_t owner)
+{
+ if (crgetfsuid(cr) == owner)
+ return (0);
+
+ if (zpl_inode_owner_or_capable(ip))
+ return (0);
+
+#if defined(CONFIG_USER_NS) && defined(HAVE_KUID_HAS_MAPPING)
+ if (!kuid_has_mapping(cr->user_ns, SUID_TO_KUID(owner)))
+ return (EPERM);
+#endif
+
+ if (priv_policy_user(cr, CAP_DAC_OVERRIDE, B_FALSE, EPERM) == 0)
+ return (0);
+
+ if (priv_policy_user(cr, CAP_DAC_READ_SEARCH, B_FALSE, EPERM) == 0)
+ return (0);
+
+ return (EPERM);
+}
+
+/*
+ * Determine if subject can chown owner of a file.
+ */
+int
+secpolicy_vnode_chown(const cred_t *cr, uid_t owner)
+{
+ if (crgetfsuid(cr) == owner)
+ return (0);
+
+#if defined(CONFIG_USER_NS) && defined(HAVE_KUID_HAS_MAPPING)
+ if (!kuid_has_mapping(cr->user_ns, SUID_TO_KUID(owner)))
+ return (EPERM);
+#endif
+
+ return (priv_policy_user(cr, CAP_FOWNER, B_FALSE, EPERM));
+}
+
+/*
+ * Determine if subject can change group ownership of a file.
+ */
+int
+secpolicy_vnode_create_gid(const cred_t *cr)
+{
+ return (priv_policy(cr, CAP_SETGID, B_FALSE, EPERM));
+}
+
+/*
+ * Policy determines whether we can remove an entry from a directory,
+ * regardless of permission bits.
+ */
+int
+secpolicy_vnode_remove(const cred_t *cr)
+{
+ return (priv_policy(cr, CAP_FOWNER, B_FALSE, EPERM));
+}
+
+/*
+ * Determine that subject can modify the mode of a file. allzone privilege
+ * needed when modifying root owned object.
+ */
+int
+secpolicy_vnode_setdac(const cred_t *cr, uid_t owner)
+{
+ if (crgetfsuid(cr) == owner)
+ return (0);
+
+#if defined(CONFIG_USER_NS) && defined(HAVE_KUID_HAS_MAPPING)
+ if (!kuid_has_mapping(cr->user_ns, SUID_TO_KUID(owner)))
+ return (EPERM);
+#endif
+
+ return (priv_policy_user(cr, CAP_FOWNER, B_FALSE, EPERM));
+}
+
+/*
+ * Are we allowed to retain the set-uid/set-gid bits when
+ * changing ownership or when writing to a file?
+ * "issuid" should be true when set-uid; only in that case
+ * root ownership is checked (setgid is assumed).
+ *
+ * Enforced in the Linux VFS.
+ */
+int
+secpolicy_vnode_setid_retain(const cred_t *cr, boolean_t issuidroot)
+{
+ return (priv_policy_user(cr, CAP_FSETID, B_FALSE, EPERM));
+}
+
+/*
+ * Determine that subject can set the file setgid flag.
+ */
+int
+secpolicy_vnode_setids_setgids(const cred_t *cr, gid_t gid)
+{
+#if defined(CONFIG_USER_NS) && defined(HAVE_KUID_HAS_MAPPING)
+ if (!kgid_has_mapping(cr->user_ns, SGID_TO_KGID(gid)))
+ return (EPERM);
+#endif
+ if (crgetfsgid(cr) != gid && !groupmember(gid, cr))
+ return (priv_policy_user(cr, CAP_FSETID, B_FALSE, EPERM));
+
+ return (0);
+}
+
+/*
+ * Determine if the subject can inject faults in the ZFS fault injection
+ * framework. Requires all privileges.
+ */
+int
+secpolicy_zinject(const cred_t *cr)
+{
+ return (priv_policy(cr, CAP_SYS_ADMIN, B_FALSE, EACCES));
+}
+
+/*
+ * Determine if the subject has permission to manipulate ZFS datasets
+ * (not pools). Equivalent to the SYS_MOUNT privilege.
+ */
+int
+secpolicy_zfs(const cred_t *cr)
+{
+ return (priv_policy(cr, CAP_SYS_ADMIN, B_FALSE, EACCES));
+}
+
+void
+secpolicy_setid_clear(vattr_t *vap, cred_t *cr)
+{
+ if ((vap->va_mode & (S_ISUID | S_ISGID)) != 0 &&
+ secpolicy_vnode_setid_retain(cr,
+ (vap->va_mode & S_ISUID) != 0 &&
+ (vap->va_mask & AT_UID) != 0 && vap->va_uid == 0) != 0) {
+ vap->va_mask |= AT_MODE;
+ vap->va_mode &= ~(S_ISUID|S_ISGID);
+ }
+}
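+
+/*
+ * For example (illustrative reading of the logic above): when a process
+ * without CAP_FSETID writes to a set-uid file,
+ * secpolicy_vnode_setid_retain() fails, so AT_MODE is added to va_mask and
+ * the S_ISUID/S_ISGID bits are stripped from va_mode, matching the usual
+ * POSIX clear-on-write behavior.
+ */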
+
+/*
+ * Determine that subject can set the file setid flags.
+ */
+static int
+secpolicy_vnode_setid_modify(const cred_t *cr, uid_t owner)
+{
+ if (crgetfsuid(cr) == owner)
+ return (0);
+
+#if defined(CONFIG_USER_NS) && defined(HAVE_KUID_HAS_MAPPING)
+ if (!kuid_has_mapping(cr->user_ns, SUID_TO_KUID(owner)))
+ return (EPERM);
+#endif
+
+ return (priv_policy_user(cr, CAP_FSETID, B_FALSE, EPERM));
+}
+
+/*
+ * Determine that subject can make a file "sticky".
+ *
+ * Enforced in the Linux VFS.
+ */
+static int
+secpolicy_vnode_stky_modify(const cred_t *cr)
+{
+ return (0);
+}
+
+int
+secpolicy_setid_setsticky_clear(struct inode *ip, vattr_t *vap,
+ const vattr_t *ovap, cred_t *cr)
+{
+ int error;
+
+ if ((vap->va_mode & S_ISUID) != 0 &&
+ (error = secpolicy_vnode_setid_modify(cr,
+ ovap->va_uid)) != 0) {
+ return (error);
+ }
+
+ /*
+ * Check privilege if attempting to set the
+ * sticky bit on a non-directory.
+ */
+ if (!S_ISDIR(ip->i_mode) && (vap->va_mode & S_ISVTX) != 0 &&
+ secpolicy_vnode_stky_modify(cr) != 0) {
+ vap->va_mode &= ~S_ISVTX;
+ }
+
+ /*
+ * Check for privilege if attempting to set the
+ * group-id bit.
+ */
+ if ((vap->va_mode & S_ISGID) != 0 &&
+ secpolicy_vnode_setids_setgids(cr, ovap->va_gid) != 0) {
+ vap->va_mode &= ~S_ISGID;
+ }
+
+ return (0);
+}
+
+/*
+ * Check privileges for setting xvattr attributes
+ */
+int
+secpolicy_xvattr(xvattr_t *xvap, uid_t owner, cred_t *cr, vtype_t vtype)
+{
+ return (secpolicy_vnode_chown(cr, owner));
+}
+
+/*
+ * Check privileges for setattr attributes.
+ *
+ * Enforced in the Linux VFS.
+ */
+int
+secpolicy_vnode_setattr(cred_t *cr, struct inode *ip, struct vattr *vap,
+ const struct vattr *ovap, int flags,
+ int unlocked_access(void *, int, cred_t *), void *node)
+{
+ return (0);
+}
+
+/*
+ * Check privileges for links.
+ *
+ * Enforced in the Linux VFS.
+ */
+int
+secpolicy_basic_link(const cred_t *cr)
+{
+ return (0);
+}
diff --git a/module/os/linux/zfs/qat.c b/module/os/linux/zfs/qat.c
new file mode 100644
index 000000000..a6f024cb4
--- /dev/null
+++ b/module/os/linux/zfs/qat.c
@@ -0,0 +1,105 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+#if defined(_KERNEL) && defined(HAVE_QAT)
+#include <sys/zfs_context.h>
+#include "qat.h"
+
+qat_stats_t qat_stats = {
+ { "comp_requests", KSTAT_DATA_UINT64 },
+ { "comp_total_in_bytes", KSTAT_DATA_UINT64 },
+ { "comp_total_out_bytes", KSTAT_DATA_UINT64 },
+ { "decomp_requests", KSTAT_DATA_UINT64 },
+ { "decomp_total_in_bytes", KSTAT_DATA_UINT64 },
+ { "decomp_total_out_bytes", KSTAT_DATA_UINT64 },
+ { "dc_fails", KSTAT_DATA_UINT64 },
+ { "encrypt_requests", KSTAT_DATA_UINT64 },
+ { "encrypt_total_in_bytes", KSTAT_DATA_UINT64 },
+ { "encrypt_total_out_bytes", KSTAT_DATA_UINT64 },
+ { "decrypt_requests", KSTAT_DATA_UINT64 },
+ { "decrypt_total_in_bytes", KSTAT_DATA_UINT64 },
+ { "decrypt_total_out_bytes", KSTAT_DATA_UINT64 },
+ { "crypt_fails", KSTAT_DATA_UINT64 },
+ { "cksum_requests", KSTAT_DATA_UINT64 },
+ { "cksum_total_in_bytes", KSTAT_DATA_UINT64 },
+ { "cksum_fails", KSTAT_DATA_UINT64 },
+};
+
+static kstat_t *qat_ksp = NULL;
+
+CpaStatus
+qat_mem_alloc_contig(void **pp_mem_addr, Cpa32U size_bytes)
+{
+ *pp_mem_addr = kmalloc(size_bytes, GFP_KERNEL);
+ if (*pp_mem_addr == NULL)
+ return (CPA_STATUS_RESOURCE);
+ return (CPA_STATUS_SUCCESS);
+}
+
+void
+qat_mem_free_contig(void **pp_mem_addr)
+{
+ if (*pp_mem_addr != NULL) {
+ kfree(*pp_mem_addr);
+ *pp_mem_addr = NULL;
+ }
+}
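+
+/*
+ * Minimal usage sketch (illustrative): the pair behaves like
+ * kmalloc()/kfree(), except that the free side takes the address of the
+ * pointer and NULLs it out.
+ *
+ *	void *buf = NULL;
+ *	if (qat_mem_alloc_contig(&buf, 4096) != CPA_STATUS_SUCCESS)
+ *		return (CPA_STATUS_RESOURCE);
+ *	...
+ *	qat_mem_free_contig(&buf);
+ */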
+
+int
+qat_init(void)
+{
+ qat_ksp = kstat_create("zfs", 0, "qat", "misc",
+ KSTAT_TYPE_NAMED, sizeof (qat_stats) / sizeof (kstat_named_t),
+ KSTAT_FLAG_VIRTUAL);
+ if (qat_ksp != NULL) {
+ qat_ksp->ks_data = &qat_stats;
+ kstat_install(qat_ksp);
+ }
+
+ /*
+ * If QAT initialization fails, just set the disable flag; QAT can be
+ * turned on again after the zfs module is loaded, e.g.:
+ * echo 0 > /sys/module/zfs/parameters/zfs_qat_compress_disable
+ */
+ if (qat_dc_init() != 0)
+ zfs_qat_compress_disable = 1;
+
+ if (qat_cy_init() != 0) {
+ zfs_qat_checksum_disable = 1;
+ zfs_qat_encrypt_disable = 1;
+ }
+
+ return (0);
+}
+
+void
+qat_fini(void)
+{
+ if (qat_ksp != NULL) {
+ kstat_delete(qat_ksp);
+ qat_ksp = NULL;
+ }
+
+ qat_cy_fini();
+ qat_dc_fini();
+}
+
+#endif
diff --git a/module/os/linux/zfs/qat_compress.c b/module/os/linux/zfs/qat_compress.c
new file mode 100644
index 000000000..4136b6555
--- /dev/null
+++ b/module/os/linux/zfs/qat_compress.c
@@ -0,0 +1,574 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+#if defined(_KERNEL) && defined(HAVE_QAT)
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/pagemap.h>
+#include <linux/completion.h>
+#include <sys/zfs_context.h>
+#include <sys/byteorder.h>
+#include <sys/zio.h>
+#include "qat.h"
+
+/*
+ * Max instances in a QAT device; each instance is a channel for submitting
+ * jobs to the QAT hardware. This limit is used only to pre-allocate the
+ * instance and session arrays; the actual number of instances is defined
+ * in the QAT driver's configuration file.
+ */
+#define QAT_DC_MAX_INSTANCES 48
+
+/*
+ * ZLIB head and foot size
+ */
+#define ZLIB_HEAD_SZ 2
+#define ZLIB_FOOT_SZ 4
+
+static CpaInstanceHandle dc_inst_handles[QAT_DC_MAX_INSTANCES];
+static CpaDcSessionHandle session_handles[QAT_DC_MAX_INSTANCES];
+static CpaBufferList **buffer_array[QAT_DC_MAX_INSTANCES];
+static Cpa16U num_inst = 0;
+static Cpa32U inst_num = 0;
+static boolean_t qat_dc_init_done = B_FALSE;
+int zfs_qat_compress_disable = 0;
+
+boolean_t
+qat_dc_use_accel(size_t s_len)
+{
+ return (!zfs_qat_compress_disable &&
+ qat_dc_init_done &&
+ s_len >= QAT_MIN_BUF_SIZE &&
+ s_len <= QAT_MAX_BUF_SIZE);
+}
+
+static void
+qat_dc_callback(void *p_callback, CpaStatus status)
+{
+ if (p_callback != NULL)
+ complete((struct completion *)p_callback);
+}
+
+static void
+qat_dc_clean(void)
+{
+ Cpa16U buff_num = 0;
+ Cpa16U num_inter_buff_lists = 0;
+
+ for (Cpa16U i = 0; i < num_inst; i++) {
+ cpaDcStopInstance(dc_inst_handles[i]);
+ QAT_PHYS_CONTIG_FREE(session_handles[i]);
+ /* free intermediate buffers */
+ if (buffer_array[i] != NULL) {
+ cpaDcGetNumIntermediateBuffers(
+ dc_inst_handles[i], &num_inter_buff_lists);
+ for (buff_num = 0; buff_num < num_inter_buff_lists;
+ buff_num++) {
+ CpaBufferList *buffer_inter =
+ buffer_array[i][buff_num];
+ if (buffer_inter->pBuffers) {
+ QAT_PHYS_CONTIG_FREE(
+ buffer_inter->pBuffers->pData);
+ QAT_PHYS_CONTIG_FREE(
+ buffer_inter->pBuffers);
+ }
+ QAT_PHYS_CONTIG_FREE(
+ buffer_inter->pPrivateMetaData);
+ QAT_PHYS_CONTIG_FREE(buffer_inter);
+ }
+ }
+ }
+
+ num_inst = 0;
+ qat_dc_init_done = B_FALSE;
+}
+
+int
+qat_dc_init(void)
+{
+ CpaStatus status = CPA_STATUS_SUCCESS;
+ Cpa32U sess_size = 0;
+ Cpa32U ctx_size = 0;
+ Cpa16U num_inter_buff_lists = 0;
+ Cpa16U buff_num = 0;
+ Cpa32U buff_meta_size = 0;
+ CpaDcSessionSetupData sd = {0};
+
+ if (qat_dc_init_done)
+ return (0);
+
+ status = cpaDcGetNumInstances(&num_inst);
+ if (status != CPA_STATUS_SUCCESS)
+ return (-1);
+
+	/* if the user has configured no QAT compression units, just return */
+ if (num_inst == 0)
+ return (0);
+
+ if (num_inst > QAT_DC_MAX_INSTANCES)
+ num_inst = QAT_DC_MAX_INSTANCES;
+
+ status = cpaDcGetInstances(num_inst, &dc_inst_handles[0]);
+ if (status != CPA_STATUS_SUCCESS)
+ return (-1);
+
+ for (Cpa16U i = 0; i < num_inst; i++) {
+ cpaDcSetAddressTranslation(dc_inst_handles[i],
+ (void*)virt_to_phys);
+
+ status = cpaDcBufferListGetMetaSize(dc_inst_handles[i],
+ 1, &buff_meta_size);
+
+ if (status == CPA_STATUS_SUCCESS)
+ status = cpaDcGetNumIntermediateBuffers(
+ dc_inst_handles[i], &num_inter_buff_lists);
+
+ if (status == CPA_STATUS_SUCCESS && num_inter_buff_lists != 0)
+ status = QAT_PHYS_CONTIG_ALLOC(&buffer_array[i],
+ num_inter_buff_lists *
+ sizeof (CpaBufferList *));
+
+ for (buff_num = 0; buff_num < num_inter_buff_lists;
+ buff_num++) {
+ if (status == CPA_STATUS_SUCCESS)
+ status = QAT_PHYS_CONTIG_ALLOC(
+ &buffer_array[i][buff_num],
+ sizeof (CpaBufferList));
+
+ if (status == CPA_STATUS_SUCCESS)
+ status = QAT_PHYS_CONTIG_ALLOC(
+ &buffer_array[i][buff_num]->
+ pPrivateMetaData,
+ buff_meta_size);
+
+ if (status == CPA_STATUS_SUCCESS)
+ status = QAT_PHYS_CONTIG_ALLOC(
+ &buffer_array[i][buff_num]->pBuffers,
+ sizeof (CpaFlatBuffer));
+
+ if (status == CPA_STATUS_SUCCESS) {
+				/*
+				 * The implementation requires an
+				 * intermediate buffer approximately twice
+				 * the size of the output buffer, hence
+				 * 2x the max buffer size here.
+				 */
+ status = QAT_PHYS_CONTIG_ALLOC(
+ &buffer_array[i][buff_num]->pBuffers->
+ pData, 2 * QAT_MAX_BUF_SIZE);
+ if (status != CPA_STATUS_SUCCESS)
+ goto fail;
+
+ buffer_array[i][buff_num]->numBuffers = 1;
+ buffer_array[i][buff_num]->pBuffers->
+ dataLenInBytes = 2 * QAT_MAX_BUF_SIZE;
+ }
+ }
+
+ status = cpaDcStartInstance(dc_inst_handles[i],
+ num_inter_buff_lists, buffer_array[i]);
+ if (status != CPA_STATUS_SUCCESS)
+ goto fail;
+
+ sd.compLevel = CPA_DC_L1;
+ sd.compType = CPA_DC_DEFLATE;
+ sd.huffType = CPA_DC_HT_FULL_DYNAMIC;
+ sd.sessDirection = CPA_DC_DIR_COMBINED;
+ sd.sessState = CPA_DC_STATELESS;
+ sd.deflateWindowSize = 7;
+ sd.checksum = CPA_DC_ADLER32;
+ status = cpaDcGetSessionSize(dc_inst_handles[i],
+ &sd, &sess_size, &ctx_size);
+ if (status != CPA_STATUS_SUCCESS)
+ goto fail;
+
+ QAT_PHYS_CONTIG_ALLOC(&session_handles[i], sess_size);
+ if (session_handles[i] == NULL)
+ goto fail;
+
+ status = cpaDcInitSession(dc_inst_handles[i],
+ session_handles[i],
+ &sd, NULL, qat_dc_callback);
+ if (status != CPA_STATUS_SUCCESS)
+ goto fail;
+ }
+
+ qat_dc_init_done = B_TRUE;
+ return (0);
+fail:
+ qat_dc_clean();
+ return (-1);
+}
+
+void
+qat_dc_fini(void)
+{
+ if (!qat_dc_init_done)
+ return;
+
+ qat_dc_clean();
+}
+
+/*
+ * The "add" parameter is an additional buffer which is passed
+ * to QAT as a scratch buffer alongside the destination buffer
+ * in case the "compressed" data ends up being larger than the
+ * original source data. This is necessary to prevent QAT from
+ * generating buffer overflow warnings for incompressible data.
+ */
+static int
+qat_compress_impl(qat_compress_dir_t dir, char *src, int src_len,
+ char *dst, int dst_len, char *add, int add_len, size_t *c_len)
+{
+ CpaInstanceHandle dc_inst_handle;
+ CpaDcSessionHandle session_handle;
+ CpaBufferList *buf_list_src = NULL;
+ CpaBufferList *buf_list_dst = NULL;
+ CpaFlatBuffer *flat_buf_src = NULL;
+ CpaFlatBuffer *flat_buf_dst = NULL;
+ Cpa8U *buffer_meta_src = NULL;
+ Cpa8U *buffer_meta_dst = NULL;
+ Cpa32U buffer_meta_size = 0;
+ CpaDcRqResults dc_results;
+ CpaStatus status = CPA_STATUS_SUCCESS;
+ Cpa32U hdr_sz = 0;
+ Cpa32U compressed_sz;
+ Cpa32U num_src_buf = (src_len >> PAGE_SHIFT) + 2;
+ Cpa32U num_dst_buf = (dst_len >> PAGE_SHIFT) + 2;
+ Cpa32U num_add_buf = (add_len >> PAGE_SHIFT) + 2;
+ Cpa32U bytes_left;
+ Cpa32U dst_pages = 0;
+ Cpa32U adler32 = 0;
+ char *data;
+ struct page *page;
+ struct page **in_pages = NULL;
+ struct page **out_pages = NULL;
+ struct page **add_pages = NULL;
+ Cpa32U page_off = 0;
+ struct completion complete;
+ Cpa32U page_num = 0;
+ Cpa16U i;
+
+ /*
+ * We increment num_src_buf and num_dst_buf by 2 to allow
+ * us to handle non page-aligned buffer addresses and buffers
+ * whose sizes are not divisible by PAGE_SIZE.
+ */
+ Cpa32U src_buffer_list_mem_size = sizeof (CpaBufferList) +
+ (num_src_buf * sizeof (CpaFlatBuffer));
+ Cpa32U dst_buffer_list_mem_size = sizeof (CpaBufferList) +
+ ((num_dst_buf + num_add_buf) * sizeof (CpaFlatBuffer));
+
+ if (QAT_PHYS_CONTIG_ALLOC(&in_pages,
+ num_src_buf * sizeof (struct page *)) != CPA_STATUS_SUCCESS)
+ goto fail;
+
+ if (QAT_PHYS_CONTIG_ALLOC(&out_pages,
+ num_dst_buf * sizeof (struct page *)) != CPA_STATUS_SUCCESS)
+ goto fail;
+
+ if (QAT_PHYS_CONTIG_ALLOC(&add_pages,
+ num_add_buf * sizeof (struct page *)) != CPA_STATUS_SUCCESS)
+ goto fail;
+
+ i = (Cpa32U)atomic_inc_32_nv(&inst_num) % num_inst;
+ dc_inst_handle = dc_inst_handles[i];
+ session_handle = session_handles[i];
+
+ cpaDcBufferListGetMetaSize(dc_inst_handle, num_src_buf,
+ &buffer_meta_size);
+ if (QAT_PHYS_CONTIG_ALLOC(&buffer_meta_src, buffer_meta_size) !=
+ CPA_STATUS_SUCCESS)
+ goto fail;
+
+ cpaDcBufferListGetMetaSize(dc_inst_handle, num_dst_buf + num_add_buf,
+ &buffer_meta_size);
+ if (QAT_PHYS_CONTIG_ALLOC(&buffer_meta_dst, buffer_meta_size) !=
+ CPA_STATUS_SUCCESS)
+ goto fail;
+
+ /* build source buffer list */
+ if (QAT_PHYS_CONTIG_ALLOC(&buf_list_src, src_buffer_list_mem_size) !=
+ CPA_STATUS_SUCCESS)
+ goto fail;
+
+ flat_buf_src = (CpaFlatBuffer *)(buf_list_src + 1);
+
+ buf_list_src->pBuffers = flat_buf_src; /* always point to first one */
+
+ /* build destination buffer list */
+ if (QAT_PHYS_CONTIG_ALLOC(&buf_list_dst, dst_buffer_list_mem_size) !=
+ CPA_STATUS_SUCCESS)
+ goto fail;
+
+ flat_buf_dst = (CpaFlatBuffer *)(buf_list_dst + 1);
+
+ buf_list_dst->pBuffers = flat_buf_dst; /* always point to first one */
+
+ buf_list_src->numBuffers = 0;
+ buf_list_src->pPrivateMetaData = buffer_meta_src;
+ bytes_left = src_len;
+ data = src;
+ page_num = 0;
+ while (bytes_left > 0) {
+ page_off = ((long)data & ~PAGE_MASK);
+ page = qat_mem_to_page(data);
+ in_pages[page_num] = page;
+ flat_buf_src->pData = kmap(page) + page_off;
+ flat_buf_src->dataLenInBytes =
+ min((long)PAGE_SIZE - page_off, (long)bytes_left);
+
+ bytes_left -= flat_buf_src->dataLenInBytes;
+ data += flat_buf_src->dataLenInBytes;
+ flat_buf_src++;
+ buf_list_src->numBuffers++;
+ page_num++;
+ }
+
+ buf_list_dst->numBuffers = 0;
+ buf_list_dst->pPrivateMetaData = buffer_meta_dst;
+ bytes_left = dst_len;
+ data = dst;
+ page_num = 0;
+ while (bytes_left > 0) {
+ page_off = ((long)data & ~PAGE_MASK);
+ page = qat_mem_to_page(data);
+ flat_buf_dst->pData = kmap(page) + page_off;
+ out_pages[page_num] = page;
+ flat_buf_dst->dataLenInBytes =
+ min((long)PAGE_SIZE - page_off, (long)bytes_left);
+
+ bytes_left -= flat_buf_dst->dataLenInBytes;
+ data += flat_buf_dst->dataLenInBytes;
+ flat_buf_dst++;
+ buf_list_dst->numBuffers++;
+ page_num++;
+ dst_pages++;
+ }
+
+ /* map additional scratch pages into the destination buffer list */
+ bytes_left = add_len;
+ data = add;
+ page_num = 0;
+ while (bytes_left > 0) {
+ page_off = ((long)data & ~PAGE_MASK);
+ page = qat_mem_to_page(data);
+ flat_buf_dst->pData = kmap(page) + page_off;
+ add_pages[page_num] = page;
+ flat_buf_dst->dataLenInBytes =
+ min((long)PAGE_SIZE - page_off, (long)bytes_left);
+
+ bytes_left -= flat_buf_dst->dataLenInBytes;
+ data += flat_buf_dst->dataLenInBytes;
+ flat_buf_dst++;
+ buf_list_dst->numBuffers++;
+ page_num++;
+ }
+
+ init_completion(&complete);
+
+ if (dir == QAT_COMPRESS) {
+ QAT_STAT_BUMP(comp_requests);
+ QAT_STAT_INCR(comp_total_in_bytes, src_len);
+
+ cpaDcGenerateHeader(session_handle,
+ buf_list_dst->pBuffers, &hdr_sz);
+ buf_list_dst->pBuffers->pData += hdr_sz;
+ buf_list_dst->pBuffers->dataLenInBytes -= hdr_sz;
+ status = cpaDcCompressData(
+ dc_inst_handle, session_handle,
+ buf_list_src, buf_list_dst,
+ &dc_results, CPA_DC_FLUSH_FINAL,
+ &complete);
+ if (status != CPA_STATUS_SUCCESS) {
+ goto fail;
+ }
+
+		/* wait for the operation to complete */
+ if (!wait_for_completion_interruptible_timeout(&complete,
+ QAT_TIMEOUT_MS)) {
+ status = CPA_STATUS_FAIL;
+ goto fail;
+ }
+
+ if (dc_results.status != CPA_STATUS_SUCCESS) {
+ status = CPA_STATUS_FAIL;
+ goto fail;
+ }
+
+ compressed_sz = dc_results.produced;
+ if (compressed_sz + hdr_sz + ZLIB_FOOT_SZ > dst_len) {
+ status = CPA_STATUS_INCOMPRESSIBLE;
+ goto fail;
+ }
+
+ flat_buf_dst = (CpaFlatBuffer *)(buf_list_dst + 1);
+ /* move to the last page */
+ flat_buf_dst += (compressed_sz + hdr_sz) >> PAGE_SHIFT;
+
+		/* no space for the zlib footer in the last page */
+ if (((compressed_sz + hdr_sz) % PAGE_SIZE)
+ + ZLIB_FOOT_SZ > PAGE_SIZE) {
+ status = CPA_STATUS_INCOMPRESSIBLE;
+ goto fail;
+ }
+
+ /* jump to the end of the buffer and append footer */
+ flat_buf_dst->pData =
+ (char *)((unsigned long)flat_buf_dst->pData & PAGE_MASK)
+ + ((compressed_sz + hdr_sz) % PAGE_SIZE);
+ flat_buf_dst->dataLenInBytes = ZLIB_FOOT_SZ;
+
+ dc_results.produced = 0;
+ status = cpaDcGenerateFooter(session_handle,
+ flat_buf_dst, &dc_results);
+ if (status != CPA_STATUS_SUCCESS)
+ goto fail;
+
+ *c_len = compressed_sz + dc_results.produced + hdr_sz;
+ QAT_STAT_INCR(comp_total_out_bytes, *c_len);
+ } else {
+ ASSERT3U(dir, ==, QAT_DECOMPRESS);
+ QAT_STAT_BUMP(decomp_requests);
+ QAT_STAT_INCR(decomp_total_in_bytes, src_len);
+
+ buf_list_src->pBuffers->pData += ZLIB_HEAD_SZ;
+ buf_list_src->pBuffers->dataLenInBytes -= ZLIB_HEAD_SZ;
+ status = cpaDcDecompressData(dc_inst_handle, session_handle,
+ buf_list_src, buf_list_dst, &dc_results, CPA_DC_FLUSH_FINAL,
+ &complete);
+
+		if (status != CPA_STATUS_SUCCESS) {
+			status = CPA_STATUS_FAIL;
+			goto fail;
+		}
+
+		/* wait for the operation to complete */
+ if (!wait_for_completion_interruptible_timeout(&complete,
+ QAT_TIMEOUT_MS)) {
+ status = CPA_STATUS_FAIL;
+ goto fail;
+ }
+
+ if (dc_results.status != CPA_STATUS_SUCCESS) {
+ status = CPA_STATUS_FAIL;
+ goto fail;
+ }
+
+ /* verify adler checksum */
+ adler32 = *(Cpa32U *)(src + dc_results.consumed + ZLIB_HEAD_SZ);
+ if (adler32 != BSWAP_32(dc_results.checksum)) {
+ status = CPA_STATUS_FAIL;
+ goto fail;
+ }
+ *c_len = dc_results.produced;
+ QAT_STAT_INCR(decomp_total_out_bytes, *c_len);
+ }
+
+fail:
+ if (status != CPA_STATUS_SUCCESS && status != CPA_STATUS_INCOMPRESSIBLE)
+ QAT_STAT_BUMP(dc_fails);
+
+ if (in_pages) {
+ for (page_num = 0;
+ page_num < buf_list_src->numBuffers;
+ page_num++) {
+ kunmap(in_pages[page_num]);
+ }
+ QAT_PHYS_CONTIG_FREE(in_pages);
+ }
+
+ if (out_pages) {
+ for (page_num = 0; page_num < dst_pages; page_num++) {
+ kunmap(out_pages[page_num]);
+ }
+ QAT_PHYS_CONTIG_FREE(out_pages);
+ }
+
+ if (add_pages) {
+ for (page_num = 0;
+ page_num < buf_list_dst->numBuffers - dst_pages;
+ page_num++) {
+ kunmap(add_pages[page_num]);
+ }
+ QAT_PHYS_CONTIG_FREE(add_pages);
+ }
+
+ QAT_PHYS_CONTIG_FREE(buffer_meta_src);
+ QAT_PHYS_CONTIG_FREE(buffer_meta_dst);
+ QAT_PHYS_CONTIG_FREE(buf_list_src);
+ QAT_PHYS_CONTIG_FREE(buf_list_dst);
+
+ return (status);
+}
+
+/*
+ * Entry point for QAT accelerated compression / decompression.
+ */
+int
+qat_compress(qat_compress_dir_t dir, char *src, int src_len,
+ char *dst, int dst_len, size_t *c_len)
+{
+ int ret;
+ size_t add_len = 0;
+ void *add = NULL;
+
+ if (dir == QAT_COMPRESS) {
+ add_len = dst_len;
+ add = zio_data_buf_alloc(add_len);
+ }
+
+ ret = qat_compress_impl(dir, src, src_len, dst,
+ dst_len, add, add_len, c_len);
+
+ if (dir == QAT_COMPRESS)
+ zio_data_buf_free(add, add_len);
+
+ return (ret);
+}
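+
+/*
+ * Return-status handling sketch (editor's illustration, not part of
+ * this change): besides CPA_STATUS_SUCCESS, callers should expect
+ * CPA_STATUS_INCOMPRESSIBLE when the deflate output plus the zlib
+ * header/footer would not fit in the destination buffer. That case is
+ * not bumped as a hardware failure above and should simply fall back
+ * to software compression or uncompressed storage:
+ *
+ *	switch (qat_compress(QAT_COMPRESS, src, s_len, dst, d_len,
+ *	    &c_len)) {
+ *	case CPA_STATUS_SUCCESS:
+ *		return (c_len);
+ *	case CPA_STATUS_INCOMPRESSIBLE:
+ *	default:
+ *		return (software_fallback(src, dst, s_len, d_len));
+ *	}
+ *
+ * "software_fallback" is hypothetical.
+ */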
+
+static int
+param_set_qat_compress(const char *val, zfs_kernel_param_t *kp)
+{
+ int ret;
+ int *pvalue = kp->arg;
+ ret = param_set_int(val, kp);
+ if (ret)
+ return (ret);
+	/*
+	 * zfs_qat_compress_disable = 0 means QAT compression is enabled;
+	 * try to initialize the QAT instances if that has not been done.
+	 */
+ if (*pvalue == 0 && !qat_dc_init_done) {
+ ret = qat_dc_init();
+ if (ret != 0) {
+ zfs_qat_compress_disable = 1;
+ return (ret);
+ }
+ }
+ return (ret);
+}
+
+module_param_call(zfs_qat_compress_disable, param_set_qat_compress,
+ param_get_int, &zfs_qat_compress_disable, 0644);
+MODULE_PARM_DESC(zfs_qat_compress_disable, "Enable/Disable QAT compression");
+
+#endif
diff --git a/module/os/linux/zfs/qat_crypt.c b/module/os/linux/zfs/qat_crypt.c
new file mode 100644
index 000000000..02e19d21d
--- /dev/null
+++ b/module/os/linux/zfs/qat_crypt.c
@@ -0,0 +1,631 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * This file represents the QAT implementation of checksums and encryption.
+ * Internally, QAT shares the same cryptographic instances for both of these
+ * operations, so the code has been combined here. QAT data compression uses
+ * separate compression instances, so that code lives in qat_compress.c.
+ */
+
+#if defined(_KERNEL) && defined(HAVE_QAT)
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/pagemap.h>
+#include <linux/completion.h>
+#include <sys/zfs_context.h>
+#include <sys/zio_crypt.h>
+#include "lac/cpa_cy_im.h"
+#include "lac/cpa_cy_common.h"
+#include "qat.h"
+
+/*
+ * Max instances in a QAT device; each instance is a channel used to
+ * submit jobs to the QAT hardware. This constant only sizes the
+ * pre-allocated instance and session arrays; the actual number of
+ * instances is defined in the QAT driver's configuration file.
+ */
+#define QAT_CRYPT_MAX_INSTANCES 48
+
+#define MAX_PAGE_NUM 1024
+
+static Cpa32U inst_num = 0;
+static Cpa16U num_inst = 0;
+static CpaInstanceHandle cy_inst_handles[QAT_CRYPT_MAX_INSTANCES];
+static boolean_t qat_cy_init_done = B_FALSE;
+int zfs_qat_encrypt_disable = 0;
+int zfs_qat_checksum_disable = 0;
+
+typedef struct cy_callback {
+ CpaBoolean verify_result;
+ struct completion complete;
+} cy_callback_t;
+
+static void
+symcallback(void *p_callback, CpaStatus status, const CpaCySymOp operation,
+ void *op_data, CpaBufferList *buf_list_dst, CpaBoolean verify)
+{
+ cy_callback_t *cb = p_callback;
+
+ if (cb != NULL) {
+ /* indicate that the function has been called */
+ cb->verify_result = verify;
+ complete(&cb->complete);
+ }
+}
+
+boolean_t
+qat_crypt_use_accel(size_t s_len)
+{
+ return (!zfs_qat_encrypt_disable &&
+ qat_cy_init_done &&
+ s_len >= QAT_MIN_BUF_SIZE &&
+ s_len <= QAT_MAX_BUF_SIZE);
+}
+
+boolean_t
+qat_checksum_use_accel(size_t s_len)
+{
+ return (!zfs_qat_checksum_disable &&
+ qat_cy_init_done &&
+ s_len >= QAT_MIN_BUF_SIZE &&
+ s_len <= QAT_MAX_BUF_SIZE);
+}
+
+void
+qat_cy_clean(void)
+{
+ for (Cpa16U i = 0; i < num_inst; i++)
+ cpaCyStopInstance(cy_inst_handles[i]);
+
+ num_inst = 0;
+ qat_cy_init_done = B_FALSE;
+}
+
+int
+qat_cy_init(void)
+{
+ CpaStatus status = CPA_STATUS_FAIL;
+
+ if (qat_cy_init_done)
+ return (0);
+
+ status = cpaCyGetNumInstances(&num_inst);
+ if (status != CPA_STATUS_SUCCESS)
+ return (-1);
+
+ /* if the user has configured no QAT encryption units just return */
+ if (num_inst == 0)
+ return (0);
+
+ if (num_inst > QAT_CRYPT_MAX_INSTANCES)
+ num_inst = QAT_CRYPT_MAX_INSTANCES;
+
+ status = cpaCyGetInstances(num_inst, &cy_inst_handles[0]);
+ if (status != CPA_STATUS_SUCCESS)
+ return (-1);
+
+ for (Cpa16U i = 0; i < num_inst; i++) {
+ status = cpaCySetAddressTranslation(cy_inst_handles[i],
+ (void *)virt_to_phys);
+ if (status != CPA_STATUS_SUCCESS)
+ goto error;
+
+ status = cpaCyStartInstance(cy_inst_handles[i]);
+ if (status != CPA_STATUS_SUCCESS)
+ goto error;
+ }
+
+ qat_cy_init_done = B_TRUE;
+ return (0);
+
+error:
+ qat_cy_clean();
+ return (-1);
+}
+
+void
+qat_cy_fini(void)
+{
+ if (!qat_cy_init_done)
+ return;
+
+ qat_cy_clean();
+}
+
+static CpaStatus
+qat_init_crypt_session_ctx(qat_encrypt_dir_t dir, CpaInstanceHandle inst_handle,
+ CpaCySymSessionCtx **cy_session_ctx, crypto_key_t *key,
+ Cpa64U crypt, Cpa32U aad_len)
+{
+ CpaStatus status = CPA_STATUS_SUCCESS;
+ Cpa32U ctx_size;
+	Cpa32U cipher_algorithm;
+ Cpa32U hash_algorithm;
+ CpaCySymSessionSetupData sd = { 0 };
+
+ if (zio_crypt_table[crypt].ci_crypt_type == ZC_TYPE_CCM) {
+ return (CPA_STATUS_FAIL);
+ } else {
+		cipher_algorithm = CPA_CY_SYM_CIPHER_AES_GCM;
+ hash_algorithm = CPA_CY_SYM_HASH_AES_GCM;
+ }
+
+	sd.cipherSetupData.cipherAlgorithm = cipher_algorithm;
+ sd.cipherSetupData.pCipherKey = key->ck_data;
+ sd.cipherSetupData.cipherKeyLenInBytes = key->ck_length / 8;
+ sd.hashSetupData.hashAlgorithm = hash_algorithm;
+ sd.hashSetupData.hashMode = CPA_CY_SYM_HASH_MODE_AUTH;
+ sd.hashSetupData.digestResultLenInBytes = ZIO_DATA_MAC_LEN;
+ sd.hashSetupData.authModeSetupData.aadLenInBytes = aad_len;
+ sd.sessionPriority = CPA_CY_PRIORITY_NORMAL;
+ sd.symOperation = CPA_CY_SYM_OP_ALGORITHM_CHAINING;
+ sd.digestIsAppended = CPA_FALSE;
+ sd.verifyDigest = CPA_FALSE;
+
+ if (dir == QAT_ENCRYPT) {
+ sd.cipherSetupData.cipherDirection =
+ CPA_CY_SYM_CIPHER_DIRECTION_ENCRYPT;
+ sd.algChainOrder =
+ CPA_CY_SYM_ALG_CHAIN_ORDER_HASH_THEN_CIPHER;
+ } else {
+ ASSERT3U(dir, ==, QAT_DECRYPT);
+ sd.cipherSetupData.cipherDirection =
+ CPA_CY_SYM_CIPHER_DIRECTION_DECRYPT;
+ sd.algChainOrder =
+ CPA_CY_SYM_ALG_CHAIN_ORDER_CIPHER_THEN_HASH;
+ }
+
+ status = cpaCySymSessionCtxGetSize(inst_handle, &sd, &ctx_size);
+ if (status != CPA_STATUS_SUCCESS)
+ return (status);
+
+ status = QAT_PHYS_CONTIG_ALLOC(cy_session_ctx, ctx_size);
+ if (status != CPA_STATUS_SUCCESS)
+ return (status);
+
+ status = cpaCySymInitSession(inst_handle, symcallback, &sd,
+ *cy_session_ctx);
+ if (status != CPA_STATUS_SUCCESS) {
+ QAT_PHYS_CONTIG_FREE(*cy_session_ctx);
+ return (status);
+ }
+
+ return (CPA_STATUS_SUCCESS);
+}
+
+static CpaStatus
+qat_init_checksum_session_ctx(CpaInstanceHandle inst_handle,
+ CpaCySymSessionCtx **cy_session_ctx, Cpa64U cksum)
+{
+ CpaStatus status = CPA_STATUS_SUCCESS;
+ Cpa32U ctx_size;
+ Cpa32U hash_algorithm;
+ CpaCySymSessionSetupData sd = { 0 };
+
+ /*
+ * ZFS's SHA512 checksum is actually SHA512/256, which uses
+ * a different IV from standard SHA512. QAT does not support
+ * SHA512/256, so we can only support SHA256.
+ */
+ if (cksum == ZIO_CHECKSUM_SHA256)
+ hash_algorithm = CPA_CY_SYM_HASH_SHA256;
+ else
+ return (CPA_STATUS_FAIL);
+
+ sd.sessionPriority = CPA_CY_PRIORITY_NORMAL;
+ sd.symOperation = CPA_CY_SYM_OP_HASH;
+ sd.hashSetupData.hashAlgorithm = hash_algorithm;
+ sd.hashSetupData.hashMode = CPA_CY_SYM_HASH_MODE_PLAIN;
+ sd.hashSetupData.digestResultLenInBytes = sizeof (zio_cksum_t);
+ sd.digestIsAppended = CPA_FALSE;
+ sd.verifyDigest = CPA_FALSE;
+
+ status = cpaCySymSessionCtxGetSize(inst_handle, &sd, &ctx_size);
+ if (status != CPA_STATUS_SUCCESS)
+ return (status);
+
+ status = QAT_PHYS_CONTIG_ALLOC(cy_session_ctx, ctx_size);
+ if (status != CPA_STATUS_SUCCESS)
+ return (status);
+
+ status = cpaCySymInitSession(inst_handle, symcallback, &sd,
+ *cy_session_ctx);
+ if (status != CPA_STATUS_SUCCESS) {
+ QAT_PHYS_CONTIG_FREE(*cy_session_ctx);
+ return (status);
+ }
+
+ return (CPA_STATUS_SUCCESS);
+}
+
+static CpaStatus
+qat_init_cy_buffer_lists(CpaInstanceHandle inst_handle, uint32_t nr_bufs,
+ CpaBufferList *src, CpaBufferList *dst)
+{
+ CpaStatus status = CPA_STATUS_SUCCESS;
+ Cpa32U meta_size = 0;
+
+ status = cpaCyBufferListGetMetaSize(inst_handle, nr_bufs, &meta_size);
+ if (status != CPA_STATUS_SUCCESS)
+ return (status);
+
+ status = QAT_PHYS_CONTIG_ALLOC(&src->pPrivateMetaData, meta_size);
+ if (status != CPA_STATUS_SUCCESS)
+ goto error;
+
+ if (src != dst) {
+ status = QAT_PHYS_CONTIG_ALLOC(&dst->pPrivateMetaData,
+ meta_size);
+ if (status != CPA_STATUS_SUCCESS)
+ goto error;
+ }
+
+ return (CPA_STATUS_SUCCESS);
+
+error:
+ QAT_PHYS_CONTIG_FREE(src->pPrivateMetaData);
+ if (src != dst)
+ QAT_PHYS_CONTIG_FREE(dst->pPrivateMetaData);
+
+ return (status);
+}
+
+int
+qat_crypt(qat_encrypt_dir_t dir, uint8_t *src_buf, uint8_t *dst_buf,
+ uint8_t *aad_buf, uint32_t aad_len, uint8_t *iv_buf, uint8_t *digest_buf,
+ crypto_key_t *key, uint64_t crypt, uint32_t enc_len)
+{
+ CpaStatus status = CPA_STATUS_SUCCESS;
+ Cpa16U i;
+ CpaInstanceHandle cy_inst_handle;
+ Cpa16U nr_bufs = (enc_len >> PAGE_SHIFT) + 2;
+ Cpa32U bytes_left = 0;
+ Cpa8S *data = NULL;
+ CpaCySymSessionCtx *cy_session_ctx = NULL;
+ cy_callback_t cb;
+ CpaCySymOpData op_data = { 0 };
+ CpaBufferList src_buffer_list = { 0 };
+ CpaBufferList dst_buffer_list = { 0 };
+ CpaFlatBuffer *flat_src_buf_array = NULL;
+ CpaFlatBuffer *flat_src_buf = NULL;
+ CpaFlatBuffer *flat_dst_buf_array = NULL;
+ CpaFlatBuffer *flat_dst_buf = NULL;
+ struct page *in_pages[MAX_PAGE_NUM];
+ struct page *out_pages[MAX_PAGE_NUM];
+ Cpa32U in_page_num = 0;
+ Cpa32U out_page_num = 0;
+ Cpa32U in_page_off = 0;
+ Cpa32U out_page_off = 0;
+
+ if (dir == QAT_ENCRYPT) {
+ QAT_STAT_BUMP(encrypt_requests);
+ QAT_STAT_INCR(encrypt_total_in_bytes, enc_len);
+ } else {
+ QAT_STAT_BUMP(decrypt_requests);
+ QAT_STAT_INCR(decrypt_total_in_bytes, enc_len);
+ }
+
+ i = (Cpa32U)atomic_inc_32_nv(&inst_num) % num_inst;
+ cy_inst_handle = cy_inst_handles[i];
+
+ status = qat_init_crypt_session_ctx(dir, cy_inst_handle,
+ &cy_session_ctx, key, crypt, aad_len);
+ if (status != CPA_STATUS_SUCCESS) {
+ /* don't count CCM as a failure since it's not supported */
+ if (zio_crypt_table[crypt].ci_crypt_type == ZC_TYPE_GCM)
+ QAT_STAT_BUMP(crypt_fails);
+ return (status);
+ }
+
+ /*
+ * We increment nr_bufs by 2 to allow us to handle non
+ * page-aligned buffer addresses and buffers whose sizes
+ * are not divisible by PAGE_SIZE.
+ */
+ status = qat_init_cy_buffer_lists(cy_inst_handle, nr_bufs,
+ &src_buffer_list, &dst_buffer_list);
+ if (status != CPA_STATUS_SUCCESS)
+ goto fail;
+
+ status = QAT_PHYS_CONTIG_ALLOC(&flat_src_buf_array,
+ nr_bufs * sizeof (CpaFlatBuffer));
+ if (status != CPA_STATUS_SUCCESS)
+ goto fail;
+ status = QAT_PHYS_CONTIG_ALLOC(&flat_dst_buf_array,
+ nr_bufs * sizeof (CpaFlatBuffer));
+ if (status != CPA_STATUS_SUCCESS)
+ goto fail;
+ status = QAT_PHYS_CONTIG_ALLOC(&op_data.pDigestResult,
+ ZIO_DATA_MAC_LEN);
+ if (status != CPA_STATUS_SUCCESS)
+ goto fail;
+ status = QAT_PHYS_CONTIG_ALLOC(&op_data.pIv,
+ ZIO_DATA_IV_LEN);
+ if (status != CPA_STATUS_SUCCESS)
+ goto fail;
+ if (aad_len > 0) {
+ status = QAT_PHYS_CONTIG_ALLOC(&op_data.pAdditionalAuthData,
+ aad_len);
+ if (status != CPA_STATUS_SUCCESS)
+ goto fail;
+ bcopy(aad_buf, op_data.pAdditionalAuthData, aad_len);
+ }
+
+ bytes_left = enc_len;
+ data = src_buf;
+ flat_src_buf = flat_src_buf_array;
+ while (bytes_left > 0) {
+ in_page_off = ((long)data & ~PAGE_MASK);
+ in_pages[in_page_num] = qat_mem_to_page(data);
+ flat_src_buf->pData = kmap(in_pages[in_page_num]) + in_page_off;
+ flat_src_buf->dataLenInBytes =
+ min((long)PAGE_SIZE - in_page_off, (long)bytes_left);
+ data += flat_src_buf->dataLenInBytes;
+ bytes_left -= flat_src_buf->dataLenInBytes;
+ flat_src_buf++;
+ in_page_num++;
+ }
+ src_buffer_list.pBuffers = flat_src_buf_array;
+ src_buffer_list.numBuffers = in_page_num;
+
+ bytes_left = enc_len;
+ data = dst_buf;
+ flat_dst_buf = flat_dst_buf_array;
+ while (bytes_left > 0) {
+ out_page_off = ((long)data & ~PAGE_MASK);
+ out_pages[out_page_num] = qat_mem_to_page(data);
+ flat_dst_buf->pData = kmap(out_pages[out_page_num]) +
+ out_page_off;
+ flat_dst_buf->dataLenInBytes =
+ min((long)PAGE_SIZE - out_page_off, (long)bytes_left);
+ data += flat_dst_buf->dataLenInBytes;
+ bytes_left -= flat_dst_buf->dataLenInBytes;
+ flat_dst_buf++;
+ out_page_num++;
+ }
+ dst_buffer_list.pBuffers = flat_dst_buf_array;
+ dst_buffer_list.numBuffers = out_page_num;
+
+ op_data.sessionCtx = cy_session_ctx;
+ op_data.packetType = CPA_CY_SYM_PACKET_TYPE_FULL;
+ op_data.cryptoStartSrcOffsetInBytes = 0;
+ op_data.messageLenToCipherInBytes = 0;
+ op_data.hashStartSrcOffsetInBytes = 0;
+ op_data.messageLenToHashInBytes = 0;
+ op_data.messageLenToCipherInBytes = enc_len;
+ op_data.ivLenInBytes = ZIO_DATA_IV_LEN;
+ bcopy(iv_buf, op_data.pIv, ZIO_DATA_IV_LEN);
+
+ cb.verify_result = CPA_FALSE;
+ init_completion(&cb.complete);
+ status = cpaCySymPerformOp(cy_inst_handle, &cb, &op_data,
+ &src_buffer_list, &dst_buffer_list, NULL);
+ if (status != CPA_STATUS_SUCCESS)
+ goto fail;
+
+ if (!wait_for_completion_interruptible_timeout(&cb.complete,
+ QAT_TIMEOUT_MS)) {
+ status = CPA_STATUS_FAIL;
+ goto fail;
+ }
+
+ if (cb.verify_result == CPA_FALSE) {
+ status = CPA_STATUS_FAIL;
+ goto fail;
+ }
+
+ /* save digest result to digest_buf */
+ bcopy(op_data.pDigestResult, digest_buf, ZIO_DATA_MAC_LEN);
+ if (dir == QAT_ENCRYPT)
+ QAT_STAT_INCR(encrypt_total_out_bytes, enc_len);
+ else
+ QAT_STAT_INCR(decrypt_total_out_bytes, enc_len);
+
+fail:
+ if (status != CPA_STATUS_SUCCESS)
+ QAT_STAT_BUMP(crypt_fails);
+
+ for (i = 0; i < in_page_num; i++)
+ kunmap(in_pages[i]);
+ for (i = 0; i < out_page_num; i++)
+ kunmap(out_pages[i]);
+
+ cpaCySymRemoveSession(cy_inst_handle, cy_session_ctx);
+ if (aad_len > 0)
+ QAT_PHYS_CONTIG_FREE(op_data.pAdditionalAuthData);
+ QAT_PHYS_CONTIG_FREE(op_data.pIv);
+ QAT_PHYS_CONTIG_FREE(op_data.pDigestResult);
+ QAT_PHYS_CONTIG_FREE(src_buffer_list.pPrivateMetaData);
+ QAT_PHYS_CONTIG_FREE(dst_buffer_list.pPrivateMetaData);
+ QAT_PHYS_CONTIG_FREE(cy_session_ctx);
+ QAT_PHYS_CONTIG_FREE(flat_src_buf_array);
+ QAT_PHYS_CONTIG_FREE(flat_dst_buf_array);
+
+ return (status);
+}
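+
+/*
+ * Illustrative caller sketch (editor's note, not part of this change):
+ * an encryption path would gate on qat_crypt_use_accel() and fall back
+ * to the software crypto provider on any failure, e.g.:
+ *
+ *	if (qat_crypt_use_accel(enc_len) &&
+ *	    qat_crypt(QAT_ENCRYPT, src, dst, aad, aad_len, iv, mac,
+ *	    key, crypt, enc_len) == CPA_STATUS_SUCCESS)
+ *		return (0);
+ *	return (software_encrypt(...));
+ *
+ * "software_encrypt" is a hypothetical stand-in for the software path.
+ */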
+
+int
+qat_checksum(uint64_t cksum, uint8_t *buf, uint64_t size, zio_cksum_t *zcp)
+{
+ CpaStatus status;
+ Cpa16U i;
+ CpaInstanceHandle cy_inst_handle;
+ Cpa16U nr_bufs = (size >> PAGE_SHIFT) + 2;
+ Cpa32U bytes_left = 0;
+ Cpa8S *data = NULL;
+ CpaCySymSessionCtx *cy_session_ctx = NULL;
+ cy_callback_t cb;
+ Cpa8U *digest_buffer = NULL;
+ CpaCySymOpData op_data = { 0 };
+ CpaBufferList src_buffer_list = { 0 };
+ CpaFlatBuffer *flat_src_buf_array = NULL;
+ CpaFlatBuffer *flat_src_buf = NULL;
+ struct page *in_pages[MAX_PAGE_NUM];
+ Cpa32U page_num = 0;
+ Cpa32U page_off = 0;
+
+ QAT_STAT_BUMP(cksum_requests);
+ QAT_STAT_INCR(cksum_total_in_bytes, size);
+
+ i = (Cpa32U)atomic_inc_32_nv(&inst_num) % num_inst;
+ cy_inst_handle = cy_inst_handles[i];
+
+ status = qat_init_checksum_session_ctx(cy_inst_handle,
+ &cy_session_ctx, cksum);
+ if (status != CPA_STATUS_SUCCESS) {
+ /* don't count unsupported checksums as a failure */
+ if (cksum == ZIO_CHECKSUM_SHA256 ||
+ cksum == ZIO_CHECKSUM_SHA512)
+ QAT_STAT_BUMP(cksum_fails);
+ return (status);
+ }
+
+ /*
+ * We increment nr_bufs by 2 to allow us to handle non
+ * page-aligned buffer addresses and buffers whose sizes
+ * are not divisible by PAGE_SIZE.
+ */
+ status = qat_init_cy_buffer_lists(cy_inst_handle, nr_bufs,
+ &src_buffer_list, &src_buffer_list);
+ if (status != CPA_STATUS_SUCCESS)
+ goto fail;
+
+ status = QAT_PHYS_CONTIG_ALLOC(&flat_src_buf_array,
+ nr_bufs * sizeof (CpaFlatBuffer));
+ if (status != CPA_STATUS_SUCCESS)
+ goto fail;
+ status = QAT_PHYS_CONTIG_ALLOC(&digest_buffer,
+ sizeof (zio_cksum_t));
+ if (status != CPA_STATUS_SUCCESS)
+ goto fail;
+
+ bytes_left = size;
+ data = buf;
+ flat_src_buf = flat_src_buf_array;
+ while (bytes_left > 0) {
+ page_off = ((long)data & ~PAGE_MASK);
+ in_pages[page_num] = qat_mem_to_page(data);
+ flat_src_buf->pData = kmap(in_pages[page_num]) + page_off;
+ flat_src_buf->dataLenInBytes =
+ min((long)PAGE_SIZE - page_off, (long)bytes_left);
+ data += flat_src_buf->dataLenInBytes;
+ bytes_left -= flat_src_buf->dataLenInBytes;
+ flat_src_buf++;
+ page_num++;
+ }
+ src_buffer_list.pBuffers = flat_src_buf_array;
+ src_buffer_list.numBuffers = page_num;
+
+ op_data.sessionCtx = cy_session_ctx;
+ op_data.packetType = CPA_CY_SYM_PACKET_TYPE_FULL;
+ op_data.hashStartSrcOffsetInBytes = 0;
+ op_data.messageLenToHashInBytes = size;
+ op_data.pDigestResult = digest_buffer;
+
+ cb.verify_result = CPA_FALSE;
+ init_completion(&cb.complete);
+ status = cpaCySymPerformOp(cy_inst_handle, &cb, &op_data,
+ &src_buffer_list, &src_buffer_list, NULL);
+ if (status != CPA_STATUS_SUCCESS)
+ goto fail;
+
+ if (!wait_for_completion_interruptible_timeout(&cb.complete,
+ QAT_TIMEOUT_MS)) {
+ status = CPA_STATUS_FAIL;
+ goto fail;
+ }
+ if (cb.verify_result == CPA_FALSE) {
+ status = CPA_STATUS_FAIL;
+ goto fail;
+ }
+
+ bcopy(digest_buffer, zcp, sizeof (zio_cksum_t));
+
+fail:
+ if (status != CPA_STATUS_SUCCESS)
+ QAT_STAT_BUMP(cksum_fails);
+
+ for (i = 0; i < page_num; i++)
+ kunmap(in_pages[i]);
+
+ cpaCySymRemoveSession(cy_inst_handle, cy_session_ctx);
+ QAT_PHYS_CONTIG_FREE(digest_buffer);
+ QAT_PHYS_CONTIG_FREE(src_buffer_list.pPrivateMetaData);
+ QAT_PHYS_CONTIG_FREE(cy_session_ctx);
+ QAT_PHYS_CONTIG_FREE(flat_src_buf_array);
+
+ return (status);
+}
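+
+/*
+ * Illustrative caller sketch (editor's note, not part of this change):
+ * a SHA-256 checksum routine might try the accelerator first and fall
+ * back to the software implementation on failure:
+ *
+ *	if (qat_checksum_use_accel(size) &&
+ *	    qat_checksum(ZIO_CHECKSUM_SHA256, buf, size, zcp) ==
+ *	    CPA_STATUS_SUCCESS)
+ *		return;
+ *	software_sha256(buf, size, zcp);	(hypothetical fallback)
+ */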
+
+static int
+param_set_qat_encrypt(const char *val, zfs_kernel_param_t *kp)
+{
+ int ret;
+ int *pvalue = kp->arg;
+ ret = param_set_int(val, kp);
+ if (ret)
+ return (ret);
+	/*
+	 * zfs_qat_encrypt_disable = 0 means QAT encryption is enabled;
+	 * try to initialize the QAT instances if that has not been done.
+	 */
+ if (*pvalue == 0 && !qat_cy_init_done) {
+ ret = qat_cy_init();
+ if (ret != 0) {
+ zfs_qat_encrypt_disable = 1;
+ return (ret);
+ }
+ }
+ return (ret);
+}
+
+static int
+param_set_qat_checksum(const char *val, zfs_kernel_param_t *kp)
+{
+ int ret;
+ int *pvalue = kp->arg;
+ ret = param_set_int(val, kp);
+ if (ret)
+ return (ret);
+	/*
+	 * zfs_qat_checksum_disable = 0 means QAT checksumming is enabled;
+	 * try to initialize the QAT instances if that has not been done.
+	 */
+ if (*pvalue == 0 && !qat_cy_init_done) {
+ ret = qat_cy_init();
+ if (ret != 0) {
+ zfs_qat_checksum_disable = 1;
+ return (ret);
+ }
+ }
+ return (ret);
+}
+
+module_param_call(zfs_qat_encrypt_disable, param_set_qat_encrypt,
+ param_get_int, &zfs_qat_encrypt_disable, 0644);
+MODULE_PARM_DESC(zfs_qat_encrypt_disable, "Enable/Disable QAT encryption");
+
+module_param_call(zfs_qat_checksum_disable, param_set_qat_checksum,
+ param_get_int, &zfs_qat_checksum_disable, 0644);
+MODULE_PARM_DESC(zfs_qat_checksum_disable, "Enable/Disable QAT checksumming");
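+
+/*
+ * As with compression, both features can be re-enabled at runtime even
+ * if initialization failed at module load, e.g.:
+ * echo 0 > /sys/module/zfs/parameters/zfs_qat_encrypt_disable
+ * echo 0 > /sys/module/zfs/parameters/zfs_qat_checksum_disable
+ * The param_set_* handlers above then lazily call qat_cy_init().
+ */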
+
+#endif
diff --git a/module/os/linux/zfs/spa_stats.c b/module/os/linux/zfs/spa_stats.c
new file mode 100644
index 000000000..6895428f4
--- /dev/null
+++ b/module/os/linux/zfs/spa_stats.c
@@ -0,0 +1,1034 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa_impl.h>
+#include <sys/vdev_impl.h>
+#include <sys/spa.h>
+#include <zfs_comutil.h>
+
+/*
+ * Keeps stats on the last N reads per spa_t; disabled by default.
+ */
+int zfs_read_history = 0;
+
+/*
+ * Include cache hits in history, disabled by default.
+ */
+int zfs_read_history_hits = 0;
+
+/*
+ * Keeps stats on the last 100 txgs by default.
+ */
+int zfs_txg_history = 100;
+
+/*
+ * Keeps stats on the last N MMP updates, disabled by default.
+ */
+int zfs_multihost_history = 0;
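+
+/*
+ * Illustrative usage (editor's note, not part of this change): each
+ * history is enabled through its module parameter and read back
+ * through procfs, e.g.:
+ *
+ *	echo 100 > /sys/module/zfs/parameters/zfs_read_history
+ *	cat /proc/spl/kstat/zfs/<pool>/reads
+ *
+ * Writing to a history file clears it via the *_clear() callbacks
+ * registered below.
+ */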
+
+/*
+ * ==========================================================================
+ * SPA Read History Routines
+ * ==========================================================================
+ */
+
+/*
+ * Read statistics - Information exported regarding each arc_read call
+ */
+typedef struct spa_read_history {
+ hrtime_t start; /* time read completed */
+ uint64_t objset; /* read from this objset */
+ uint64_t object; /* read of this object number */
+ uint64_t level; /* block's indirection level */
+ uint64_t blkid; /* read of this block id */
+ char origin[24]; /* read originated from here */
+ uint32_t aflags; /* ARC flags (cached, prefetch, etc.) */
+ pid_t pid; /* PID of task doing read */
+ char comm[16]; /* process name of task doing read */
+ procfs_list_node_t srh_node;
+} spa_read_history_t;
+
+static int
+spa_read_history_show_header(struct seq_file *f)
+{
+ seq_printf(f, "%-8s %-16s %-8s %-8s %-8s %-8s %-8s "
+ "%-24s %-8s %-16s\n", "UID", "start", "objset", "object",
+ "level", "blkid", "aflags", "origin", "pid", "process");
+
+ return (0);
+}
+
+static int
+spa_read_history_show(struct seq_file *f, void *data)
+{
+ spa_read_history_t *srh = (spa_read_history_t *)data;
+
+ seq_printf(f, "%-8llu %-16llu 0x%-6llx "
+ "%-8lli %-8lli %-8lli 0x%-6x %-24s %-8i %-16s\n",
+ (u_longlong_t)srh->srh_node.pln_id, srh->start,
+ (longlong_t)srh->objset, (longlong_t)srh->object,
+ (longlong_t)srh->level, (longlong_t)srh->blkid,
+ srh->aflags, srh->origin, srh->pid, srh->comm);
+
+ return (0);
+}
+
+/* Remove oldest elements from list until there are no more than 'size' left */
+static void
+spa_read_history_truncate(spa_history_list_t *shl, unsigned int size)
+{
+ spa_read_history_t *srh;
+ while (shl->size > size) {
+ srh = list_remove_head(&shl->procfs_list.pl_list);
+ ASSERT3P(srh, !=, NULL);
+ kmem_free(srh, sizeof (spa_read_history_t));
+ shl->size--;
+ }
+
+ if (size == 0)
+ ASSERT(list_is_empty(&shl->procfs_list.pl_list));
+}
+
+static int
+spa_read_history_clear(procfs_list_t *procfs_list)
+{
+ spa_history_list_t *shl = procfs_list->pl_private;
+ mutex_enter(&procfs_list->pl_lock);
+ spa_read_history_truncate(shl, 0);
+ mutex_exit(&procfs_list->pl_lock);
+ return (0);
+}
+
+static void
+spa_read_history_init(spa_t *spa)
+{
+ spa_history_list_t *shl = &spa->spa_stats.read_history;
+ char *module;
+
+ shl->size = 0;
+
+ module = kmem_asprintf("zfs/%s", spa_name(spa));
+
+ shl->procfs_list.pl_private = shl;
+ procfs_list_install(module,
+ "reads",
+ 0600,
+ &shl->procfs_list,
+ spa_read_history_show,
+ spa_read_history_show_header,
+ spa_read_history_clear,
+ offsetof(spa_read_history_t, srh_node));
+
+ strfree(module);
+}
+
+static void
+spa_read_history_destroy(spa_t *spa)
+{
+ spa_history_list_t *shl = &spa->spa_stats.read_history;
+ procfs_list_uninstall(&shl->procfs_list);
+ spa_read_history_truncate(shl, 0);
+ procfs_list_destroy(&shl->procfs_list);
+}
+
+void
+spa_read_history_add(spa_t *spa, const zbookmark_phys_t *zb, uint32_t aflags)
+{
+ spa_history_list_t *shl = &spa->spa_stats.read_history;
+ spa_read_history_t *srh;
+
+ ASSERT3P(spa, !=, NULL);
+ ASSERT3P(zb, !=, NULL);
+
+ if (zfs_read_history == 0 && shl->size == 0)
+ return;
+
+ if (zfs_read_history_hits == 0 && (aflags & ARC_FLAG_CACHED))
+ return;
+
+ srh = kmem_zalloc(sizeof (spa_read_history_t), KM_SLEEP);
+ strlcpy(srh->comm, getcomm(), sizeof (srh->comm));
+ srh->start = gethrtime();
+ srh->objset = zb->zb_objset;
+ srh->object = zb->zb_object;
+ srh->level = zb->zb_level;
+ srh->blkid = zb->zb_blkid;
+ srh->aflags = aflags;
+ srh->pid = getpid();
+
+ mutex_enter(&shl->procfs_list.pl_lock);
+
+ procfs_list_add(&shl->procfs_list, srh);
+ shl->size++;
+
+ spa_read_history_truncate(shl, zfs_read_history);
+
+ mutex_exit(&shl->procfs_list.pl_lock);
+}
+
+/*
+ * ==========================================================================
+ * SPA TXG History Routines
+ * ==========================================================================
+ */
+
+/*
+ * Txg statistics - Information exported regarding each txg sync
+ */
+
+typedef struct spa_txg_history {
+ uint64_t txg; /* txg id */
+ txg_state_t state; /* active txg state */
+ uint64_t nread; /* number of bytes read */
+ uint64_t nwritten; /* number of bytes written */
+ uint64_t reads; /* number of read operations */
+ uint64_t writes; /* number of write operations */
+ uint64_t ndirty; /* number of dirty bytes */
+ hrtime_t times[TXG_STATE_COMMITTED]; /* completion times */
+ procfs_list_node_t sth_node;
+} spa_txg_history_t;
+
+static int
+spa_txg_history_show_header(struct seq_file *f)
+{
+ seq_printf(f, "%-8s %-16s %-5s %-12s %-12s %-12s "
+ "%-8s %-8s %-12s %-12s %-12s %-12s\n", "txg", "birth", "state",
+ "ndirty", "nread", "nwritten", "reads", "writes",
+ "otime", "qtime", "wtime", "stime");
+ return (0);
+}
+
+static int
+spa_txg_history_show(struct seq_file *f, void *data)
+{
+ spa_txg_history_t *sth = (spa_txg_history_t *)data;
+ uint64_t open = 0, quiesce = 0, wait = 0, sync = 0;
+ char state;
+
+ switch (sth->state) {
+ case TXG_STATE_BIRTH: state = 'B'; break;
+ case TXG_STATE_OPEN: state = 'O'; break;
+ case TXG_STATE_QUIESCED: state = 'Q'; break;
+ case TXG_STATE_WAIT_FOR_SYNC: state = 'W'; break;
+ case TXG_STATE_SYNCED: state = 'S'; break;
+ case TXG_STATE_COMMITTED: state = 'C'; break;
+ default: state = '?'; break;
+ }
+
+ if (sth->times[TXG_STATE_OPEN])
+ open = sth->times[TXG_STATE_OPEN] -
+ sth->times[TXG_STATE_BIRTH];
+
+ if (sth->times[TXG_STATE_QUIESCED])
+ quiesce = sth->times[TXG_STATE_QUIESCED] -
+ sth->times[TXG_STATE_OPEN];
+
+ if (sth->times[TXG_STATE_WAIT_FOR_SYNC])
+ wait = sth->times[TXG_STATE_WAIT_FOR_SYNC] -
+ sth->times[TXG_STATE_QUIESCED];
+
+ if (sth->times[TXG_STATE_SYNCED])
+ sync = sth->times[TXG_STATE_SYNCED] -
+ sth->times[TXG_STATE_WAIT_FOR_SYNC];
+
+ seq_printf(f, "%-8llu %-16llu %-5c %-12llu "
+ "%-12llu %-12llu %-8llu %-8llu %-12llu %-12llu %-12llu %-12llu\n",
+ (longlong_t)sth->txg, sth->times[TXG_STATE_BIRTH], state,
+ (u_longlong_t)sth->ndirty,
+ (u_longlong_t)sth->nread, (u_longlong_t)sth->nwritten,
+ (u_longlong_t)sth->reads, (u_longlong_t)sth->writes,
+ (u_longlong_t)open, (u_longlong_t)quiesce, (u_longlong_t)wait,
+ (u_longlong_t)sync);
+
+ return (0);
+}
+
+/* Remove oldest elements from list until there are no more than 'size' left */
+static void
+spa_txg_history_truncate(spa_history_list_t *shl, unsigned int size)
+{
+ spa_txg_history_t *sth;
+ while (shl->size > size) {
+ sth = list_remove_head(&shl->procfs_list.pl_list);
+ ASSERT3P(sth, !=, NULL);
+ kmem_free(sth, sizeof (spa_txg_history_t));
+ shl->size--;
+ }
+
+	if (size == 0)
+		ASSERT(list_is_empty(&shl->procfs_list.pl_list));
+}
+
+static int
+spa_txg_history_clear(procfs_list_t *procfs_list)
+{
+ spa_history_list_t *shl = procfs_list->pl_private;
+ mutex_enter(&procfs_list->pl_lock);
+ spa_txg_history_truncate(shl, 0);
+ mutex_exit(&procfs_list->pl_lock);
+ return (0);
+}
+
+static void
+spa_txg_history_init(spa_t *spa)
+{
+ spa_history_list_t *shl = &spa->spa_stats.txg_history;
+ char *module;
+
+ shl->size = 0;
+
+ module = kmem_asprintf("zfs/%s", spa_name(spa));
+
+ shl->procfs_list.pl_private = shl;
+ procfs_list_install(module,
+ "txgs",
+ 0644,
+ &shl->procfs_list,
+ spa_txg_history_show,
+ spa_txg_history_show_header,
+ spa_txg_history_clear,
+ offsetof(spa_txg_history_t, sth_node));
+
+ strfree(module);
+}
+
+static void
+spa_txg_history_destroy(spa_t *spa)
+{
+ spa_history_list_t *shl = &spa->spa_stats.txg_history;
+ procfs_list_uninstall(&shl->procfs_list);
+ spa_txg_history_truncate(shl, 0);
+ procfs_list_destroy(&shl->procfs_list);
+}
+
+/*
+ * Add a new txg to historical record.
+ */
+void
+spa_txg_history_add(spa_t *spa, uint64_t txg, hrtime_t birth_time)
+{
+ spa_history_list_t *shl = &spa->spa_stats.txg_history;
+ spa_txg_history_t *sth;
+
+ if (zfs_txg_history == 0 && shl->size == 0)
+ return;
+
+ sth = kmem_zalloc(sizeof (spa_txg_history_t), KM_SLEEP);
+ sth->txg = txg;
+ sth->state = TXG_STATE_OPEN;
+ sth->times[TXG_STATE_BIRTH] = birth_time;
+
+ mutex_enter(&shl->procfs_list.pl_lock);
+ procfs_list_add(&shl->procfs_list, sth);
+ shl->size++;
+ spa_txg_history_truncate(shl, zfs_txg_history);
+ mutex_exit(&shl->procfs_list.pl_lock);
+}
+
+/*
+ * Set txg state completion time and increment current state.
+ */
+int
+spa_txg_history_set(spa_t *spa, uint64_t txg, txg_state_t completed_state,
+ hrtime_t completed_time)
+{
+ spa_history_list_t *shl = &spa->spa_stats.txg_history;
+ spa_txg_history_t *sth;
+ int error = ENOENT;
+
+ if (zfs_txg_history == 0)
+ return (0);
+
+ mutex_enter(&shl->procfs_list.pl_lock);
+ for (sth = list_tail(&shl->procfs_list.pl_list); sth != NULL;
+ sth = list_prev(&shl->procfs_list.pl_list, sth)) {
+ if (sth->txg == txg) {
+ sth->times[completed_state] = completed_time;
+ sth->state++;
+ error = 0;
+ break;
+ }
+ }
+ mutex_exit(&shl->procfs_list.pl_lock);
+
+ return (error);
+}
+
+/*
+ * Set txg IO stats.
+ */
+static int
+spa_txg_history_set_io(spa_t *spa, uint64_t txg, uint64_t nread,
+ uint64_t nwritten, uint64_t reads, uint64_t writes, uint64_t ndirty)
+{
+ spa_history_list_t *shl = &spa->spa_stats.txg_history;
+ spa_txg_history_t *sth;
+ int error = ENOENT;
+
+ if (zfs_txg_history == 0)
+ return (0);
+
+ mutex_enter(&shl->procfs_list.pl_lock);
+ for (sth = list_tail(&shl->procfs_list.pl_list); sth != NULL;
+ sth = list_prev(&shl->procfs_list.pl_list, sth)) {
+ if (sth->txg == txg) {
+ sth->nread = nread;
+ sth->nwritten = nwritten;
+ sth->reads = reads;
+ sth->writes = writes;
+ sth->ndirty = ndirty;
+ error = 0;
+ break;
+ }
+ }
+ mutex_exit(&shl->procfs_list.pl_lock);
+
+ return (error);
+}
+
+txg_stat_t *
+spa_txg_history_init_io(spa_t *spa, uint64_t txg, dsl_pool_t *dp)
+{
+ txg_stat_t *ts;
+
+ if (zfs_txg_history == 0)
+ return (NULL);
+
+ ts = kmem_alloc(sizeof (txg_stat_t), KM_SLEEP);
+
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+ vdev_get_stats(spa->spa_root_vdev, &ts->vs1);
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+
+ ts->txg = txg;
+ ts->ndirty = dp->dp_dirty_pertxg[txg & TXG_MASK];
+
+ spa_txg_history_set(spa, txg, TXG_STATE_WAIT_FOR_SYNC, gethrtime());
+
+ return (ts);
+}
+
+void
+spa_txg_history_fini_io(spa_t *spa, txg_stat_t *ts)
+{
+ if (ts == NULL)
+ return;
+
+ if (zfs_txg_history == 0) {
+ kmem_free(ts, sizeof (txg_stat_t));
+ return;
+ }
+
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+ vdev_get_stats(spa->spa_root_vdev, &ts->vs2);
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+
+ spa_txg_history_set(spa, ts->txg, TXG_STATE_SYNCED, gethrtime());
+ spa_txg_history_set_io(spa, ts->txg,
+ ts->vs2.vs_bytes[ZIO_TYPE_READ] - ts->vs1.vs_bytes[ZIO_TYPE_READ],
+ ts->vs2.vs_bytes[ZIO_TYPE_WRITE] - ts->vs1.vs_bytes[ZIO_TYPE_WRITE],
+ ts->vs2.vs_ops[ZIO_TYPE_READ] - ts->vs1.vs_ops[ZIO_TYPE_READ],
+ ts->vs2.vs_ops[ZIO_TYPE_WRITE] - ts->vs1.vs_ops[ZIO_TYPE_WRITE],
+ ts->ndirty);
+
+ kmem_free(ts, sizeof (txg_stat_t));
+}
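+
+/*
+ * Lifecycle sketch (editor's illustration): the pair above brackets a
+ * txg sync so the vdev IO deltas can be attributed to that txg:
+ *
+ *	txg_stat_t *ts = spa_txg_history_init_io(spa, txg, dp);
+ *	... sync the txg ...
+ *	spa_txg_history_fini_io(spa, ts);
+ */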
+
+/*
+ * ==========================================================================
+ * SPA TX Assign Histogram Routines
+ * ==========================================================================
+ */
+
+/*
+ * Tx statistics - Information exported regarding dmu_tx_assign time.
+ */
+
+/*
+ * When the kstat is written, zero all buckets. When the kstat is read,
+ * count the number of trailing buckets set to zero and update ks_ndata
+ * so that they are not output.
+ */
+static int
+spa_tx_assign_update(kstat_t *ksp, int rw)
+{
+ spa_t *spa = ksp->ks_private;
+ spa_history_kstat_t *shk = &spa->spa_stats.tx_assign_histogram;
+ int i;
+
+ if (rw == KSTAT_WRITE) {
+ for (i = 0; i < shk->count; i++)
+ ((kstat_named_t *)shk->private)[i].value.ui64 = 0;
+ }
+
+ for (i = shk->count; i > 0; i--)
+ if (((kstat_named_t *)shk->private)[i-1].value.ui64 != 0)
+ break;
+
+ ksp->ks_ndata = i;
+ ksp->ks_data_size = i * sizeof (kstat_named_t);
+
+ return (0);
+}
+
+static void
+spa_tx_assign_init(spa_t *spa)
+{
+ spa_history_kstat_t *shk = &spa->spa_stats.tx_assign_histogram;
+ char *name;
+ kstat_named_t *ks;
+ kstat_t *ksp;
+ int i;
+
+ mutex_init(&shk->lock, NULL, MUTEX_DEFAULT, NULL);
+
+ shk->count = 42; /* power of two buckets for 1ns to 2,199s */
+ shk->size = shk->count * sizeof (kstat_named_t);
+ shk->private = kmem_alloc(shk->size, KM_SLEEP);
+
+ name = kmem_asprintf("zfs/%s", spa_name(spa));
+
+ for (i = 0; i < shk->count; i++) {
+ ks = &((kstat_named_t *)shk->private)[i];
+ ks->data_type = KSTAT_DATA_UINT64;
+ ks->value.ui64 = 0;
+ (void) snprintf(ks->name, KSTAT_STRLEN, "%llu ns",
+ (u_longlong_t)1 << i);
+ }
+
+ ksp = kstat_create(name, 0, "dmu_tx_assign", "misc",
+ KSTAT_TYPE_NAMED, 0, KSTAT_FLAG_VIRTUAL);
+ shk->kstat = ksp;
+
+ if (ksp) {
+ ksp->ks_lock = &shk->lock;
+ ksp->ks_data = shk->private;
+ ksp->ks_ndata = shk->count;
+ ksp->ks_data_size = shk->size;
+ ksp->ks_private = spa;
+ ksp->ks_update = spa_tx_assign_update;
+ kstat_install(ksp);
+ }
+ strfree(name);
+}
+
+static void
+spa_tx_assign_destroy(spa_t *spa)
+{
+ spa_history_kstat_t *shk = &spa->spa_stats.tx_assign_histogram;
+ kstat_t *ksp;
+
+ ksp = shk->kstat;
+ if (ksp)
+ kstat_delete(ksp);
+
+ kmem_free(shk->private, shk->size);
+ mutex_destroy(&shk->lock);
+}
+
+void
+spa_tx_assign_add_nsecs(spa_t *spa, uint64_t nsecs)
+{
+ spa_history_kstat_t *shk = &spa->spa_stats.tx_assign_histogram;
+ uint64_t idx = 0;
+
+	/* shk->private holds shk->count buckets; clamp to the last one */
+	while (((1ULL << idx) < nsecs) && (idx < shk->count - 1))
+ idx++;
+
+ atomic_inc_64(&((kstat_named_t *)shk->private)[idx].value.ui64);
+}
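+
+/*
+ * Usage sketch (editor's illustration): callers time dmu_tx_assign()
+ * and feed the elapsed nanoseconds into the histogram:
+ *
+ *	hrtime_t start = gethrtime();
+ *	... attempt the tx assignment ...
+ *	spa_tx_assign_add_nsecs(spa, gethrtime() - start);
+ */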
+
+/*
+ * ==========================================================================
+ * SPA IO History Routines
+ * ==========================================================================
+ */
+static int
+spa_io_history_update(kstat_t *ksp, int rw)
+{
+ if (rw == KSTAT_WRITE)
+ memset(ksp->ks_data, 0, ksp->ks_data_size);
+
+ return (0);
+}
+
+static void
+spa_io_history_init(spa_t *spa)
+{
+ spa_history_kstat_t *shk = &spa->spa_stats.io_history;
+ char *name;
+ kstat_t *ksp;
+
+ mutex_init(&shk->lock, NULL, MUTEX_DEFAULT, NULL);
+
+ name = kmem_asprintf("zfs/%s", spa_name(spa));
+
+ ksp = kstat_create(name, 0, "io", "disk", KSTAT_TYPE_IO, 1, 0);
+ shk->kstat = ksp;
+
+ if (ksp) {
+ ksp->ks_lock = &shk->lock;
+ ksp->ks_private = spa;
+ ksp->ks_update = spa_io_history_update;
+ kstat_install(ksp);
+ }
+ strfree(name);
+}
+
+static void
+spa_io_history_destroy(spa_t *spa)
+{
+ spa_history_kstat_t *shk = &spa->spa_stats.io_history;
+
+ if (shk->kstat)
+ kstat_delete(shk->kstat);
+
+ mutex_destroy(&shk->lock);
+}
+
+/*
+ * ==========================================================================
+ * SPA MMP History Routines
+ * ==========================================================================
+ */
+
+/*
+ * MMP statistics - Information exported regarding attempted MMP writes.
+ *
+ * For MMP writes issued, the fields are used as per the comments in the
+ * structure below. For MMP writes skipped, an entry represents a span of
+ * time when writes were skipped for the same reason (an error from
+ * mmp_random_leaf). The fields then differ as follows:
+ * timestamp	time the first write was skipped, if >1 skipped in a row
+ * mmp_delay	delay value at timestamp
+ * vdev_guid	number of writes skipped
+ * io_error	one of enum mmp_error
+ * duration	time span (ns) of skipped writes
+ */
+
+typedef struct spa_mmp_history {
+ uint64_t mmp_node_id; /* unique # for updates */
+ uint64_t txg; /* txg of last sync */
+ uint64_t timestamp; /* UTC time MMP write issued */
+ uint64_t mmp_delay; /* mmp_thread.mmp_delay at timestamp */
+ uint64_t vdev_guid; /* unique ID of leaf vdev */
+ char *vdev_path;
+ int vdev_label; /* vdev label */
+ int io_error; /* error status of MMP write */
+ hrtime_t error_start; /* hrtime of start of error period */
+ hrtime_t duration; /* time from submission to completion */
+ procfs_list_node_t smh_node;
+} spa_mmp_history_t;
+
+static int
+spa_mmp_history_show_header(struct seq_file *f)
+{
+ seq_printf(f, "%-10s %-10s %-10s %-6s %-10s %-12s %-24s "
+ "%-10s %s\n", "id", "txg", "timestamp", "error", "duration",
+ "mmp_delay", "vdev_guid", "vdev_label", "vdev_path");
+ return (0);
+}
+
+static int
+spa_mmp_history_show(struct seq_file *f, void *data)
+{
+ spa_mmp_history_t *smh = (spa_mmp_history_t *)data;
+ char skip_fmt[] = "%-10llu %-10llu %10llu %#6llx %10lld %12llu %-24llu "
+ "%-10lld %s\n";
+ char write_fmt[] = "%-10llu %-10llu %10llu %6lld %10lld %12llu %-24llu "
+ "%-10lld %s\n";
+
+ seq_printf(f, (smh->error_start ? skip_fmt : write_fmt),
+ (u_longlong_t)smh->mmp_node_id, (u_longlong_t)smh->txg,
+ (u_longlong_t)smh->timestamp, (longlong_t)smh->io_error,
+ (longlong_t)smh->duration, (u_longlong_t)smh->mmp_delay,
+ (u_longlong_t)smh->vdev_guid, (u_longlong_t)smh->vdev_label,
+ (smh->vdev_path ? smh->vdev_path : "-"));
+
+ return (0);
+}
+
+/* Remove oldest elements from list until there are no more than 'size' left */
+static void
+spa_mmp_history_truncate(spa_history_list_t *shl, unsigned int size)
+{
+ spa_mmp_history_t *smh;
+ while (shl->size > size) {
+ smh = list_remove_head(&shl->procfs_list.pl_list);
+ if (smh->vdev_path)
+ strfree(smh->vdev_path);
+ kmem_free(smh, sizeof (spa_mmp_history_t));
+ shl->size--;
+ }
+
+	if (size == 0)
+		ASSERT(list_is_empty(&shl->procfs_list.pl_list));
+}
+
+static int
+spa_mmp_history_clear(procfs_list_t *procfs_list)
+{
+ spa_history_list_t *shl = procfs_list->pl_private;
+ mutex_enter(&procfs_list->pl_lock);
+ spa_mmp_history_truncate(shl, 0);
+ mutex_exit(&procfs_list->pl_lock);
+ return (0);
+}
+
+static void
+spa_mmp_history_init(spa_t *spa)
+{
+ spa_history_list_t *shl = &spa->spa_stats.mmp_history;
+ char *module;
+
+ shl->size = 0;
+
+ module = kmem_asprintf("zfs/%s", spa_name(spa));
+
+ shl->procfs_list.pl_private = shl;
+ procfs_list_install(module,
+ "multihost",
+ 0644,
+ &shl->procfs_list,
+ spa_mmp_history_show,
+ spa_mmp_history_show_header,
+ spa_mmp_history_clear,
+ offsetof(spa_mmp_history_t, smh_node));
+
+ strfree(module);
+}
+
+static void
+spa_mmp_history_destroy(spa_t *spa)
+{
+ spa_history_list_t *shl = &spa->spa_stats.mmp_history;
+ procfs_list_uninstall(&shl->procfs_list);
+ spa_mmp_history_truncate(shl, 0);
+ procfs_list_destroy(&shl->procfs_list);
+}
+
+/*
+ * Set the duration in an existing "skip" record to how long we have
+ * waited for a leaf vdev to become available.
+ *
+ * It is important that we start the search at the tail of the list,
+ * where new records are inserted, so this is normally an O(1) operation.
+ */
+int
+spa_mmp_history_set_skip(spa_t *spa, uint64_t mmp_node_id)
+{
+ spa_history_list_t *shl = &spa->spa_stats.mmp_history;
+ spa_mmp_history_t *smh;
+ int error = ENOENT;
+
+ if (zfs_multihost_history == 0 && shl->size == 0)
+ return (0);
+
+ mutex_enter(&shl->procfs_list.pl_lock);
+ for (smh = list_tail(&shl->procfs_list.pl_list); smh != NULL;
+ smh = list_prev(&shl->procfs_list.pl_list, smh)) {
+ if (smh->mmp_node_id == mmp_node_id) {
+ ASSERT3U(smh->io_error, !=, 0);
+ smh->duration = gethrtime() - smh->error_start;
+ smh->vdev_guid++;
+ error = 0;
+ break;
+ }
+ }
+ mutex_exit(&shl->procfs_list.pl_lock);
+
+ return (error);
+}
+
+/*
+ * Set MMP write duration and error status in existing record.
+ * See comment re: search order above spa_mmp_history_set_skip().
+ */
+int
+spa_mmp_history_set(spa_t *spa, uint64_t mmp_node_id, int io_error,
+ hrtime_t duration)
+{
+ spa_history_list_t *shl = &spa->spa_stats.mmp_history;
+ spa_mmp_history_t *smh;
+ int error = ENOENT;
+
+ if (zfs_multihost_history == 0 && shl->size == 0)
+ return (0);
+
+ mutex_enter(&shl->procfs_list.pl_lock);
+ for (smh = list_tail(&shl->procfs_list.pl_list); smh != NULL;
+ smh = list_prev(&shl->procfs_list.pl_list, smh)) {
+ if (smh->mmp_node_id == mmp_node_id) {
+ ASSERT(smh->io_error == 0);
+ smh->io_error = io_error;
+ smh->duration = duration;
+ error = 0;
+ break;
+ }
+ }
+ mutex_exit(&shl->procfs_list.pl_lock);
+
+ return (error);
+}
+
+/*
+ * Add a new MMP historical record.
+ * error == 0 : a write was issued.
+ * error != 0 : a write was not issued because no leaves were found.
+ */
+void
+spa_mmp_history_add(spa_t *spa, uint64_t txg, uint64_t timestamp,
+ uint64_t mmp_delay, vdev_t *vd, int label, uint64_t mmp_node_id,
+ int error)
+{
+ spa_history_list_t *shl = &spa->spa_stats.mmp_history;
+ spa_mmp_history_t *smh;
+
+ if (zfs_multihost_history == 0 && shl->size == 0)
+ return;
+
+ smh = kmem_zalloc(sizeof (spa_mmp_history_t), KM_SLEEP);
+ smh->txg = txg;
+ smh->timestamp = timestamp;
+ smh->mmp_delay = mmp_delay;
+ if (vd) {
+ smh->vdev_guid = vd->vdev_guid;
+ if (vd->vdev_path)
+ smh->vdev_path = strdup(vd->vdev_path);
+ }
+ smh->vdev_label = label;
+ smh->mmp_node_id = mmp_node_id;
+
+ if (error) {
+ smh->io_error = error;
+ smh->error_start = gethrtime();
+ smh->vdev_guid = 1;
+ }
+
+ mutex_enter(&shl->procfs_list.pl_lock);
+ procfs_list_add(&shl->procfs_list, smh);
+ shl->size++;
+ spa_mmp_history_truncate(shl, zfs_multihost_history);
+ mutex_exit(&shl->procfs_list.pl_lock);
+}
+
+static void *
+spa_state_addr(kstat_t *ksp, loff_t n)
+{
+ return (ksp->ks_private); /* return the spa_t */
+}
+
+static int
+spa_state_data(char *buf, size_t size, void *data)
+{
+ spa_t *spa = (spa_t *)data;
+ (void) snprintf(buf, size, "%s\n", spa_state_to_name(spa));
+ return (0);
+}
+
+/*
+ * Return the state of the pool in /proc/spl/kstat/zfs/<pool>/state.
+ *
+ * This is a lock-less read of the pool's state (unlike using 'zpool',
+ * which can potentially block for seconds). Because it doesn't block,
+ * it can be useful as a pool heartbeat value.
+ */
+static void
+spa_state_init(spa_t *spa)
+{
+ spa_history_kstat_t *shk = &spa->spa_stats.state;
+ char *name;
+ kstat_t *ksp;
+
+ mutex_init(&shk->lock, NULL, MUTEX_DEFAULT, NULL);
+
+ name = kmem_asprintf("zfs/%s", spa_name(spa));
+ ksp = kstat_create(name, 0, "state", "misc",
+ KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL);
+
+ shk->kstat = ksp;
+ if (ksp) {
+ ksp->ks_lock = &shk->lock;
+ ksp->ks_data = NULL;
+ ksp->ks_private = spa;
+ ksp->ks_flags |= KSTAT_FLAG_NO_HEADERS;
+ kstat_set_raw_ops(ksp, NULL, spa_state_data, spa_state_addr);
+ kstat_install(ksp);
+ }
+
+ strfree(name);
+}
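+
+/*
+ * Example read (editor's illustration; "tank" is an example pool name):
+ *
+ *	$ cat /proc/spl/kstat/zfs/tank/state
+ *	ONLINE
+ */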
+
+static void
+spa_health_destroy(spa_t *spa)
+{
+ spa_history_kstat_t *shk = &spa->spa_stats.state;
+ kstat_t *ksp = shk->kstat;
+ if (ksp)
+ kstat_delete(ksp);
+
+ mutex_destroy(&shk->lock);
+}
+
+static spa_iostats_t spa_iostats_template = {
+ { "trim_extents_written", KSTAT_DATA_UINT64 },
+ { "trim_bytes_written", KSTAT_DATA_UINT64 },
+ { "trim_extents_skipped", KSTAT_DATA_UINT64 },
+ { "trim_bytes_skipped", KSTAT_DATA_UINT64 },
+ { "trim_extents_failed", KSTAT_DATA_UINT64 },
+ { "trim_bytes_failed", KSTAT_DATA_UINT64 },
+ { "autotrim_extents_written", KSTAT_DATA_UINT64 },
+ { "autotrim_bytes_written", KSTAT_DATA_UINT64 },
+ { "autotrim_extents_skipped", KSTAT_DATA_UINT64 },
+ { "autotrim_bytes_skipped", KSTAT_DATA_UINT64 },
+ { "autotrim_extents_failed", KSTAT_DATA_UINT64 },
+ { "autotrim_bytes_failed", KSTAT_DATA_UINT64 },
+};
+
+#define	SPA_IOSTATS_ADD(stat, val) \
+	atomic_add_64(&iostats->stat.value.ui64, (val))
+
+void
+spa_iostats_trim_add(spa_t *spa, trim_type_t type,
+ uint64_t extents_written, uint64_t bytes_written,
+ uint64_t extents_skipped, uint64_t bytes_skipped,
+ uint64_t extents_failed, uint64_t bytes_failed)
+{
+ spa_history_kstat_t *shk = &spa->spa_stats.iostats;
+ kstat_t *ksp = shk->kstat;
+ spa_iostats_t *iostats;
+
+ if (ksp == NULL)
+ return;
+
+ iostats = ksp->ks_data;
+ if (type == TRIM_TYPE_MANUAL) {
+ SPA_IOSTATS_ADD(trim_extents_written, extents_written);
+ SPA_IOSTATS_ADD(trim_bytes_written, bytes_written);
+ SPA_IOSTATS_ADD(trim_extents_skipped, extents_skipped);
+ SPA_IOSTATS_ADD(trim_bytes_skipped, bytes_skipped);
+ SPA_IOSTATS_ADD(trim_extents_failed, extents_failed);
+ SPA_IOSTATS_ADD(trim_bytes_failed, bytes_failed);
+ } else {
+ SPA_IOSTATS_ADD(autotrim_extents_written, extents_written);
+ SPA_IOSTATS_ADD(autotrim_bytes_written, bytes_written);
+ SPA_IOSTATS_ADD(autotrim_extents_skipped, extents_skipped);
+ SPA_IOSTATS_ADD(autotrim_bytes_skipped, bytes_skipped);
+ SPA_IOSTATS_ADD(autotrim_extents_failed, extents_failed);
+ SPA_IOSTATS_ADD(autotrim_bytes_failed, bytes_failed);
+ }
+}
+
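+/*
+ * Any write to this kstat resets the counters by copying the
+ * zero-initialized template back over the live data.
+ */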
+int
+spa_iostats_update(kstat_t *ksp, int rw)
+{
+ if (rw == KSTAT_WRITE) {
+ memcpy(ksp->ks_data, &spa_iostats_template,
+ sizeof (spa_iostats_t));
+ }
+
+ return (0);
+}
+
+static void
+spa_iostats_init(spa_t *spa)
+{
+ spa_history_kstat_t *shk = &spa->spa_stats.iostats;
+
+ mutex_init(&shk->lock, NULL, MUTEX_DEFAULT, NULL);
+
+ char *name = kmem_asprintf("zfs/%s", spa_name(spa));
+ kstat_t *ksp = kstat_create(name, 0, "iostats", "misc",
+ KSTAT_TYPE_NAMED, sizeof (spa_iostats_t) / sizeof (kstat_named_t),
+ KSTAT_FLAG_VIRTUAL);
+
+ shk->kstat = ksp;
+ if (ksp) {
+ int size = sizeof (spa_iostats_t);
+ ksp->ks_lock = &shk->lock;
+ ksp->ks_private = spa;
+ ksp->ks_update = spa_iostats_update;
+ ksp->ks_data = kmem_alloc(size, KM_SLEEP);
+ memcpy(ksp->ks_data, &spa_iostats_template, size);
+ kstat_install(ksp);
+ }
+
+ strfree(name);
+}
+
+static void
+spa_iostats_destroy(spa_t *spa)
+{
+ spa_history_kstat_t *shk = &spa->spa_stats.iostats;
+ kstat_t *ksp = shk->kstat;
+ if (ksp) {
+ kmem_free(ksp->ks_data, sizeof (spa_iostats_t));
+ kstat_delete(ksp);
+ }
+
+ mutex_destroy(&shk->lock);
+}
+
+void
+spa_stats_init(spa_t *spa)
+{
+ spa_read_history_init(spa);
+ spa_txg_history_init(spa);
+ spa_tx_assign_init(spa);
+ spa_io_history_init(spa);
+ spa_mmp_history_init(spa);
+ spa_state_init(spa);
+ spa_iostats_init(spa);
+}
+
+void
+spa_stats_destroy(spa_t *spa)
+{
+ spa_iostats_destroy(spa);
+ spa_health_destroy(spa);
+ spa_tx_assign_destroy(spa);
+ spa_txg_history_destroy(spa);
+ spa_read_history_destroy(spa);
+ spa_io_history_destroy(spa);
+ spa_mmp_history_destroy(spa);
+}
+
+#if defined(_KERNEL)
+/* CSTYLED */
+module_param(zfs_read_history, int, 0644);
+MODULE_PARM_DESC(zfs_read_history,
+ "Historical statistics for the last N reads");
+
+module_param(zfs_read_history_hits, int, 0644);
+MODULE_PARM_DESC(zfs_read_history_hits,
+ "Include cache hits in read history");
+
+module_param(zfs_txg_history, int, 0644);
+MODULE_PARM_DESC(zfs_txg_history,
+ "Historical statistics for the last N txgs");
+
+module_param(zfs_multihost_history, int, 0644);
+MODULE_PARM_DESC(zfs_multihost_history,
+	"Historical statistics for the last N multihost writes");
+/* END CSTYLED */
+#endif
diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c
new file mode 100644
index 000000000..21f9ae454
--- /dev/null
+++ b/module/os/linux/zfs/vdev_disk.c
@@ -0,0 +1,954 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (C) 2008-2010 Lawrence Livermore National Security, LLC.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Rewritten for Linux by Brian Behlendorf <[email protected]>.
+ * LLNL-CODE-403049.
+ * Copyright (c) 2012, 2019 by Delphix. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa_impl.h>
+#include <sys/vdev_disk.h>
+#include <sys/vdev_impl.h>
+#include <sys/vdev_trim.h>
+#include <sys/abd.h>
+#include <sys/fs/zfs.h>
+#include <sys/zio.h>
+#include <linux/msdos_fs.h>
+#include <linux/vfs_compat.h>
+
+char *zfs_vdev_scheduler = VDEV_SCHEDULER;
+static void *zfs_vdev_holder = VDEV_HOLDER;
+
+/* size of the "reserved" partition, in blocks */
+#define EFI_MIN_RESV_SIZE (16 * 1024)
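+/* At 512-byte blocks this is 8 MiB (16384 * 512 bytes). */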
+
+/*
+ * Virtual device vector for disks.
+ */
+typedef struct dio_request {
+ zio_t *dr_zio; /* Parent ZIO */
+ atomic_t dr_ref; /* References */
+ int dr_error; /* Bio error */
+ int dr_bio_count; /* Count of bio's */
+ struct bio *dr_bio[0]; /* Attached bio's */
+} dio_request_t;
+
+
+#if defined(HAVE_OPEN_BDEV_EXCLUSIVE) || defined(HAVE_BLKDEV_GET_BY_PATH)
+static fmode_t
+vdev_bdev_mode(int smode)
+{
+ fmode_t mode = 0;
+
+ ASSERT3S(smode & (FREAD | FWRITE), !=, 0);
+
+ if (smode & FREAD)
+ mode |= FMODE_READ;
+
+ if (smode & FWRITE)
+ mode |= FMODE_WRITE;
+
+ return (mode);
+}
+#else
+static int
+vdev_bdev_mode(int smode)
+{
+ int mode = 0;
+
+ ASSERT3S(smode & (FREAD | FWRITE), !=, 0);
+
+ if ((smode & FREAD) && !(smode & FWRITE))
+ mode = SB_RDONLY;
+
+ return (mode);
+}
+#endif /* HAVE_OPEN_BDEV_EXCLUSIVE */
+
+/*
+ * Returns the usable capacity (in bytes) for the partition or disk.
+ */
+static uint64_t
+bdev_capacity(struct block_device *bdev)
+{
+ return (i_size_read(bdev->bd_inode));
+}
+
+/*
+ * Returns the maximum expansion capacity of the block device (in bytes).
+ *
+ * It is possible to expand a vdev when it has been created as a wholedisk
+ * and the containing block device has increased in capacity. Or when the
+ * partition containing the pool has been manually increased in size.
+ *
+ * This function is only responsible for calculating the potential expansion
+ * size so it can be reported by 'zpool list'. efi_use_whole_disk() is
+ * responsible for verifying the expected partition layout in the wholedisk
+ * case, and updating the partition table if appropriate. Once the partition
+ * size has been increased the additional capacity will be visible using
+ * bdev_capacity().
+ *
+ * The returned maximum expansion capacity is always expected to be larger, or
+ * at the very least equal, to its usable capacity to prevent overestimating
+ * the pool expandsize.
+ */
+static uint64_t
+bdev_max_capacity(struct block_device *bdev, uint64_t wholedisk)
+{
+ uint64_t psize;
+ int64_t available;
+
+ if (wholedisk && bdev->bd_part != NULL && bdev != bdev->bd_contains) {
+ /*
+		 * When reporting the maximum expansion capacity for a
+		 * wholedisk, deduct any capacity expected to be lost due to
+		 * alignment restrictions. Over-reporting this value isn't
+ * harmful and would only result in slightly less capacity
+ * than expected post expansion.
+ * The estimated available space may be slightly smaller than
+ * bdev_capacity() for devices where the number of sectors is
+ * not a multiple of the alignment size and the partition layout
+ * is keeping less than PARTITION_END_ALIGNMENT bytes after the
+ * "reserved" EFI partition: in such cases return the device
+ * usable capacity.
+ */
+ available = i_size_read(bdev->bd_contains->bd_inode) -
+ ((EFI_MIN_RESV_SIZE + NEW_START_BLOCK +
+ PARTITION_END_ALIGNMENT) << SECTOR_BITS);
+ psize = MAX(available, bdev_capacity(bdev));
+ } else {
+ psize = bdev_capacity(bdev);
+ }
+
+ return (psize);
+}
+
+static void
+vdev_disk_error(zio_t *zio)
+{
+ /*
+ * This function can be called in interrupt context, for instance while
+ * handling IRQs coming from a misbehaving disk device; use printk()
+ * which is safe from any context.
+ */
+ printk(KERN_WARNING "zio pool=%s vdev=%s error=%d type=%d "
+ "offset=%llu size=%llu flags=%x\n", spa_name(zio->io_spa),
+ zio->io_vd->vdev_path, zio->io_error, zio->io_type,
+ (u_longlong_t)zio->io_offset, (u_longlong_t)zio->io_size,
+ zio->io_flags);
+}
+
+/*
+ * Use the Linux 'noop' elevator for zfs-managed block devices. This
+ * strikes the ideal balance: the zfs elevator does all request ordering
+ * and prioritization, while the Linux elevator is left to do the
+ * maximum front/back merging allowed by the physical device. This
+ * yields the largest possible requests for the device with the lowest
+ * total overhead.
+ */
+static void
+vdev_elevator_switch(vdev_t *v, char *elevator)
+{
+ vdev_disk_t *vd = v->vdev_tsd;
+ struct request_queue *q;
+ char *device;
+ int error;
+
+ for (int c = 0; c < v->vdev_children; c++)
+ vdev_elevator_switch(v->vdev_child[c], elevator);
+
+ if (!v->vdev_ops->vdev_op_leaf || vd->vd_bdev == NULL)
+ return;
+
+ q = bdev_get_queue(vd->vd_bdev);
+ device = vd->vd_bdev->bd_disk->disk_name;
+
+ /*
+ * Skip devices which are not whole disks (partitions).
+ * Device-mapper devices are excepted since they may be whole
+ * disks despite the vdev_wholedisk flag, in which case we can
+ * and should switch the elevator. If the device-mapper device
+ * does not have an elevator (i.e. dm-raid, dm-crypt, etc.) the
+ * "Skip devices without schedulers" check below will fail.
+ */
+ if (!v->vdev_wholedisk && strncmp(device, "dm-", 3) != 0)
+ return;
+
+ /* Leave existing scheduler when set to "none" */
+ if ((strncmp(elevator, "none", 4) == 0) && (strlen(elevator) == 4))
+ return;
+
+ /*
+ * The elevator_change() function was available in kernels from
+	 * 2.6.36 to 4.11. When it is not available, fall back to the
+	 * usermode helper to set the elevator via sysfs. This requires
+	 * /bin/echo and a mounted sysfs, which may not be true early in
+	 * the boot process.
+ */
+#ifdef HAVE_ELEVATOR_CHANGE
+ error = elevator_change(q, elevator);
+#else
+#define SET_SCHEDULER_CMD \
+ "exec 0</dev/null " \
+ " 1>/sys/block/%s/queue/scheduler " \
+ " 2>/dev/null; " \
+ "echo %s"
+
+ char *argv[] = { "/bin/sh", "-c", NULL, NULL };
+ char *envp[] = { NULL };
+
+ argv[2] = kmem_asprintf(SET_SCHEDULER_CMD, device, elevator);
+ error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
+ strfree(argv[2]);
+#endif /* HAVE_ELEVATOR_CHANGE */
+ if (error) {
+ zfs_dbgmsg("Unable to set \"%s\" scheduler for %s (%s): %d",
+ elevator, v->vdev_path, device, error);
+ }
+}
+
+static int
+vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
+ uint64_t *ashift)
+{
+ struct block_device *bdev;
+ fmode_t mode = vdev_bdev_mode(spa_mode(v->vdev_spa));
+ int count = 0, block_size;
+ int bdev_retry_count = 50;
+ vdev_disk_t *vd;
+
+ /* Must have a pathname and it must be absolute. */
+ if (v->vdev_path == NULL || v->vdev_path[0] != '/') {
+ v->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
+ vdev_dbgmsg(v, "invalid vdev_path");
+ return (SET_ERROR(EINVAL));
+ }
+
+ /*
+	 * Reopen the device if it is currently open. When expanding a
+	 * partition, force a re-scan of the partition table while closed
+	 * in order to get an accurate updated block device size. Then,
+	 * since udev may need to recreate the device links, increase the
+	 * open retry count before reporting the device as unavailable.
+ */
+ vd = v->vdev_tsd;
+ if (vd) {
+ char disk_name[BDEVNAME_SIZE + 6] = "/dev/";
+ boolean_t reread_part = B_FALSE;
+
+ rw_enter(&vd->vd_lock, RW_WRITER);
+ bdev = vd->vd_bdev;
+ vd->vd_bdev = NULL;
+
+ if (bdev) {
+ if (v->vdev_expanding && bdev != bdev->bd_contains) {
+ bdevname(bdev->bd_contains, disk_name + 5);
+ reread_part = B_TRUE;
+ }
+
+ vdev_bdev_close(bdev, mode);
+ }
+
+ if (reread_part) {
+ bdev = vdev_bdev_open(disk_name, mode, zfs_vdev_holder);
+ if (!IS_ERR(bdev)) {
+ int error = vdev_bdev_reread_part(bdev);
+ vdev_bdev_close(bdev, mode);
+ if (error == 0)
+ bdev_retry_count = 100;
+ }
+ }
+ } else {
+ vd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP);
+
+ rw_init(&vd->vd_lock, NULL, RW_DEFAULT, NULL);
+ rw_enter(&vd->vd_lock, RW_WRITER);
+ }
+
+ /*
+ * Devices are always opened by the path provided at configuration
+ * time. This means that if the provided path is a udev by-id path
+ * then drives may be re-cabled without an issue. If the provided
+ * path is a udev by-path path, then the physical location information
+ * will be preserved. This can be critical for more complicated
+ * configurations where drives are located in specific physical
+	 * locations to maximize the system's tolerance to component failure.
+ *
+ * Alternatively, you can provide your own udev rule to flexibly map
+ * the drives as you see fit. It is not advised that you use the
+ * /dev/[hd]d devices which may be reordered due to probing order.
+ * Devices in the wrong locations will be detected by the higher
+ * level vdev validation.
+ *
+ * The specified paths may be briefly removed and recreated in
+ * response to udev events. This should be exceptionally unlikely
+ * because the zpool command makes every effort to verify these paths
+ * have already settled prior to reaching this point. Therefore,
+	 * an ENOENT failure at this point is highly likely to be transient
+ * and it is reasonable to sleep and retry before giving up. In
+ * practice delays have been observed to be on the order of 100ms.
+ */
+ bdev = ERR_PTR(-ENXIO);
+ while (IS_ERR(bdev) && count < bdev_retry_count) {
+ bdev = vdev_bdev_open(v->vdev_path, mode, zfs_vdev_holder);
+ if (unlikely(PTR_ERR(bdev) == -ENOENT)) {
+ schedule_timeout(MSEC_TO_TICK(10));
+ count++;
+ } else if (IS_ERR(bdev)) {
+ break;
+ }
+ }
+
+ if (IS_ERR(bdev)) {
+ int error = -PTR_ERR(bdev);
+ vdev_dbgmsg(v, "open error=%d count=%d", error, count);
+ vd->vd_bdev = NULL;
+ v->vdev_tsd = vd;
+ rw_exit(&vd->vd_lock);
+ return (SET_ERROR(error));
+ } else {
+ vd->vd_bdev = bdev;
+ v->vdev_tsd = vd;
+ rw_exit(&vd->vd_lock);
+ }
+
+ struct request_queue *q = bdev_get_queue(vd->vd_bdev);
+
+ /* Determine the physical block size */
+ block_size = vdev_bdev_block_size(vd->vd_bdev);
+
+ /* Clear the nowritecache bit, causes vdev_reopen() to try again. */
+ v->vdev_nowritecache = B_FALSE;
+
+ /* Set when device reports it supports TRIM. */
+ v->vdev_has_trim = !!blk_queue_discard(q);
+
+ /* Set when device reports it supports secure TRIM. */
+ v->vdev_has_securetrim = !!blk_queue_discard_secure(q);
+
+ /* Inform the ZIO pipeline that we are non-rotational */
+ v->vdev_nonrot = blk_queue_nonrot(q);
+
+ /* Physical volume size in bytes for the partition */
+ *psize = bdev_capacity(vd->vd_bdev);
+
+ /* Physical volume size in bytes including possible expansion space */
+ *max_psize = bdev_max_capacity(vd->vd_bdev, v->vdev_wholedisk);
+
+	/* Based on the minimum sector size, derive the block shift (ashift) */
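+	/* e.g., a 4096-byte block size yields ashift = 12 (1 << 12 == 4096) */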
+ *ashift = highbit64(MAX(block_size, SPA_MINBLOCKSIZE)) - 1;
+
+ /* Try to set the io scheduler elevator algorithm */
+ (void) vdev_elevator_switch(v, zfs_vdev_scheduler);
+
+ return (0);
+}
+
+static void
+vdev_disk_close(vdev_t *v)
+{
+ vdev_disk_t *vd = v->vdev_tsd;
+
+ if (v->vdev_reopening || vd == NULL)
+ return;
+
+ if (vd->vd_bdev != NULL) {
+ vdev_bdev_close(vd->vd_bdev,
+ vdev_bdev_mode(spa_mode(v->vdev_spa)));
+ }
+
+ rw_destroy(&vd->vd_lock);
+ kmem_free(vd, sizeof (vdev_disk_t));
+ v->vdev_tsd = NULL;
+}
+
+static dio_request_t *
+vdev_disk_dio_alloc(int bio_count)
+{
+ dio_request_t *dr;
+ int i;
+
+ dr = kmem_zalloc(sizeof (dio_request_t) +
+ sizeof (struct bio *) * bio_count, KM_SLEEP);
+ if (dr) {
+ atomic_set(&dr->dr_ref, 0);
+ dr->dr_bio_count = bio_count;
+ dr->dr_error = 0;
+
+ for (i = 0; i < dr->dr_bio_count; i++)
+ dr->dr_bio[i] = NULL;
+ }
+
+ return (dr);
+}
+
+static void
+vdev_disk_dio_free(dio_request_t *dr)
+{
+ int i;
+
+ for (i = 0; i < dr->dr_bio_count; i++)
+ if (dr->dr_bio[i])
+ bio_put(dr->dr_bio[i]);
+
+ kmem_free(dr, sizeof (dio_request_t) +
+ sizeof (struct bio *) * dr->dr_bio_count);
+}
+
+static void
+vdev_disk_dio_get(dio_request_t *dr)
+{
+ atomic_inc(&dr->dr_ref);
+}
+
+static int
+vdev_disk_dio_put(dio_request_t *dr)
+{
+ int rc = atomic_dec_return(&dr->dr_ref);
+
+ /*
+ * Free the dio_request when the last reference is dropped and
+ * ensure zio_interpret is called only once with the correct zio
+ */
+ if (rc == 0) {
+ zio_t *zio = dr->dr_zio;
+ int error = dr->dr_error;
+
+ vdev_disk_dio_free(dr);
+
+ if (zio) {
+ zio->io_error = error;
+ ASSERT3S(zio->io_error, >=, 0);
+ if (zio->io_error)
+ vdev_disk_error(zio);
+
+ zio_delay_interrupt(zio);
+ }
+ }
+
+ return (rc);
+}
+
+BIO_END_IO_PROTO(vdev_disk_physio_completion, bio, error)
+{
+ dio_request_t *dr = bio->bi_private;
+ int rc;
+
+ if (dr->dr_error == 0) {
+#ifdef HAVE_1ARG_BIO_END_IO_T
+ dr->dr_error = BIO_END_IO_ERROR(bio);
+#else
+ if (error)
+ dr->dr_error = -(error);
+ else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
+ dr->dr_error = EIO;
+#endif
+ }
+
+ /* Drop reference acquired by __vdev_disk_physio */
+ rc = vdev_disk_dio_put(dr);
+}
+
+static unsigned int
+bio_map(struct bio *bio, void *bio_ptr, unsigned int bio_size)
+{
+ unsigned int offset, size, i;
+ struct page *page;
+
+ offset = offset_in_page(bio_ptr);
+ for (i = 0; i < bio->bi_max_vecs; i++) {
+ size = PAGE_SIZE - offset;
+
+ if (bio_size <= 0)
+ break;
+
+ if (size > bio_size)
+ size = bio_size;
+
+ if (is_vmalloc_addr(bio_ptr))
+ page = vmalloc_to_page(bio_ptr);
+ else
+ page = virt_to_page(bio_ptr);
+
+ /*
+		 * Some network-related block devices use tcp_sendpage, which
+		 * doesn't behave well with 0-count pages; this is a safety
+		 * net to catch them.
+ */
+ ASSERT3S(page_count(page), >, 0);
+
+ if (bio_add_page(bio, page, size, offset) != size)
+ break;
+
+ bio_ptr += size;
+ bio_size -= size;
+ offset = 0;
+ }
+
+ return (bio_size);
+}
+
+static unsigned int
+bio_map_abd_off(struct bio *bio, abd_t *abd, unsigned int size, size_t off)
+{
+ if (abd_is_linear(abd))
+ return (bio_map(bio, ((char *)abd_to_buf(abd)) + off, size));
+
+ return (abd_scatter_bio_map_off(bio, abd, size, off));
+}
+
+static inline void
+vdev_submit_bio_impl(struct bio *bio)
+{
+#ifdef HAVE_1ARG_SUBMIT_BIO
+ submit_bio(bio);
+#else
+ submit_bio(0, bio);
+#endif
+}
+
+#ifdef HAVE_BIO_SET_DEV
+#if defined(CONFIG_BLK_CGROUP) && defined(HAVE_BIO_SET_DEV_GPL_ONLY)
+/*
+ * The Linux 5.0 kernel updated the bio_set_dev() macro so it calls the
+ * GPL-only bio_associate_blkg() symbol, thus inadvertently converting
+ * the entire macro to GPL-only. Provide a minimal version which always
+ * assigns the request queue's root_blkg to the bio.
+ */
+static inline void
+vdev_bio_associate_blkg(struct bio *bio)
+{
+ struct request_queue *q = bio->bi_disk->queue;
+
+ ASSERT3P(q, !=, NULL);
+ ASSERT3P(bio->bi_blkg, ==, NULL);
+
+ if (blkg_tryget(q->root_blkg))
+ bio->bi_blkg = q->root_blkg;
+}
+#define bio_associate_blkg vdev_bio_associate_blkg
+#endif
+#else
+/*
+ * Provide a bio_set_dev() helper for pre-Linux 4.14 kernels.
+ */
+static inline void
+bio_set_dev(struct bio *bio, struct block_device *bdev)
+{
+ bio->bi_bdev = bdev;
+}
+#endif /* HAVE_BIO_SET_DEV */
+
+static inline void
+vdev_submit_bio(struct bio *bio)
+{
+#ifdef HAVE_CURRENT_BIO_TAIL
+ struct bio **bio_tail = current->bio_tail;
+ current->bio_tail = NULL;
+ vdev_submit_bio_impl(bio);
+ current->bio_tail = bio_tail;
+#else
+ struct bio_list *bio_list = current->bio_list;
+ current->bio_list = NULL;
+ vdev_submit_bio_impl(bio);
+ current->bio_list = bio_list;
+#endif
+}
+
+static int
+__vdev_disk_physio(struct block_device *bdev, zio_t *zio,
+ size_t io_size, uint64_t io_offset, int rw, int flags)
+{
+ dio_request_t *dr;
+ uint64_t abd_offset;
+ uint64_t bio_offset;
+ int bio_size, bio_count = 16;
+ int i = 0, error = 0;
+#if defined(HAVE_BLK_QUEUE_HAVE_BLK_PLUG)
+ struct blk_plug plug;
+#endif
+ /*
+ * Accessing outside the block device is never allowed.
+ */
+ if (io_offset + io_size > bdev->bd_inode->i_size) {
+ vdev_dbgmsg(zio->io_vd,
+ "Illegal access %llu size %llu, device size %llu",
+ io_offset, io_size, i_size_read(bdev->bd_inode));
+ return (SET_ERROR(EIO));
+ }
+
+retry:
+ dr = vdev_disk_dio_alloc(bio_count);
+ if (dr == NULL)
+ return (SET_ERROR(ENOMEM));
+
+ if (zio && !(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)))
+ bio_set_flags_failfast(bdev, &flags);
+
+ dr->dr_zio = zio;
+
+ /*
+ * When the IO size exceeds the maximum bio size for the request
+ * queue we are forced to break the IO in multiple bio's and wait
+ * for them all to complete. Ideally, all pool users will set
+ * their volume block size to match the maximum request size and
+ * the common case will be one bio per vdev IO request.
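+	 * For example, a 1 MiB request against a queue that accepts at
+	 * most 512 KiB per bio would be split into two bio's.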
+ */
+
+ abd_offset = 0;
+ bio_offset = io_offset;
+ bio_size = io_size;
+ for (i = 0; i <= dr->dr_bio_count; i++) {
+
+ /* Finished constructing bio's for given buffer */
+ if (bio_size <= 0)
+ break;
+
+ /*
+ * By default only 'bio_count' bio's per dio are allowed.
+ * However, if we find ourselves in a situation where more
+ * are needed we allocate a larger dio and warn the user.
+ */
+ if (dr->dr_bio_count == i) {
+ vdev_disk_dio_free(dr);
+ bio_count *= 2;
+ goto retry;
+ }
+
+ /* bio_alloc() with __GFP_WAIT never returns NULL */
+ dr->dr_bio[i] = bio_alloc(GFP_NOIO,
+ MIN(abd_nr_pages_off(zio->io_abd, bio_size, abd_offset),
+ BIO_MAX_PAGES));
+ if (unlikely(dr->dr_bio[i] == NULL)) {
+ vdev_disk_dio_free(dr);
+ return (SET_ERROR(ENOMEM));
+ }
+
+ /* Matching put called by vdev_disk_physio_completion */
+ vdev_disk_dio_get(dr);
+
+ bio_set_dev(dr->dr_bio[i], bdev);
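+		/* Convert the byte offset to a 512-byte sector index */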
+ BIO_BI_SECTOR(dr->dr_bio[i]) = bio_offset >> 9;
+ dr->dr_bio[i]->bi_end_io = vdev_disk_physio_completion;
+ dr->dr_bio[i]->bi_private = dr;
+ bio_set_op_attrs(dr->dr_bio[i], rw, flags);
+
+ /* Remaining size is returned to become the new size */
+ bio_size = bio_map_abd_off(dr->dr_bio[i], zio->io_abd,
+ bio_size, abd_offset);
+
+ /* Advance in buffer and construct another bio if needed */
+ abd_offset += BIO_BI_SIZE(dr->dr_bio[i]);
+ bio_offset += BIO_BI_SIZE(dr->dr_bio[i]);
+ }
+
+ /* Extra reference to protect dio_request during vdev_submit_bio */
+ vdev_disk_dio_get(dr);
+
+#if defined(HAVE_BLK_QUEUE_HAVE_BLK_PLUG)
+ if (dr->dr_bio_count > 1)
+ blk_start_plug(&plug);
+#endif
+
+ /* Submit all bio's associated with this dio */
+ for (i = 0; i < dr->dr_bio_count; i++)
+ if (dr->dr_bio[i])
+ vdev_submit_bio(dr->dr_bio[i]);
+
+#if defined(HAVE_BLK_QUEUE_HAVE_BLK_PLUG)
+ if (dr->dr_bio_count > 1)
+ blk_finish_plug(&plug);
+#endif
+
+ (void) vdev_disk_dio_put(dr);
+
+ return (error);
+}
+
+BIO_END_IO_PROTO(vdev_disk_io_flush_completion, bio, error)
+{
+ zio_t *zio = bio->bi_private;
+#ifdef HAVE_1ARG_BIO_END_IO_T
+ zio->io_error = BIO_END_IO_ERROR(bio);
+#else
+ zio->io_error = -error;
+#endif
+
+ if (zio->io_error && (zio->io_error == EOPNOTSUPP))
+ zio->io_vd->vdev_nowritecache = B_TRUE;
+
+ bio_put(bio);
+ ASSERT3S(zio->io_error, >=, 0);
+ if (zio->io_error)
+ vdev_disk_error(zio);
+ zio_interrupt(zio);
+}
+
+static int
+vdev_disk_io_flush(struct block_device *bdev, zio_t *zio)
+{
+ struct request_queue *q;
+ struct bio *bio;
+
+ q = bdev_get_queue(bdev);
+ if (!q)
+ return (SET_ERROR(ENXIO));
+
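+	/*
+	 * An empty (zero-segment) bio with the flush flag set acts as a
+	 * pure cache-flush request.
+	 */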
+ bio = bio_alloc(GFP_NOIO, 0);
+ /* bio_alloc() with __GFP_WAIT never returns NULL */
+ if (unlikely(bio == NULL))
+ return (SET_ERROR(ENOMEM));
+
+ bio->bi_end_io = vdev_disk_io_flush_completion;
+ bio->bi_private = zio;
+ bio_set_dev(bio, bdev);
+ bio_set_flush(bio);
+ vdev_submit_bio(bio);
+ invalidate_bdev(bdev);
+
+ return (0);
+}
+
+static void
+vdev_disk_io_start(zio_t *zio)
+{
+ vdev_t *v = zio->io_vd;
+ vdev_disk_t *vd = v->vdev_tsd;
+ unsigned long trim_flags = 0;
+ int rw, flags, error;
+
+ /*
+ * If the vdev is closed, it's likely in the REMOVED or FAULTED state.
+ * Nothing to be done here but return failure.
+ */
+ if (vd == NULL) {
+ zio->io_error = ENXIO;
+ zio_interrupt(zio);
+ return;
+ }
+
+ rw_enter(&vd->vd_lock, RW_READER);
+
+ /*
+ * If the vdev is closed, it's likely due to a failed reopen and is
+ * in the UNAVAIL state. Nothing to be done here but return failure.
+ */
+ if (vd->vd_bdev == NULL) {
+ rw_exit(&vd->vd_lock);
+ zio->io_error = ENXIO;
+ zio_interrupt(zio);
+ return;
+ }
+
+ switch (zio->io_type) {
+ case ZIO_TYPE_IOCTL:
+
+ if (!vdev_readable(v)) {
+ rw_exit(&vd->vd_lock);
+ zio->io_error = SET_ERROR(ENXIO);
+ zio_interrupt(zio);
+ return;
+ }
+
+ switch (zio->io_cmd) {
+ case DKIOCFLUSHWRITECACHE:
+
+ if (zfs_nocacheflush)
+ break;
+
+ if (v->vdev_nowritecache) {
+ zio->io_error = SET_ERROR(ENOTSUP);
+ break;
+ }
+
+ error = vdev_disk_io_flush(vd->vd_bdev, zio);
+ if (error == 0) {
+ rw_exit(&vd->vd_lock);
+ return;
+ }
+
+ zio->io_error = error;
+
+ break;
+
+ default:
+ zio->io_error = SET_ERROR(ENOTSUP);
+ }
+
+ rw_exit(&vd->vd_lock);
+ zio_execute(zio);
+ return;
+ case ZIO_TYPE_WRITE:
+ rw = WRITE;
+#if defined(HAVE_BLK_QUEUE_HAVE_BIO_RW_UNPLUG)
+ flags = (1 << BIO_RW_UNPLUG);
+#elif defined(REQ_UNPLUG)
+ flags = REQ_UNPLUG;
+#else
+ flags = 0;
+#endif
+ break;
+
+ case ZIO_TYPE_READ:
+ rw = READ;
+#if defined(HAVE_BLK_QUEUE_HAVE_BIO_RW_UNPLUG)
+ flags = (1 << BIO_RW_UNPLUG);
+#elif defined(REQ_UNPLUG)
+ flags = REQ_UNPLUG;
+#else
+ flags = 0;
+#endif
+ break;
+
+ case ZIO_TYPE_TRIM:
+#if defined(BLKDEV_DISCARD_SECURE)
+ if (zio->io_trim_flags & ZIO_TRIM_SECURE)
+ trim_flags |= BLKDEV_DISCARD_SECURE;
+#endif
+ zio->io_error = -blkdev_issue_discard(vd->vd_bdev,
+ zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS,
+ trim_flags);
+
+ rw_exit(&vd->vd_lock);
+ zio_interrupt(zio);
+ return;
+
+ default:
+ rw_exit(&vd->vd_lock);
+ zio->io_error = SET_ERROR(ENOTSUP);
+ zio_interrupt(zio);
+ return;
+ }
+
+ zio->io_target_timestamp = zio_handle_io_delay(zio);
+ error = __vdev_disk_physio(vd->vd_bdev, zio,
+ zio->io_size, zio->io_offset, rw, flags);
+ rw_exit(&vd->vd_lock);
+
+ if (error) {
+ zio->io_error = error;
+ zio_interrupt(zio);
+ return;
+ }
+}
+
+static void
+vdev_disk_io_done(zio_t *zio)
+{
+ /*
+ * If the device returned EIO, we revalidate the media. If it is
+ * determined the media has changed this triggers the asynchronous
+ * removal of the device from the configuration.
+ */
+ if (zio->io_error == EIO) {
+ vdev_t *v = zio->io_vd;
+ vdev_disk_t *vd = v->vdev_tsd;
+
+ if (check_disk_change(vd->vd_bdev)) {
+ vdev_bdev_invalidate(vd->vd_bdev);
+ v->vdev_remove_wanted = B_TRUE;
+ spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE);
+ }
+ }
+}
+
+static void
+vdev_disk_hold(vdev_t *vd)
+{
+ ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));
+
+ /* We must have a pathname, and it must be absolute. */
+ if (vd->vdev_path == NULL || vd->vdev_path[0] != '/')
+ return;
+
+ /*
+ * Only prefetch path and devid info if the device has
+ * never been opened.
+ */
+ if (vd->vdev_tsd != NULL)
+ return;
+
+ /* XXX: Implement me as a vnode lookup for the device */
+ vd->vdev_name_vp = NULL;
+ vd->vdev_devid_vp = NULL;
+}
+
+static void
+vdev_disk_rele(vdev_t *vd)
+{
+ ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));
+
+ /* XXX: Implement me as a vnode rele for the device */
+}
+
+static int
+param_set_vdev_scheduler(const char *val, zfs_kernel_param_t *kp)
+{
+ spa_t *spa = NULL;
+ char *p;
+
+ if (val == NULL)
+ return (SET_ERROR(-EINVAL));
+
+ if ((p = strchr(val, '\n')) != NULL)
+ *p = '\0';
+
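+	/*
+	 * Apply the new scheduler to every active, writable, non-suspended
+	 * pool before persisting the module parameter value.
+	 */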
+ if (spa_mode_global != 0) {
+ mutex_enter(&spa_namespace_lock);
+ while ((spa = spa_next(spa)) != NULL) {
+ if (spa_state(spa) != POOL_STATE_ACTIVE ||
+ !spa_writeable(spa) || spa_suspended(spa))
+ continue;
+
+ spa_open_ref(spa, FTAG);
+ mutex_exit(&spa_namespace_lock);
+ vdev_elevator_switch(spa->spa_root_vdev, (char *)val);
+ mutex_enter(&spa_namespace_lock);
+ spa_close(spa, FTAG);
+ }
+ mutex_exit(&spa_namespace_lock);
+ }
+
+ return (param_set_charp(val, kp));
+}
+
+vdev_ops_t vdev_disk_ops = {
+ .vdev_op_open = vdev_disk_open,
+ .vdev_op_close = vdev_disk_close,
+ .vdev_op_asize = vdev_default_asize,
+ .vdev_op_io_start = vdev_disk_io_start,
+ .vdev_op_io_done = vdev_disk_io_done,
+ .vdev_op_state_change = NULL,
+ .vdev_op_need_resilver = NULL,
+ .vdev_op_hold = vdev_disk_hold,
+ .vdev_op_rele = vdev_disk_rele,
+ .vdev_op_remap = NULL,
+ .vdev_op_xlate = vdev_default_xlate,
+ .vdev_op_type = VDEV_TYPE_DISK, /* name of this vdev type */
+ .vdev_op_leaf = B_TRUE /* leaf vdev */
+};
+
+module_param_call(zfs_vdev_scheduler, param_set_vdev_scheduler,
+ param_get_charp, &zfs_vdev_scheduler, 0644);
+MODULE_PARM_DESC(zfs_vdev_scheduler, "I/O scheduler");
diff --git a/module/os/linux/zfs/vdev_file.c b/module/os/linux/zfs/vdev_file.c
new file mode 100644
index 000000000..b79017f3a
--- /dev/null
+++ b/module/os/linux/zfs/vdev_file.c
@@ -0,0 +1,331 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/vdev_file.h>
+#include <sys/vdev_impl.h>
+#include <sys/vdev_trim.h>
+#include <sys/zio.h>
+#include <sys/fs/zfs.h>
+#include <sys/fm/fs/zfs.h>
+#include <sys/abd.h>
+#include <sys/fcntl.h>
+#include <sys/vnode.h>
+
+/*
+ * Virtual device vector for files.
+ */
+
+static taskq_t *vdev_file_taskq;
+
+static void
+vdev_file_hold(vdev_t *vd)
+{
+ ASSERT(vd->vdev_path != NULL);
+}
+
+static void
+vdev_file_rele(vdev_t *vd)
+{
+ ASSERT(vd->vdev_path != NULL);
+}
+
+static int
+vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
+ uint64_t *ashift)
+{
+ vdev_file_t *vf;
+ vnode_t *vp;
+ vattr_t vattr;
+ int error;
+
+ /*
+ * Rotational optimizations only make sense on block devices.
+ */
+ vd->vdev_nonrot = B_TRUE;
+
+ /*
+ * Allow TRIM on file based vdevs. This may not always be supported,
+ * since it depends on your kernel version and underlying filesystem
+	 * type, but it is always safe to attempt.
+ */
+ vd->vdev_has_trim = B_TRUE;
+
+ /*
+ * Disable secure TRIM on file based vdevs. There is no way to
+ * request this behavior from the underlying filesystem.
+ */
+ vd->vdev_has_securetrim = B_FALSE;
+
+ /*
+ * We must have a pathname, and it must be absolute.
+ */
+ if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
+ vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
+ return (SET_ERROR(EINVAL));
+ }
+
+ /*
+ * Reopen the device if it's not currently open. Otherwise,
+ * just update the physical size of the device.
+ */
+ if (vd->vdev_tsd != NULL) {
+ ASSERT(vd->vdev_reopening);
+ vf = vd->vdev_tsd;
+ goto skip_open;
+ }
+
+ vf = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_file_t), KM_SLEEP);
+
+ /*
+ * We always open the files from the root of the global zone, even if
+ * we're in a local zone. If the user has gotten to this point, the
+ * administrator has already decided that the pool should be available
+ * to local zone users, so the underlying devices should be as well.
+ */
+ ASSERT(vd->vdev_path != NULL && vd->vdev_path[0] == '/');
+ error = vn_openat(vd->vdev_path + 1, UIO_SYSSPACE,
+ spa_mode(vd->vdev_spa) | FOFFMAX, 0, &vp, 0, 0, rootdir, -1);
+
+ if (error) {
+ vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
+ return (error);
+ }
+
+ vf->vf_vnode = vp;
+
+#ifdef _KERNEL
+ /*
+ * Make sure it's a regular file.
+ */
+ if (vp->v_type != VREG) {
+ vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
+ return (SET_ERROR(ENODEV));
+ }
+#endif
+
+skip_open:
+ /*
+ * Determine the physical size of the file.
+ */
+ vattr.va_mask = AT_SIZE;
+ error = VOP_GETATTR(vf->vf_vnode, &vattr, 0, kcred, NULL);
+ if (error) {
+ vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
+ return (error);
+ }
+
+ *max_psize = *psize = vattr.va_size;
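+	/* File vdevs always report the minimum (512-byte) alignment. */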
+ *ashift = SPA_MINBLOCKSHIFT;
+
+ return (0);
+}
+
+static void
+vdev_file_close(vdev_t *vd)
+{
+ vdev_file_t *vf = vd->vdev_tsd;
+
+ if (vd->vdev_reopening || vf == NULL)
+ return;
+
+ if (vf->vf_vnode != NULL) {
+ (void) VOP_PUTPAGE(vf->vf_vnode, 0, 0, B_INVAL, kcred, NULL);
+ (void) VOP_CLOSE(vf->vf_vnode, spa_mode(vd->vdev_spa), 1, 0,
+ kcred, NULL);
+ }
+
+ vd->vdev_delayed_close = B_FALSE;
+ kmem_free(vf, sizeof (vdev_file_t));
+ vd->vdev_tsd = NULL;
+}
+
+static void
+vdev_file_io_strategy(void *arg)
+{
+ zio_t *zio = (zio_t *)arg;
+ vdev_t *vd = zio->io_vd;
+ vdev_file_t *vf = vd->vdev_tsd;
+ ssize_t resid;
+ void *buf;
+
+ if (zio->io_type == ZIO_TYPE_READ)
+ buf = abd_borrow_buf(zio->io_abd, zio->io_size);
+ else
+ buf = abd_borrow_buf_copy(zio->io_abd, zio->io_size);
+
+ zio->io_error = vn_rdwr(zio->io_type == ZIO_TYPE_READ ?
+ UIO_READ : UIO_WRITE, vf->vf_vnode, buf, zio->io_size,
+ zio->io_offset, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid);
+
+ if (zio->io_type == ZIO_TYPE_READ)
+ abd_return_buf_copy(zio->io_abd, buf, zio->io_size);
+ else
+ abd_return_buf(zio->io_abd, buf, zio->io_size);
+
+ if (resid != 0 && zio->io_error == 0)
+ zio->io_error = SET_ERROR(ENOSPC);
+
+ zio_delay_interrupt(zio);
+}
+
+static void
+vdev_file_io_fsync(void *arg)
+{
+ zio_t *zio = (zio_t *)arg;
+ vdev_file_t *vf = zio->io_vd->vdev_tsd;
+
+ zio->io_error = VOP_FSYNC(vf->vf_vnode, FSYNC | FDSYNC, kcred, NULL);
+
+ zio_interrupt(zio);
+}
+
+static void
+vdev_file_io_start(zio_t *zio)
+{
+ vdev_t *vd = zio->io_vd;
+ vdev_file_t *vf = vd->vdev_tsd;
+
+ if (zio->io_type == ZIO_TYPE_IOCTL) {
+ /* XXPOLICY */
+ if (!vdev_readable(vd)) {
+ zio->io_error = SET_ERROR(ENXIO);
+ zio_interrupt(zio);
+ return;
+ }
+
+ switch (zio->io_cmd) {
+ case DKIOCFLUSHWRITECACHE:
+
+ if (zfs_nocacheflush)
+ break;
+
+ /*
+ * We cannot safely call vfs_fsync() when PF_FSTRANS
+ * is set in the current context. Filesystems like
+ * XFS include sanity checks to verify it is not
+ * already set, see xfs_vm_writepage(). Therefore
+ * the sync must be dispatched to a different context.
+ */
+ if (__spl_pf_fstrans_check()) {
+ VERIFY3U(taskq_dispatch(vdev_file_taskq,
+ vdev_file_io_fsync, zio, TQ_SLEEP), !=,
+ TASKQID_INVALID);
+ return;
+ }
+
+ zio->io_error = VOP_FSYNC(vf->vf_vnode, FSYNC | FDSYNC,
+ kcred, NULL);
+ break;
+ default:
+ zio->io_error = SET_ERROR(ENOTSUP);
+ }
+
+ zio_execute(zio);
+ return;
+ } else if (zio->io_type == ZIO_TYPE_TRIM) {
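+		/*
+		 * Map TRIM onto hole punching: VOP_SPACE(F_FREESP) frees
+		 * the byte range in the backing file.
+		 */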
+ struct flock flck;
+
+ ASSERT3U(zio->io_size, !=, 0);
+ bzero(&flck, sizeof (flck));
+ flck.l_type = F_FREESP;
+ flck.l_start = zio->io_offset;
+ flck.l_len = zio->io_size;
+ flck.l_whence = SEEK_SET;
+
+ zio->io_error = VOP_SPACE(vf->vf_vnode, F_FREESP, &flck,
+ 0, 0, kcred, NULL);
+
+ zio_execute(zio);
+ return;
+ }
+
+ zio->io_target_timestamp = zio_handle_io_delay(zio);
+
+ VERIFY3U(taskq_dispatch(vdev_file_taskq, vdev_file_io_strategy, zio,
+ TQ_SLEEP), !=, TASKQID_INVALID);
+}
+
+/* ARGSUSED */
+static void
+vdev_file_io_done(zio_t *zio)
+{
+}
+
+vdev_ops_t vdev_file_ops = {
+ .vdev_op_open = vdev_file_open,
+ .vdev_op_close = vdev_file_close,
+ .vdev_op_asize = vdev_default_asize,
+ .vdev_op_io_start = vdev_file_io_start,
+ .vdev_op_io_done = vdev_file_io_done,
+ .vdev_op_state_change = NULL,
+ .vdev_op_need_resilver = NULL,
+ .vdev_op_hold = vdev_file_hold,
+ .vdev_op_rele = vdev_file_rele,
+ .vdev_op_remap = NULL,
+ .vdev_op_xlate = vdev_default_xlate,
+ .vdev_op_type = VDEV_TYPE_FILE, /* name of this vdev type */
+ .vdev_op_leaf = B_TRUE /* leaf vdev */
+};
+
+void
+vdev_file_init(void)
+{
+ vdev_file_taskq = taskq_create("z_vdev_file", MAX(boot_ncpus, 16),
+ minclsyspri, boot_ncpus, INT_MAX, TASKQ_DYNAMIC);
+
+ VERIFY(vdev_file_taskq);
+}
+
+void
+vdev_file_fini(void)
+{
+ taskq_destroy(vdev_file_taskq);
+}
+
+/*
+ * From userland we access disks just like files.
+ */
+#ifndef _KERNEL
+
+vdev_ops_t vdev_disk_ops = {
+ .vdev_op_open = vdev_file_open,
+ .vdev_op_close = vdev_file_close,
+ .vdev_op_asize = vdev_default_asize,
+ .vdev_op_io_start = vdev_file_io_start,
+ .vdev_op_io_done = vdev_file_io_done,
+ .vdev_op_state_change = NULL,
+ .vdev_op_need_resilver = NULL,
+ .vdev_op_hold = vdev_file_hold,
+ .vdev_op_rele = vdev_file_rele,
+ .vdev_op_remap = NULL,
+ .vdev_op_xlate = vdev_default_xlate,
+ .vdev_op_type = VDEV_TYPE_DISK, /* name of this vdev type */
+ .vdev_op_leaf = B_TRUE /* leaf vdev */
+};
+
+#endif
diff --git a/module/os/linux/zfs/zfs_acl.c b/module/os/linux/zfs/zfs_acl.c
new file mode 100644
index 000000000..26af91e27
--- /dev/null
+++ b/module/os/linux/zfs/zfs_acl.c
@@ -0,0 +1,2816 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013 by Delphix. All rights reserved.
+ */
+
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/time.h>
+#include <sys/sysmacros.h>
+#include <sys/vfs.h>
+#include <sys/vnode.h>
+#include <sys/sid.h>
+#include <sys/file.h>
+#include <sys/stat.h>
+#include <sys/kmem.h>
+#include <sys/cmn_err.h>
+#include <sys/errno.h>
+#include <sys/sdt.h>
+#include <sys/fs/zfs.h>
+#include <sys/mode.h>
+#include <sys/policy.h>
+#include <sys/zfs_znode.h>
+#include <sys/zfs_fuid.h>
+#include <sys/zfs_acl.h>
+#include <sys/zfs_dir.h>
+#include <sys/zfs_vfsops.h>
+#include <sys/dmu.h>
+#include <sys/dnode.h>
+#include <sys/zap.h>
+#include <sys/sa.h>
+#include <sys/trace_acl.h>
+#include <sys/zpl.h>
+
+#define ALLOW ACE_ACCESS_ALLOWED_ACE_TYPE
+#define DENY ACE_ACCESS_DENIED_ACE_TYPE
+#define MAX_ACE_TYPE ACE_SYSTEM_ALARM_CALLBACK_OBJECT_ACE_TYPE
+#define MIN_ACE_TYPE ALLOW
+
+#define OWNING_GROUP (ACE_GROUP|ACE_IDENTIFIER_GROUP)
+#define EVERYONE_ALLOW_MASK (ACE_READ_ACL|ACE_READ_ATTRIBUTES | \
+ ACE_READ_NAMED_ATTRS|ACE_SYNCHRONIZE)
+#define EVERYONE_DENY_MASK (ACE_WRITE_ACL|ACE_WRITE_OWNER | \
+ ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS)
+#define OWNER_ALLOW_MASK (ACE_WRITE_ACL | ACE_WRITE_OWNER | \
+ ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS)
+
+#define ZFS_CHECKED_MASKS (ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_READ_DATA| \
+ ACE_READ_NAMED_ATTRS|ACE_WRITE_DATA|ACE_WRITE_ATTRIBUTES| \
+ ACE_WRITE_NAMED_ATTRS|ACE_APPEND_DATA|ACE_EXECUTE|ACE_WRITE_OWNER| \
+ ACE_WRITE_ACL|ACE_DELETE|ACE_DELETE_CHILD|ACE_SYNCHRONIZE)
+
+#define WRITE_MASK_DATA (ACE_WRITE_DATA|ACE_APPEND_DATA|ACE_WRITE_NAMED_ATTRS)
+#define WRITE_MASK_ATTRS (ACE_WRITE_ACL|ACE_WRITE_OWNER|ACE_WRITE_ATTRIBUTES| \
+ ACE_DELETE|ACE_DELETE_CHILD)
+#define WRITE_MASK (WRITE_MASK_DATA|WRITE_MASK_ATTRS)
+
+#define OGE_CLEAR (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \
+ ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_EXECUTE)
+
+#define OKAY_MASK_BITS (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \
+ ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_EXECUTE)
+
+#define ALL_INHERIT (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE | \
+ ACE_NO_PROPAGATE_INHERIT_ACE|ACE_INHERIT_ONLY_ACE|ACE_INHERITED_ACE)
+
+#define RESTRICTED_CLEAR (ACE_WRITE_ACL|ACE_WRITE_OWNER)
+
+#define V4_ACL_WIDE_FLAGS (ZFS_ACL_AUTO_INHERIT|ZFS_ACL_DEFAULTED|\
+ ZFS_ACL_PROTECTED)
+
+#define ZFS_ACL_WIDE_FLAGS (V4_ACL_WIDE_FLAGS|ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE|\
+ ZFS_ACL_OBJ_ACE)
+
+#define ALL_MODE_EXECS (S_IXUSR | S_IXGRP | S_IXOTH)
+
+#define IDMAP_WK_CREATOR_OWNER_UID 2147483648U
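+/* 0x80000000: UID reserved by idmap for the NT well-known CREATOR OWNER SID */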
+
+static uint16_t
+zfs_ace_v0_get_type(void *acep)
+{
+ return (((zfs_oldace_t *)acep)->z_type);
+}
+
+static uint16_t
+zfs_ace_v0_get_flags(void *acep)
+{
+ return (((zfs_oldace_t *)acep)->z_flags);
+}
+
+static uint32_t
+zfs_ace_v0_get_mask(void *acep)
+{
+ return (((zfs_oldace_t *)acep)->z_access_mask);
+}
+
+static uint64_t
+zfs_ace_v0_get_who(void *acep)
+{
+ return (((zfs_oldace_t *)acep)->z_fuid);
+}
+
+static void
+zfs_ace_v0_set_type(void *acep, uint16_t type)
+{
+ ((zfs_oldace_t *)acep)->z_type = type;
+}
+
+static void
+zfs_ace_v0_set_flags(void *acep, uint16_t flags)
+{
+ ((zfs_oldace_t *)acep)->z_flags = flags;
+}
+
+static void
+zfs_ace_v0_set_mask(void *acep, uint32_t mask)
+{
+ ((zfs_oldace_t *)acep)->z_access_mask = mask;
+}
+
+static void
+zfs_ace_v0_set_who(void *acep, uint64_t who)
+{
+ ((zfs_oldace_t *)acep)->z_fuid = who;
+}
+
+/*ARGSUSED*/
+static size_t
+zfs_ace_v0_size(void *acep)
+{
+ return (sizeof (zfs_oldace_t));
+}
+
+static size_t
+zfs_ace_v0_abstract_size(void)
+{
+ return (sizeof (zfs_oldace_t));
+}
+
+static int
+zfs_ace_v0_mask_off(void)
+{
+ return (offsetof(zfs_oldace_t, z_access_mask));
+}
+
+/*ARGSUSED*/
+static int
+zfs_ace_v0_data(void *acep, void **datap)
+{
+ *datap = NULL;
+ return (0);
+}
+
+static acl_ops_t zfs_acl_v0_ops = {
+ .ace_mask_get = zfs_ace_v0_get_mask,
+ .ace_mask_set = zfs_ace_v0_set_mask,
+ .ace_flags_get = zfs_ace_v0_get_flags,
+ .ace_flags_set = zfs_ace_v0_set_flags,
+ .ace_type_get = zfs_ace_v0_get_type,
+ .ace_type_set = zfs_ace_v0_set_type,
+ .ace_who_get = zfs_ace_v0_get_who,
+ .ace_who_set = zfs_ace_v0_set_who,
+ .ace_size = zfs_ace_v0_size,
+ .ace_abstract_size = zfs_ace_v0_abstract_size,
+ .ace_mask_off = zfs_ace_v0_mask_off,
+ .ace_data = zfs_ace_v0_data
+};
+
+static uint16_t
+zfs_ace_fuid_get_type(void *acep)
+{
+ return (((zfs_ace_hdr_t *)acep)->z_type);
+}
+
+static uint16_t
+zfs_ace_fuid_get_flags(void *acep)
+{
+ return (((zfs_ace_hdr_t *)acep)->z_flags);
+}
+
+static uint32_t
+zfs_ace_fuid_get_mask(void *acep)
+{
+ return (((zfs_ace_hdr_t *)acep)->z_access_mask);
+}
+
+static uint64_t
+zfs_ace_fuid_get_who(void *args)
+{
+ uint16_t entry_type;
+ zfs_ace_t *acep = args;
+
+ entry_type = acep->z_hdr.z_flags & ACE_TYPE_FLAGS;
+
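+	/* owner@, group@ and everyone@ ACEs carry no explicit who; return -1 */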
+ if (entry_type == ACE_OWNER || entry_type == OWNING_GROUP ||
+ entry_type == ACE_EVERYONE)
+ return (-1);
+ return (((zfs_ace_t *)acep)->z_fuid);
+}
+
+static void
+zfs_ace_fuid_set_type(void *acep, uint16_t type)
+{
+ ((zfs_ace_hdr_t *)acep)->z_type = type;
+}
+
+static void
+zfs_ace_fuid_set_flags(void *acep, uint16_t flags)
+{
+ ((zfs_ace_hdr_t *)acep)->z_flags = flags;
+}
+
+static void
+zfs_ace_fuid_set_mask(void *acep, uint32_t mask)
+{
+ ((zfs_ace_hdr_t *)acep)->z_access_mask = mask;
+}
+
+static void
+zfs_ace_fuid_set_who(void *arg, uint64_t who)
+{
+ zfs_ace_t *acep = arg;
+
+ uint16_t entry_type = acep->z_hdr.z_flags & ACE_TYPE_FLAGS;
+
+ if (entry_type == ACE_OWNER || entry_type == OWNING_GROUP ||
+ entry_type == ACE_EVERYONE)
+ return;
+ acep->z_fuid = who;
+}
+
+static size_t
+zfs_ace_fuid_size(void *acep)
+{
+ zfs_ace_hdr_t *zacep = acep;
+ uint16_t entry_type;
+
+ switch (zacep->z_type) {
+ case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE:
+ case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE:
+ case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE:
+ case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE:
+ return (sizeof (zfs_object_ace_t));
+ case ALLOW:
+ case DENY:
+ entry_type =
+ (((zfs_ace_hdr_t *)acep)->z_flags & ACE_TYPE_FLAGS);
+ if (entry_type == ACE_OWNER ||
+ entry_type == OWNING_GROUP ||
+ entry_type == ACE_EVERYONE)
+ return (sizeof (zfs_ace_hdr_t));
+ /*FALLTHROUGH*/
+ default:
+ return (sizeof (zfs_ace_t));
+ }
+}
+
+static size_t
+zfs_ace_fuid_abstract_size(void)
+{
+ return (sizeof (zfs_ace_hdr_t));
+}
+
+static int
+zfs_ace_fuid_mask_off(void)
+{
+ return (offsetof(zfs_ace_hdr_t, z_access_mask));
+}
+
+static int
+zfs_ace_fuid_data(void *acep, void **datap)
+{
+ zfs_ace_t *zacep = acep;
+ zfs_object_ace_t *zobjp;
+
+ switch (zacep->z_hdr.z_type) {
+ case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE:
+ case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE:
+ case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE:
+ case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE:
+ zobjp = acep;
+ *datap = (caddr_t)zobjp + sizeof (zfs_ace_t);
+ return (sizeof (zfs_object_ace_t) - sizeof (zfs_ace_t));
+ default:
+ *datap = NULL;
+ return (0);
+ }
+}
+
+static acl_ops_t zfs_acl_fuid_ops = {
+ .ace_mask_get = zfs_ace_fuid_get_mask,
+ .ace_mask_set = zfs_ace_fuid_set_mask,
+ .ace_flags_get = zfs_ace_fuid_get_flags,
+ .ace_flags_set = zfs_ace_fuid_set_flags,
+ .ace_type_get = zfs_ace_fuid_get_type,
+ .ace_type_set = zfs_ace_fuid_set_type,
+ .ace_who_get = zfs_ace_fuid_get_who,
+ .ace_who_set = zfs_ace_fuid_set_who,
+ .ace_size = zfs_ace_fuid_size,
+ .ace_abstract_size = zfs_ace_fuid_abstract_size,
+ .ace_mask_off = zfs_ace_fuid_mask_off,
+ .ace_data = zfs_ace_fuid_data
+};
+
+/*
+ * The following three functions are provided for compatibility with
+ * older ZPL version in order to determine if the file use to have
+ * an external ACL and what version of ACL previously existed on the
+ * file. Would really be nice to not need this, sigh.
+ */
+uint64_t
+zfs_external_acl(znode_t *zp)
+{
+ zfs_acl_phys_t acl_phys;
+ int error;
+
+ if (zp->z_is_sa)
+ return (0);
+
+ /*
+	 * Need to deal with a potential race where zfs_sa_upgrade
+	 * could cause z_is_sa to change.
+ *
+ * If the lookup fails then the state of z_is_sa should have
+ * changed.
+ */
+
+ if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(ZTOZSB(zp)),
+ &acl_phys, sizeof (acl_phys))) == 0)
+ return (acl_phys.z_acl_extern_obj);
+ else {
+ /*
+		 * After upgrade the SA_ZPL_ZNODE_ACL should have been
+		 * removed.
+ */
+ VERIFY(zp->z_is_sa && error == ENOENT);
+ return (0);
+ }
+}
+
+/*
+ * Determine size of ACL in bytes
+ *
+ * This is more complicated than it should be since we have to deal
+ * with old external ACLs.
+ */
+static int
+zfs_acl_znode_info(znode_t *zp, int *aclsize, int *aclcount,
+ zfs_acl_phys_t *aclphys)
+{
+ zfsvfs_t *zfsvfs = ZTOZSB(zp);
+ uint64_t acl_count;
+ int size;
+ int error;
+
+ ASSERT(MUTEX_HELD(&zp->z_acl_lock));
+ if (zp->z_is_sa) {
+ if ((error = sa_size(zp->z_sa_hdl, SA_ZPL_DACL_ACES(zfsvfs),
+ &size)) != 0)
+ return (error);
+ *aclsize = size;
+ if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_DACL_COUNT(zfsvfs),
+ &acl_count, sizeof (acl_count))) != 0)
+ return (error);
+ *aclcount = acl_count;
+ } else {
+ if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(zfsvfs),
+ aclphys, sizeof (*aclphys))) != 0)
+ return (error);
+
+ if (aclphys->z_acl_version == ZFS_ACL_VERSION_INITIAL) {
+ *aclsize = ZFS_ACL_SIZE(aclphys->z_acl_size);
+ *aclcount = aclphys->z_acl_size;
+ } else {
+ *aclsize = aclphys->z_acl_size;
+ *aclcount = aclphys->z_acl_count;
+ }
+ }
+ return (0);
+}
+
+int
+zfs_znode_acl_version(znode_t *zp)
+{
+ zfs_acl_phys_t acl_phys;
+
+ if (zp->z_is_sa)
+ return (ZFS_ACL_VERSION_FUID);
+ else {
+ int error;
+
+ /*
+		 * Need to deal with a potential race where zfs_sa_upgrade
+		 * could cause z_is_sa to change.
+ *
+ * If the lookup fails then the state of z_is_sa should have
+ * changed.
+ */
+ if ((error = sa_lookup(zp->z_sa_hdl,
+ SA_ZPL_ZNODE_ACL(ZTOZSB(zp)),
+ &acl_phys, sizeof (acl_phys))) == 0)
+ return (acl_phys.z_acl_version);
+ else {
+ /*
+ * After upgrade SA_ZPL_ZNODE_ACL should have
+ * been removed.
+ */
+ VERIFY(zp->z_is_sa && error == ENOENT);
+ return (ZFS_ACL_VERSION_FUID);
+ }
+ }
+}
+
+static int
+zfs_acl_version(int version)
+{
+ if (version < ZPL_VERSION_FUID)
+ return (ZFS_ACL_VERSION_INITIAL);
+ else
+ return (ZFS_ACL_VERSION_FUID);
+}
+
+static int
+zfs_acl_version_zp(znode_t *zp)
+{
+ return (zfs_acl_version(ZTOZSB(zp)->z_version));
+}
+
+zfs_acl_t *
+zfs_acl_alloc(int vers)
+{
+ zfs_acl_t *aclp;
+
+ aclp = kmem_zalloc(sizeof (zfs_acl_t), KM_SLEEP);
+ list_create(&aclp->z_acl, sizeof (zfs_acl_node_t),
+ offsetof(zfs_acl_node_t, z_next));
+ aclp->z_version = vers;
+ if (vers == ZFS_ACL_VERSION_FUID)
+ aclp->z_ops = &zfs_acl_fuid_ops;
+ else
+ aclp->z_ops = &zfs_acl_v0_ops;
+ return (aclp);
+}
+
+zfs_acl_node_t *
+zfs_acl_node_alloc(size_t bytes)
+{
+ zfs_acl_node_t *aclnode;
+
+ aclnode = kmem_zalloc(sizeof (zfs_acl_node_t), KM_SLEEP);
+ if (bytes) {
+ aclnode->z_acldata = kmem_alloc(bytes, KM_SLEEP);
+ aclnode->z_allocdata = aclnode->z_acldata;
+ aclnode->z_allocsize = bytes;
+ aclnode->z_size = bytes;
+ }
+
+ return (aclnode);
+}
+
+static void
+zfs_acl_node_free(zfs_acl_node_t *aclnode)
+{
+ if (aclnode->z_allocsize)
+ kmem_free(aclnode->z_allocdata, aclnode->z_allocsize);
+ kmem_free(aclnode, sizeof (zfs_acl_node_t));
+}
+
+static void
+zfs_acl_release_nodes(zfs_acl_t *aclp)
+{
+ zfs_acl_node_t *aclnode;
+
+ while ((aclnode = list_head(&aclp->z_acl))) {
+ list_remove(&aclp->z_acl, aclnode);
+ zfs_acl_node_free(aclnode);
+ }
+ aclp->z_acl_count = 0;
+ aclp->z_acl_bytes = 0;
+}
+
+void
+zfs_acl_free(zfs_acl_t *aclp)
+{
+ zfs_acl_release_nodes(aclp);
+ list_destroy(&aclp->z_acl);
+ kmem_free(aclp, sizeof (zfs_acl_t));
+}
+
+static boolean_t
+zfs_acl_valid_ace_type(uint_t type, uint_t flags)
+{
+ uint16_t entry_type;
+
+ switch (type) {
+ case ALLOW:
+ case DENY:
+ case ACE_SYSTEM_AUDIT_ACE_TYPE:
+ case ACE_SYSTEM_ALARM_ACE_TYPE:
+ entry_type = flags & ACE_TYPE_FLAGS;
+ return (entry_type == ACE_OWNER ||
+ entry_type == OWNING_GROUP ||
+ entry_type == ACE_EVERYONE || entry_type == 0 ||
+ entry_type == ACE_IDENTIFIER_GROUP);
+ default:
+ if (type >= MIN_ACE_TYPE && type <= MAX_ACE_TYPE)
+ return (B_TRUE);
+ }
+ return (B_FALSE);
+}
+
+static boolean_t
+zfs_ace_valid(umode_t obj_mode, zfs_acl_t *aclp, uint16_t type, uint16_t iflags)
+{
+ /*
+ * first check type of entry
+ */
+
+ if (!zfs_acl_valid_ace_type(type, iflags))
+ return (B_FALSE);
+
+ switch (type) {
+ case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE:
+ case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE:
+ case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE:
+ case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE:
+ if (aclp->z_version < ZFS_ACL_VERSION_FUID)
+ return (B_FALSE);
+ aclp->z_hints |= ZFS_ACL_OBJ_ACE;
+ }
+
+ /*
+ * next check inheritance level flags
+ */
+
+ if (S_ISDIR(obj_mode) &&
+ (iflags & (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE)))
+ aclp->z_hints |= ZFS_INHERIT_ACE;
+
+ if (iflags & (ACE_INHERIT_ONLY_ACE|ACE_NO_PROPAGATE_INHERIT_ACE)) {
+ if ((iflags & (ACE_FILE_INHERIT_ACE|
+ ACE_DIRECTORY_INHERIT_ACE)) == 0) {
+ return (B_FALSE);
+ }
+ }
+
+ return (B_TRUE);
+}
+
+static void *
+zfs_acl_next_ace(zfs_acl_t *aclp, void *start, uint64_t *who,
+ uint32_t *access_mask, uint16_t *iflags, uint16_t *type)
+{
+ zfs_acl_node_t *aclnode;
+
+ ASSERT(aclp);
+
+ if (start == NULL) {
+ aclnode = list_head(&aclp->z_acl);
+ if (aclnode == NULL)
+ return (NULL);
+
+ aclp->z_next_ace = aclnode->z_acldata;
+ aclp->z_curr_node = aclnode;
+ aclnode->z_ace_idx = 0;
+ }
+
+ aclnode = aclp->z_curr_node;
+
+ if (aclnode == NULL)
+ return (NULL);
+
+ if (aclnode->z_ace_idx >= aclnode->z_ace_count) {
+ aclnode = list_next(&aclp->z_acl, aclnode);
+ if (aclnode == NULL)
+ return (NULL);
+ else {
+ aclp->z_curr_node = aclnode;
+ aclnode->z_ace_idx = 0;
+ aclp->z_next_ace = aclnode->z_acldata;
+ }
+ }
+
+ if (aclnode->z_ace_idx < aclnode->z_ace_count) {
+ void *acep = aclp->z_next_ace;
+ size_t ace_size;
+
+ /*
+ * Make sure we don't overstep our bounds
+ */
+ ace_size = aclp->z_ops->ace_size(acep);
+
+ if (((caddr_t)acep + ace_size) >
+ ((caddr_t)aclnode->z_acldata + aclnode->z_size)) {
+ return (NULL);
+ }
+
+ *iflags = aclp->z_ops->ace_flags_get(acep);
+ *type = aclp->z_ops->ace_type_get(acep);
+ *access_mask = aclp->z_ops->ace_mask_get(acep);
+ *who = aclp->z_ops->ace_who_get(acep);
+ aclp->z_next_ace = (caddr_t)aclp->z_next_ace + ace_size;
+ aclnode->z_ace_idx++;
+
+ return ((void *)acep);
+ }
+ return (NULL);
+}
+
+/*ARGSUSED*/
+static uint64_t
+zfs_ace_walk(void *datap, uint64_t cookie, int aclcnt,
+ uint16_t *flags, uint16_t *type, uint32_t *mask)
+{
+ zfs_acl_t *aclp = datap;
+ zfs_ace_hdr_t *acep = (zfs_ace_hdr_t *)(uintptr_t)cookie;
+ uint64_t who;
+
+ acep = zfs_acl_next_ace(aclp, acep, &who, mask,
+ flags, type);
+ return ((uint64_t)(uintptr_t)acep);
+}
+
+/*
+ * Copy ACE to internal ZFS format.
+ * While processing the ACL each ACE will be validated for correctness.
+ * ACE FUIDs will be created later.
+ */
+int
+zfs_copy_ace_2_fuid(zfsvfs_t *zfsvfs, umode_t obj_mode, zfs_acl_t *aclp,
+ void *datap, zfs_ace_t *z_acl, uint64_t aclcnt, size_t *size,
+ zfs_fuid_info_t **fuidp, cred_t *cr)
+{
+ int i;
+ uint16_t entry_type;
+ zfs_ace_t *aceptr = z_acl;
+ ace_t *acep = datap;
+ zfs_object_ace_t *zobjacep;
+ ace_object_t *aceobjp;
+
+ for (i = 0; i != aclcnt; i++) {
+ aceptr->z_hdr.z_access_mask = acep->a_access_mask;
+ aceptr->z_hdr.z_flags = acep->a_flags;
+ aceptr->z_hdr.z_type = acep->a_type;
+ entry_type = aceptr->z_hdr.z_flags & ACE_TYPE_FLAGS;
+ if (entry_type != ACE_OWNER && entry_type != OWNING_GROUP &&
+ entry_type != ACE_EVERYONE) {
+ aceptr->z_fuid = zfs_fuid_create(zfsvfs, acep->a_who,
+ cr, (entry_type == 0) ?
+ ZFS_ACE_USER : ZFS_ACE_GROUP, fuidp);
+ }
+
+ /*
+ * Make sure ACE is valid
+ */
+ if (zfs_ace_valid(obj_mode, aclp, aceptr->z_hdr.z_type,
+ aceptr->z_hdr.z_flags) != B_TRUE)
+ return (SET_ERROR(EINVAL));
+
+ switch (acep->a_type) {
+ case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE:
+ case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE:
+ case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE:
+ case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE:
+ zobjacep = (zfs_object_ace_t *)aceptr;
+ aceobjp = (ace_object_t *)acep;
+
+ bcopy(aceobjp->a_obj_type, zobjacep->z_object_type,
+ sizeof (aceobjp->a_obj_type));
+ bcopy(aceobjp->a_inherit_obj_type,
+ zobjacep->z_inherit_type,
+ sizeof (aceobjp->a_inherit_obj_type));
+ acep = (ace_t *)((caddr_t)acep + sizeof (ace_object_t));
+ break;
+ default:
+ acep = (ace_t *)((caddr_t)acep + sizeof (ace_t));
+ }
+
+ aceptr = (zfs_ace_t *)((caddr_t)aceptr +
+ aclp->z_ops->ace_size(aceptr));
+ }
+
+ *size = (caddr_t)aceptr - (caddr_t)z_acl;
+
+ return (0);
+}
+
+/*
+ * Copy ZFS ACEs to fixed size ace_t layout
+ */
+static void
+zfs_copy_fuid_2_ace(zfsvfs_t *zfsvfs, zfs_acl_t *aclp, cred_t *cr,
+ void *datap, int filter)
+{
+ uint64_t who;
+ uint32_t access_mask;
+ uint16_t iflags, type;
+ zfs_ace_hdr_t *zacep = NULL;
+ ace_t *acep = datap;
+ ace_object_t *objacep;
+ zfs_object_ace_t *zobjacep;
+ size_t ace_size;
+ uint16_t entry_type;
+
+ while ((zacep = zfs_acl_next_ace(aclp, zacep,
+ &who, &access_mask, &iflags, &type))) {
+
+ switch (type) {
+ case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE:
+ case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE:
+ case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE:
+ case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE:
+ if (filter) {
+ continue;
+ }
+ zobjacep = (zfs_object_ace_t *)zacep;
+ objacep = (ace_object_t *)acep;
+ bcopy(zobjacep->z_object_type,
+ objacep->a_obj_type,
+ sizeof (zobjacep->z_object_type));
+ bcopy(zobjacep->z_inherit_type,
+ objacep->a_inherit_obj_type,
+ sizeof (zobjacep->z_inherit_type));
+ ace_size = sizeof (ace_object_t);
+ break;
+ default:
+ ace_size = sizeof (ace_t);
+ break;
+ }
+
+ entry_type = (iflags & ACE_TYPE_FLAGS);
+ if ((entry_type != ACE_OWNER &&
+ entry_type != OWNING_GROUP &&
+ entry_type != ACE_EVERYONE)) {
+ acep->a_who = zfs_fuid_map_id(zfsvfs, who,
+ cr, (entry_type & ACE_IDENTIFIER_GROUP) ?
+ ZFS_ACE_GROUP : ZFS_ACE_USER);
+ } else {
+ acep->a_who = (uid_t)(int64_t)who;
+ }
+ acep->a_access_mask = access_mask;
+ acep->a_flags = iflags;
+ acep->a_type = type;
+ acep = (ace_t *)((caddr_t)acep + ace_size);
+ }
+}
+
+static int
+zfs_copy_ace_2_oldace(umode_t obj_mode, zfs_acl_t *aclp, ace_t *acep,
+ zfs_oldace_t *z_acl, int aclcnt, size_t *size)
+{
+ int i;
+ zfs_oldace_t *aceptr = z_acl;
+
+ for (i = 0; i != aclcnt; i++, aceptr++) {
+ aceptr->z_access_mask = acep[i].a_access_mask;
+ aceptr->z_type = acep[i].a_type;
+ aceptr->z_flags = acep[i].a_flags;
+ aceptr->z_fuid = acep[i].a_who;
+ /*
+ * Make sure ACE is valid
+ */
+ if (zfs_ace_valid(obj_mode, aclp, aceptr->z_type,
+ aceptr->z_flags) != B_TRUE)
+ return (SET_ERROR(EINVAL));
+ }
+ *size = (caddr_t)aceptr - (caddr_t)z_acl;
+ return (0);
+}
+
+/*
+ * convert old ACL format to new
+ */
+void
+zfs_acl_xform(znode_t *zp, zfs_acl_t *aclp, cred_t *cr)
+{
+ zfs_oldace_t *oldaclp;
+ int i;
+ uint16_t type, iflags;
+ uint32_t access_mask;
+ uint64_t who;
+ void *cookie = NULL;
+ zfs_acl_node_t *newaclnode;
+
+ ASSERT(aclp->z_version == ZFS_ACL_VERSION_INITIAL);
+ /*
+ * First create the ACE in a contiguous piece of memory
+ * for zfs_copy_ace_2_fuid().
+ *
+ * We only convert an ACL once, so this won't happen
+ * every time.
+ */
+ oldaclp = kmem_alloc(sizeof (zfs_oldace_t) * aclp->z_acl_count,
+ KM_SLEEP);
+ i = 0;
+ while ((cookie = zfs_acl_next_ace(aclp, cookie, &who,
+ &access_mask, &iflags, &type))) {
+ oldaclp[i].z_flags = iflags;
+ oldaclp[i].z_type = type;
+ oldaclp[i].z_fuid = who;
+ oldaclp[i++].z_access_mask = access_mask;
+ }
+
+ newaclnode = zfs_acl_node_alloc(aclp->z_acl_count *
+ sizeof (zfs_object_ace_t));
+ aclp->z_ops = &zfs_acl_fuid_ops;
+ VERIFY(zfs_copy_ace_2_fuid(ZTOZSB(zp), ZTOI(zp)->i_mode,
+ aclp, oldaclp, newaclnode->z_acldata, aclp->z_acl_count,
+ &newaclnode->z_size, NULL, cr) == 0);
+ newaclnode->z_ace_count = aclp->z_acl_count;
+ aclp->z_version = ZFS_ACL_VERSION;
+ kmem_free(oldaclp, aclp->z_acl_count * sizeof (zfs_oldace_t));
+
+ /*
+ * Release all previous ACL nodes
+ */
+
+ zfs_acl_release_nodes(aclp);
+
+ list_insert_head(&aclp->z_acl, newaclnode);
+
+ aclp->z_acl_bytes = newaclnode->z_size;
+ aclp->z_acl_count = newaclnode->z_ace_count;
+}
+
+/*
+ * Convert unix access mask to v4 access mask
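+ *
+ * Only the low (other) rwx triplet is inspected; callers shift the
+ * owner or group bits down first (e.g. zfs_zaccess_rwx() passes
+ * mode >> 6).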
+ */
+static uint32_t
+zfs_unix_to_v4(uint32_t access_mask)
+{
+ uint32_t new_mask = 0;
+
+ if (access_mask & S_IXOTH)
+ new_mask |= ACE_EXECUTE;
+ if (access_mask & S_IWOTH)
+ new_mask |= ACE_WRITE_DATA;
+ if (access_mask & S_IROTH)
+ new_mask |= ACE_READ_DATA;
+ return (new_mask);
+}
+
+static void
+zfs_set_ace(zfs_acl_t *aclp, void *acep, uint32_t access_mask,
+ uint16_t access_type, uint64_t fuid, uint16_t entry_type)
+{
+ uint16_t type = entry_type & ACE_TYPE_FLAGS;
+
+ aclp->z_ops->ace_mask_set(acep, access_mask);
+ aclp->z_ops->ace_type_set(acep, access_type);
+ aclp->z_ops->ace_flags_set(acep, entry_type);
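+	/* special entries (owner@, group@, everyone@) carry no explicit who */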
+ if ((type != ACE_OWNER && type != OWNING_GROUP &&
+ type != ACE_EVERYONE))
+ aclp->z_ops->ace_who_set(acep, fuid);
+}
+
+/*
+ * Determine the mode of a file based on its ACL.
+ */
+uint64_t
+zfs_mode_compute(uint64_t fmode, zfs_acl_t *aclp,
+ uint64_t *pflags, uint64_t fuid, uint64_t fgid)
+{
+ int entry_type;
+ mode_t mode;
+ mode_t seen = 0;
+ zfs_ace_hdr_t *acep = NULL;
+ uint64_t who;
+ uint16_t iflags, type;
+ uint32_t access_mask;
+ boolean_t an_exec_denied = B_FALSE;
+
+ mode = (fmode & (S_IFMT | S_ISUID | S_ISGID | S_ISVTX));
+
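+	/*
+	 * Walk the ACL in order.  For each of the nine rwx bits the first
+	 * matching ACE wins: "seen" records bits that have already been
+	 * decided, so a later ALLOW cannot override an earlier DENY (and
+	 * vice versa).
+	 */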
+ while ((acep = zfs_acl_next_ace(aclp, acep, &who,
+ &access_mask, &iflags, &type))) {
+
+ if (!zfs_acl_valid_ace_type(type, iflags))
+ continue;
+
+ entry_type = (iflags & ACE_TYPE_FLAGS);
+
+ /*
+ * Skip over owner@, group@ or everyone@ inherit only ACEs
+ */
+ if ((iflags & ACE_INHERIT_ONLY_ACE) &&
+ (entry_type == ACE_OWNER || entry_type == ACE_EVERYONE ||
+ entry_type == OWNING_GROUP))
+ continue;
+
+ if (entry_type == ACE_OWNER || (entry_type == 0 &&
+ who == fuid)) {
+ if ((access_mask & ACE_READ_DATA) &&
+ (!(seen & S_IRUSR))) {
+ seen |= S_IRUSR;
+ if (type == ALLOW) {
+ mode |= S_IRUSR;
+ }
+ }
+ if ((access_mask & ACE_WRITE_DATA) &&
+ (!(seen & S_IWUSR))) {
+ seen |= S_IWUSR;
+ if (type == ALLOW) {
+ mode |= S_IWUSR;
+ }
+ }
+ if ((access_mask & ACE_EXECUTE) &&
+ (!(seen & S_IXUSR))) {
+ seen |= S_IXUSR;
+ if (type == ALLOW) {
+ mode |= S_IXUSR;
+ }
+ }
+ } else if (entry_type == OWNING_GROUP ||
+ (entry_type == ACE_IDENTIFIER_GROUP && who == fgid)) {
+ if ((access_mask & ACE_READ_DATA) &&
+ (!(seen & S_IRGRP))) {
+ seen |= S_IRGRP;
+ if (type == ALLOW) {
+ mode |= S_IRGRP;
+ }
+ }
+ if ((access_mask & ACE_WRITE_DATA) &&
+ (!(seen & S_IWGRP))) {
+ seen |= S_IWGRP;
+ if (type == ALLOW) {
+ mode |= S_IWGRP;
+ }
+ }
+ if ((access_mask & ACE_EXECUTE) &&
+ (!(seen & S_IXGRP))) {
+ seen |= S_IXGRP;
+ if (type == ALLOW) {
+ mode |= S_IXGRP;
+ }
+ }
+ } else if (entry_type == ACE_EVERYONE) {
+ if ((access_mask & ACE_READ_DATA)) {
+ if (!(seen & S_IRUSR)) {
+ seen |= S_IRUSR;
+ if (type == ALLOW) {
+ mode |= S_IRUSR;
+ }
+ }
+ if (!(seen & S_IRGRP)) {
+ seen |= S_IRGRP;
+ if (type == ALLOW) {
+ mode |= S_IRGRP;
+ }
+ }
+ if (!(seen & S_IROTH)) {
+ seen |= S_IROTH;
+ if (type == ALLOW) {
+ mode |= S_IROTH;
+ }
+ }
+ }
+ if ((access_mask & ACE_WRITE_DATA)) {
+ if (!(seen & S_IWUSR)) {
+ seen |= S_IWUSR;
+ if (type == ALLOW) {
+ mode |= S_IWUSR;
+ }
+ }
+ if (!(seen & S_IWGRP)) {
+ seen |= S_IWGRP;
+ if (type == ALLOW) {
+ mode |= S_IWGRP;
+ }
+ }
+ if (!(seen & S_IWOTH)) {
+ seen |= S_IWOTH;
+ if (type == ALLOW) {
+ mode |= S_IWOTH;
+ }
+ }
+ }
+ if ((access_mask & ACE_EXECUTE)) {
+ if (!(seen & S_IXUSR)) {
+ seen |= S_IXUSR;
+ if (type == ALLOW) {
+ mode |= S_IXUSR;
+ }
+ }
+ if (!(seen & S_IXGRP)) {
+ seen |= S_IXGRP;
+ if (type == ALLOW) {
+ mode |= S_IXGRP;
+ }
+ }
+ if (!(seen & S_IXOTH)) {
+ seen |= S_IXOTH;
+ if (type == ALLOW) {
+ mode |= S_IXOTH;
+ }
+ }
+ }
+ } else {
+ /*
+			 * We only care if this IDENTIFIER_GROUP or
+			 * USER ACE denies execute access to someone;
+			 * the mode is not affected.
+ */
+ if ((access_mask & ACE_EXECUTE) && type == DENY)
+ an_exec_denied = B_TRUE;
+ }
+ }
+
+ /*
+ * Failure to allow is effectively a deny, so execute permission
+ * is denied if it was never mentioned or if we explicitly
+ * weren't allowed it.
+ */
+ if (!an_exec_denied &&
+ ((seen & ALL_MODE_EXECS) != ALL_MODE_EXECS ||
+ (mode & ALL_MODE_EXECS) != ALL_MODE_EXECS))
+ an_exec_denied = B_TRUE;
+
+ if (an_exec_denied)
+ *pflags &= ~ZFS_NO_EXECS_DENIED;
+ else
+ *pflags |= ZFS_NO_EXECS_DENIED;
+
+ return (mode);
+}
+
+/*
+ * Read an external ACL object.  If the intent is to modify, always
+ * create a new ACL and leave any cached ACL in place.
+ */
+int
+zfs_acl_node_read(struct znode *zp, boolean_t have_lock, zfs_acl_t **aclpp,
+ boolean_t will_modify)
+{
+ zfs_acl_t *aclp;
+ int aclsize = 0;
+ int acl_count = 0;
+ zfs_acl_node_t *aclnode;
+ zfs_acl_phys_t znode_acl;
+ int version;
+ int error;
+ boolean_t drop_lock = B_FALSE;
+
+ ASSERT(MUTEX_HELD(&zp->z_acl_lock));
+
+ if (zp->z_acl_cached && !will_modify) {
+ *aclpp = zp->z_acl_cached;
+ return (0);
+ }
+
+ /*
+	 * Close the race where the znode could be upgraded while
+	 * we are trying to read its attributes.
+	 *
+	 * This can only happen if the file isn't already an SA
+	 * znode.
+ */
+ if (!zp->z_is_sa && !have_lock) {
+ mutex_enter(&zp->z_lock);
+ drop_lock = B_TRUE;
+ }
+ version = zfs_znode_acl_version(zp);
+
+ if ((error = zfs_acl_znode_info(zp, &aclsize,
+ &acl_count, &znode_acl)) != 0) {
+ goto done;
+ }
+
+ aclp = zfs_acl_alloc(version);
+
+ aclp->z_acl_count = acl_count;
+ aclp->z_acl_bytes = aclsize;
+
+ aclnode = zfs_acl_node_alloc(aclsize);
+ aclnode->z_ace_count = aclp->z_acl_count;
+ aclnode->z_size = aclsize;
+
+ if (!zp->z_is_sa) {
+ if (znode_acl.z_acl_extern_obj) {
+ error = dmu_read(ZTOZSB(zp)->z_os,
+ znode_acl.z_acl_extern_obj, 0, aclnode->z_size,
+ aclnode->z_acldata, DMU_READ_PREFETCH);
+ } else {
+ bcopy(znode_acl.z_ace_data, aclnode->z_acldata,
+ aclnode->z_size);
+ }
+ } else {
+ error = sa_lookup(zp->z_sa_hdl, SA_ZPL_DACL_ACES(ZTOZSB(zp)),
+ aclnode->z_acldata, aclnode->z_size);
+ }
+
+ if (error != 0) {
+ zfs_acl_free(aclp);
+ zfs_acl_node_free(aclnode);
+ /* convert checksum errors into IO errors */
+ if (error == ECKSUM)
+ error = SET_ERROR(EIO);
+ goto done;
+ }
+
+ list_insert_head(&aclp->z_acl, aclnode);
+
+ *aclpp = aclp;
+ if (!will_modify)
+ zp->z_acl_cached = aclp;
+done:
+ if (drop_lock)
+ mutex_exit(&zp->z_lock);
+ return (error);
+}
+
+/*ARGSUSED*/
+void
+zfs_acl_data_locator(void **dataptr, uint32_t *length, uint32_t buflen,
+ boolean_t start, void *userdata)
+{
+ zfs_acl_locator_cb_t *cb = (zfs_acl_locator_cb_t *)userdata;
+
+ if (start) {
+ cb->cb_acl_node = list_head(&cb->cb_aclp->z_acl);
+ } else {
+ cb->cb_acl_node = list_next(&cb->cb_aclp->z_acl,
+ cb->cb_acl_node);
+ }
+ *dataptr = cb->cb_acl_node->z_acldata;
+ *length = cb->cb_acl_node->z_size;
+}
+
+int
+zfs_acl_chown_setattr(znode_t *zp)
+{
+ int error;
+ zfs_acl_t *aclp;
+
+ if (ZTOZSB(zp)->z_acl_type == ZFS_ACLTYPE_POSIXACL)
+ return (0);
+
+ ASSERT(MUTEX_HELD(&zp->z_lock));
+ ASSERT(MUTEX_HELD(&zp->z_acl_lock));
+
+ error = zfs_acl_node_read(zp, B_TRUE, &aclp, B_FALSE);
+ if (error == 0 && aclp->z_acl_count > 0)
+ zp->z_mode = ZTOI(zp)->i_mode =
+ zfs_mode_compute(zp->z_mode, aclp,
+ &zp->z_pflags, KUID_TO_SUID(ZTOI(zp)->i_uid),
+ KGID_TO_SGID(ZTOI(zp)->i_gid));
+
+ /*
+ * Some ZFS implementations (ZEVO) create neither a ZNODE_ACL
+ * nor a DACL_ACES SA in which case ENOENT is returned from
+ * zfs_acl_node_read() when the SA can't be located.
+ * Allow chown/chgrp to succeed in these cases rather than
+ * returning an error that makes no sense in the context of
+ * the caller.
+ */
+ if (error == ENOENT)
+ return (0);
+
+ return (error);
+}
+
+static void
+acl_trivial_access_masks(mode_t mode, uint32_t *allow0, uint32_t *deny1,
+ uint32_t *deny2, uint32_t *owner, uint32_t *group, uint32_t *everyone)
+{
+ *deny1 = *deny2 = *allow0 = *group = 0;
+
+ if (!(mode & S_IRUSR) && (mode & (S_IRGRP|S_IROTH)))
+ *deny1 |= ACE_READ_DATA;
+ if (!(mode & S_IWUSR) && (mode & (S_IWGRP|S_IWOTH)))
+ *deny1 |= ACE_WRITE_DATA;
+ if (!(mode & S_IXUSR) && (mode & (S_IXGRP|S_IXOTH)))
+ *deny1 |= ACE_EXECUTE;
+
+ if (!(mode & S_IRGRP) && (mode & S_IROTH))
+	*deny2 |= ACE_READ_DATA;
+ if (!(mode & S_IWGRP) && (mode & S_IWOTH))
+ *deny2 |= ACE_WRITE_DATA;
+ if (!(mode & S_IXGRP) && (mode & S_IXOTH))
+ *deny2 |= ACE_EXECUTE;
+
+ if ((mode & S_IRUSR) && (!(mode & S_IRGRP) && (mode & S_IROTH)))
+ *allow0 |= ACE_READ_DATA;
+ if ((mode & S_IWUSR) && (!(mode & S_IWGRP) && (mode & S_IWOTH)))
+ *allow0 |= ACE_WRITE_DATA;
+ if ((mode & S_IXUSR) && (!(mode & S_IXGRP) && (mode & S_IXOTH)))
+ *allow0 |= ACE_EXECUTE;
+
+ *owner = ACE_WRITE_ATTRIBUTES|ACE_WRITE_OWNER|ACE_WRITE_ACL|
+ ACE_WRITE_NAMED_ATTRS|ACE_READ_ACL|ACE_READ_ATTRIBUTES|
+ ACE_READ_NAMED_ATTRS|ACE_SYNCHRONIZE;
+ if (mode & S_IRUSR)
+ *owner |= ACE_READ_DATA;
+ if (mode & S_IWUSR)
+ *owner |= ACE_WRITE_DATA|ACE_APPEND_DATA;
+ if (mode & S_IXUSR)
+ *owner |= ACE_EXECUTE;
+
+	*group = ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_READ_NAMED_ATTRS|
+ ACE_SYNCHRONIZE;
+ if (mode & S_IRGRP)
+ *group |= ACE_READ_DATA;
+ if (mode & S_IWGRP)
+ *group |= ACE_WRITE_DATA|ACE_APPEND_DATA;
+ if (mode & S_IXGRP)
+ *group |= ACE_EXECUTE;
+
+	*everyone = ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_READ_NAMED_ATTRS|
+ ACE_SYNCHRONIZE;
+ if (mode & S_IROTH)
+ *everyone |= ACE_READ_DATA;
+ if (mode & S_IWOTH)
+ *everyone |= ACE_WRITE_DATA|ACE_APPEND_DATA;
+ if (mode & S_IXOTH)
+ *everyone |= ACE_EXECUTE;
+}
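+
+/*
+ * Example (illustrative): for mode 0604 (rw----r--) the masks above are
+ * allow0 = ACE_READ_DATA (an explicit owner@ allow, so the owner keeps
+ * read), deny1 = 0, and deny2 = ACE_READ_DATA (a group@ deny, since the
+ * group lacks a read bit that everyone@ has).  zfs_acl_chmod() emits
+ * these entries ahead of the final owner@/group@/everyone@ allows so
+ * that first-match-wins evaluation reproduces the POSIX semantics.
+ */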
+
+/*
+ * ace_trivial:
+ * determine whether an ace_t ACL is trivial
+ *
+ * A trivial ACL is composed of only owner@, group@, and everyone@
+ * entries.  It cannot have read_acl denied, and
+ * write_owner/write_acl/write_attributes may appear only on the
+ * owner@ entry.
+ */
+static int
+ace_trivial_common(void *acep, int aclcnt,
+ uint64_t (*walk)(void *, uint64_t, int aclcnt,
+ uint16_t *, uint16_t *, uint32_t *))
+{
+ uint16_t flags;
+ uint32_t mask;
+ uint16_t type;
+ uint64_t cookie = 0;
+
+ while ((cookie = walk(acep, cookie, aclcnt, &flags, &type, &mask))) {
+ switch (flags & ACE_TYPE_FLAGS) {
+ case ACE_OWNER:
+ case ACE_GROUP|ACE_IDENTIFIER_GROUP:
+ case ACE_EVERYONE:
+ break;
+ default:
+ return (1);
+ }
+
+ if (flags & (ACE_FILE_INHERIT_ACE|
+ ACE_DIRECTORY_INHERIT_ACE|ACE_NO_PROPAGATE_INHERIT_ACE|
+ ACE_INHERIT_ONLY_ACE))
+ return (1);
+
+ /*
+		 * Don't allow anybody to deny reading basic
+		 * attributes or a file's ACL.
+ */
+ if ((mask & (ACE_READ_ACL|ACE_READ_ATTRIBUTES)) &&
+ (type == ACE_ACCESS_DENIED_ACE_TYPE))
+ return (1);
+
+ /*
+ * Delete permissions are never set by default
+ */
+ if (mask & (ACE_DELETE|ACE_DELETE_CHILD))
+ return (1);
+ /*
+		 * Only allow owner@ to have
+		 * write_acl/write_owner/write_attributes/write_xattr.
+ */
+ if (type == ACE_ACCESS_ALLOWED_ACE_TYPE &&
+ (!(flags & ACE_OWNER) && (mask &
+		    (ACE_WRITE_OWNER|ACE_WRITE_ACL|ACE_WRITE_ATTRIBUTES|
+ ACE_WRITE_NAMED_ATTRS))))
+ return (1);
+
+ }
+
+ return (0);
+}
+
+/*
+ * Common code for setting ACLs.
+ *
+ * This function is called from zfs_mode_update, zfs_perm_init,
+ * and zfs_setacl.
+ */
+int
+zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, cred_t *cr, dmu_tx_t *tx)
+{
+ int error;
+ zfsvfs_t *zfsvfs = ZTOZSB(zp);
+ dmu_object_type_t otype;
+ zfs_acl_locator_cb_t locate = { 0 };
+ uint64_t mode;
+ sa_bulk_attr_t bulk[5];
+ uint64_t ctime[2];
+ int count = 0;
+ zfs_acl_phys_t acl_phys;
+
+ mode = zp->z_mode;
+
+ mode = zfs_mode_compute(mode, aclp, &zp->z_pflags,
+ KUID_TO_SUID(ZTOI(zp)->i_uid), KGID_TO_SGID(ZTOI(zp)->i_gid));
+
+ zp->z_mode = ZTOI(zp)->i_mode = mode;
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
+ &mode, sizeof (mode));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+ &zp->z_pflags, sizeof (zp->z_pflags));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
+ &ctime, sizeof (ctime));
+
+ if (zp->z_acl_cached) {
+ zfs_acl_free(zp->z_acl_cached);
+ zp->z_acl_cached = NULL;
+ }
+
+ /*
+ * Upgrade needed?
+ */
+ if (!zfsvfs->z_use_fuids) {
+ otype = DMU_OT_OLDACL;
+ } else {
+ if ((aclp->z_version == ZFS_ACL_VERSION_INITIAL) &&
+ (zfsvfs->z_version >= ZPL_VERSION_FUID))
+ zfs_acl_xform(zp, aclp, cr);
+ ASSERT(aclp->z_version >= ZFS_ACL_VERSION_FUID);
+ otype = DMU_OT_ACL;
+ }
+
+ /*
+	 * Arrgh, we have to handle the old on-disk format
+	 * as well as the newer (preferred) SA format.
+ */
+
+ if (zp->z_is_sa) { /* the easy case, just update the ACL attribute */
+ locate.cb_aclp = aclp;
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_DACL_ACES(zfsvfs),
+ zfs_acl_data_locator, &locate, aclp->z_acl_bytes);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_DACL_COUNT(zfsvfs),
+ NULL, &aclp->z_acl_count, sizeof (uint64_t));
+ } else { /* Painful legacy way */
+ zfs_acl_node_t *aclnode;
+ uint64_t off = 0;
+ uint64_t aoid;
+
+ if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(zfsvfs),
+ &acl_phys, sizeof (acl_phys))) != 0)
+ return (error);
+
+ aoid = acl_phys.z_acl_extern_obj;
+
+ if (aclp->z_acl_bytes > ZFS_ACE_SPACE) {
+ /*
+ * If ACL was previously external and we are now
+ * converting to new ACL format then release old
+ * ACL object and create a new one.
+ */
+ if (aoid &&
+ aclp->z_version != acl_phys.z_acl_version) {
+ error = dmu_object_free(zfsvfs->z_os, aoid, tx);
+ if (error)
+ return (error);
+ aoid = 0;
+ }
+ if (aoid == 0) {
+ aoid = dmu_object_alloc(zfsvfs->z_os,
+ otype, aclp->z_acl_bytes,
+ otype == DMU_OT_ACL ?
+ DMU_OT_SYSACL : DMU_OT_NONE,
+ otype == DMU_OT_ACL ?
+ DN_OLD_MAX_BONUSLEN : 0, tx);
+ } else {
+ (void) dmu_object_set_blocksize(zfsvfs->z_os,
+ aoid, aclp->z_acl_bytes, 0, tx);
+ }
+ acl_phys.z_acl_extern_obj = aoid;
+ for (aclnode = list_head(&aclp->z_acl); aclnode;
+ aclnode = list_next(&aclp->z_acl, aclnode)) {
+ if (aclnode->z_ace_count == 0)
+ continue;
+ dmu_write(zfsvfs->z_os, aoid, off,
+ aclnode->z_size, aclnode->z_acldata, tx);
+ off += aclnode->z_size;
+ }
+ } else {
+ void *start = acl_phys.z_ace_data;
+ /*
+			 * Migrating back to an embedded ACL?
+ */
+ if (acl_phys.z_acl_extern_obj) {
+ error = dmu_object_free(zfsvfs->z_os,
+ acl_phys.z_acl_extern_obj, tx);
+ if (error)
+ return (error);
+ acl_phys.z_acl_extern_obj = 0;
+ }
+
+ for (aclnode = list_head(&aclp->z_acl); aclnode;
+ aclnode = list_next(&aclp->z_acl, aclnode)) {
+ if (aclnode->z_ace_count == 0)
+ continue;
+ bcopy(aclnode->z_acldata, start,
+ aclnode->z_size);
+ start = (caddr_t)start + aclnode->z_size;
+ }
+ }
+ /*
+		 * If the old version, swap count/bytes to match the old
+ * layout of znode_acl_phys_t.
+ */
+ if (aclp->z_version == ZFS_ACL_VERSION_INITIAL) {
+ acl_phys.z_acl_size = aclp->z_acl_count;
+ acl_phys.z_acl_count = aclp->z_acl_bytes;
+ } else {
+ acl_phys.z_acl_size = aclp->z_acl_bytes;
+ acl_phys.z_acl_count = aclp->z_acl_count;
+ }
+ acl_phys.z_acl_version = aclp->z_version;
+
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ZNODE_ACL(zfsvfs), NULL,
+ &acl_phys, sizeof (acl_phys));
+ }
+
+ /*
+ * Replace ACL wide bits, but first clear them.
+ */
+ zp->z_pflags &= ~ZFS_ACL_WIDE_FLAGS;
+
+ zp->z_pflags |= aclp->z_hints;
+
+ if (ace_trivial_common(aclp, 0, zfs_ace_walk) == 0)
+ zp->z_pflags |= ZFS_ACL_TRIVIAL;
+
+ zfs_tstamp_update_setup(zp, STATE_CHANGED, NULL, ctime);
+ return (sa_bulk_update(zp->z_sa_hdl, bulk, count, tx));
+}
+
+static void
+zfs_acl_chmod(zfsvfs_t *zfsvfs, uint64_t mode, zfs_acl_t *aclp)
+{
+ void *acep = NULL;
+ uint64_t who;
+ int new_count, new_bytes;
+ int ace_size;
+ int entry_type;
+ uint16_t iflags, type;
+ uint32_t access_mask;
+ zfs_acl_node_t *newnode;
+ size_t abstract_size = aclp->z_ops->ace_abstract_size();
+ void *zacep;
+ uint32_t owner, group, everyone;
+ uint32_t deny1, deny2, allow0;
+
+ new_count = new_bytes = 0;
+
+ acl_trivial_access_masks((mode_t)mode, &allow0, &deny1, &deny2,
+ &owner, &group, &everyone);
+
+ newnode = zfs_acl_node_alloc((abstract_size * 6) + aclp->z_acl_bytes);
+
+ zacep = newnode->z_acldata;
+ if (allow0) {
+ zfs_set_ace(aclp, zacep, allow0, ALLOW, -1, ACE_OWNER);
+ zacep = (void *)((uintptr_t)zacep + abstract_size);
+ new_count++;
+ new_bytes += abstract_size;
+ }
+ if (deny1) {
+ zfs_set_ace(aclp, zacep, deny1, DENY, -1, ACE_OWNER);
+ zacep = (void *)((uintptr_t)zacep + abstract_size);
+ new_count++;
+ new_bytes += abstract_size;
+ }
+ if (deny2) {
+ zfs_set_ace(aclp, zacep, deny2, DENY, -1, OWNING_GROUP);
+ zacep = (void *)((uintptr_t)zacep + abstract_size);
+ new_count++;
+ new_bytes += abstract_size;
+ }
+
+ while ((acep = zfs_acl_next_ace(aclp, acep, &who, &access_mask,
+ &iflags, &type))) {
+ uint16_t inherit_flags;
+
+ entry_type = (iflags & ACE_TYPE_FLAGS);
+ inherit_flags = (iflags & ALL_INHERIT);
+
+ if ((entry_type == ACE_OWNER || entry_type == ACE_EVERYONE ||
+ (entry_type == OWNING_GROUP)) &&
+ ((inherit_flags & ACE_INHERIT_ONLY_ACE) == 0)) {
+ continue;
+ }
+
+ if ((type != ALLOW && type != DENY) ||
+ (inherit_flags & ACE_INHERIT_ONLY_ACE)) {
+ if (inherit_flags)
+ aclp->z_hints |= ZFS_INHERIT_ACE;
+ switch (type) {
+ case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE:
+ case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE:
+ case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE:
+ case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE:
+ aclp->z_hints |= ZFS_ACL_OBJ_ACE;
+ break;
+ }
+ } else {
+
+ /*
+ * Limit permissions to be no greater than
+ * group permissions
+ */
+ if (zfsvfs->z_acl_inherit == ZFS_ACL_RESTRICTED) {
+ if (!(mode & S_IRGRP))
+ access_mask &= ~ACE_READ_DATA;
+ if (!(mode & S_IWGRP))
+ access_mask &=
+ ~(ACE_WRITE_DATA|ACE_APPEND_DATA);
+ if (!(mode & S_IXGRP))
+ access_mask &= ~ACE_EXECUTE;
+ access_mask &=
+ ~(ACE_WRITE_OWNER|ACE_WRITE_ACL|
+ ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS);
+ }
+ }
+ zfs_set_ace(aclp, zacep, access_mask, type, who, iflags);
+ ace_size = aclp->z_ops->ace_size(acep);
+ zacep = (void *)((uintptr_t)zacep + ace_size);
+ new_count++;
+ new_bytes += ace_size;
+ }
+ zfs_set_ace(aclp, zacep, owner, 0, -1, ACE_OWNER);
+ zacep = (void *)((uintptr_t)zacep + abstract_size);
+ zfs_set_ace(aclp, zacep, group, 0, -1, OWNING_GROUP);
+ zacep = (void *)((uintptr_t)zacep + abstract_size);
+ zfs_set_ace(aclp, zacep, everyone, 0, -1, ACE_EVERYONE);
+
+ new_count += 3;
+ new_bytes += abstract_size * 3;
+ zfs_acl_release_nodes(aclp);
+ aclp->z_acl_count = new_count;
+ aclp->z_acl_bytes = new_bytes;
+ newnode->z_ace_count = new_count;
+ newnode->z_size = new_bytes;
+ list_insert_tail(&aclp->z_acl, newnode);
+}
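+
+/*
+ * The ACL rebuilt above therefore has the shape (entries present only
+ * when their mask is non-zero): the abstract owner@ allow and
+ * owner@/group@ deny entries, then any surviving or inherit-only
+ * entries from the old ACL, and finally the owner@/group@/everyone@
+ * allow entries.
+ */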
+
+void
+zfs_acl_chmod_setattr(znode_t *zp, zfs_acl_t **aclp, uint64_t mode)
+{
+ mutex_enter(&zp->z_acl_lock);
+ mutex_enter(&zp->z_lock);
+ *aclp = zfs_acl_alloc(zfs_acl_version_zp(zp));
+ (*aclp)->z_hints = zp->z_pflags & V4_ACL_WIDE_FLAGS;
+ zfs_acl_chmod(ZTOZSB(zp), mode, *aclp);
+ mutex_exit(&zp->z_lock);
+ mutex_exit(&zp->z_acl_lock);
+ ASSERT(*aclp);
+}
+
+/*
+ * strip off write_owner and write_acl
+ */
+static void
+zfs_restricted_update(zfsvfs_t *zfsvfs, zfs_acl_t *aclp, void *acep)
+{
+ uint32_t mask = aclp->z_ops->ace_mask_get(acep);
+
+ if ((zfsvfs->z_acl_inherit == ZFS_ACL_RESTRICTED) &&
+ (aclp->z_ops->ace_type_get(acep) == ALLOW)) {
+ mask &= ~RESTRICTED_CLEAR;
+ aclp->z_ops->ace_mask_set(acep, mask);
+ }
+}
+
+/*
+ * Should ACE be inherited?
+ */
+static int
+zfs_ace_can_use(umode_t obj_mode, uint16_t acep_flags)
+{
+ int iflags = (acep_flags & 0xf);
+
+ if (S_ISDIR(obj_mode) && (iflags & ACE_DIRECTORY_INHERIT_ACE))
+ return (1);
+ else if (iflags & ACE_FILE_INHERIT_ACE)
+ return (!(S_ISDIR(obj_mode) &&
+ (iflags & ACE_NO_PROPAGATE_INHERIT_ACE)));
+ return (0);
+}
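+
+/*
+ * Note that a directory can also "use" an ACE that has only
+ * ACE_FILE_INHERIT_ACE set (unless ACE_NO_PROPAGATE_INHERIT_ACE is
+ * also set): zfs_acl_inherit() carries such an ACE as inherit-only so
+ * that files created below the new directory can still inherit it.
+ */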
+
+/*
+ * inherit inheritable ACEs from parent
+ */
+static zfs_acl_t *
+zfs_acl_inherit(zfsvfs_t *zfsvfs, umode_t obj_mode, zfs_acl_t *paclp,
+ uint64_t mode, boolean_t *need_chmod)
+{
+ void *pacep;
+ void *acep;
+ zfs_acl_node_t *aclnode;
+ zfs_acl_t *aclp = NULL;
+ uint64_t who;
+ uint32_t access_mask;
+ uint16_t iflags, newflags, type;
+ size_t ace_size;
+ void *data1, *data2;
+ size_t data1sz, data2sz;
+ boolean_t vdir = S_ISDIR(obj_mode);
+ boolean_t vreg = S_ISREG(obj_mode);
+ boolean_t passthrough, passthrough_x, noallow;
+
+ passthrough_x =
+ zfsvfs->z_acl_inherit == ZFS_ACL_PASSTHROUGH_X;
+ passthrough = passthrough_x ||
+ zfsvfs->z_acl_inherit == ZFS_ACL_PASSTHROUGH;
+ noallow =
+ zfsvfs->z_acl_inherit == ZFS_ACL_NOALLOW;
+
+ *need_chmod = B_TRUE;
+ pacep = NULL;
+ aclp = zfs_acl_alloc(paclp->z_version);
+ if (zfsvfs->z_acl_inherit == ZFS_ACL_DISCARD || S_ISLNK(obj_mode))
+ return (aclp);
+ while ((pacep = zfs_acl_next_ace(paclp, pacep, &who,
+ &access_mask, &iflags, &type))) {
+
+ /*
+ * don't inherit bogus ACEs
+ */
+ if (!zfs_acl_valid_ace_type(type, iflags))
+ continue;
+
+ if (noallow && type == ALLOW)
+ continue;
+
+ ace_size = aclp->z_ops->ace_size(pacep);
+
+ if (!zfs_ace_can_use(obj_mode, iflags))
+ continue;
+
+ /*
+		 * If an inheritable owner@, group@, or everyone@ ACE is
+		 * passed through, then zfs_acl_chmod() isn't needed.
+ */
+ if (passthrough &&
+ ((iflags & (ACE_OWNER|ACE_EVERYONE)) ||
+ ((iflags & OWNING_GROUP) ==
+ OWNING_GROUP)) && (vreg || (vdir && (iflags &
+ ACE_DIRECTORY_INHERIT_ACE)))) {
+ *need_chmod = B_FALSE;
+ }
+
+ if (!vdir && passthrough_x &&
+ ((mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0)) {
+ access_mask &= ~ACE_EXECUTE;
+ }
+
+ aclnode = zfs_acl_node_alloc(ace_size);
+ list_insert_tail(&aclp->z_acl, aclnode);
+ acep = aclnode->z_acldata;
+
+ zfs_set_ace(aclp, acep, access_mask, type,
+ who, iflags|ACE_INHERITED_ACE);
+
+ /*
+ * Copy special opaque data if any
+ */
+ if ((data1sz = paclp->z_ops->ace_data(pacep, &data1)) != 0) {
+ VERIFY((data2sz = aclp->z_ops->ace_data(acep,
+ &data2)) == data1sz);
+ bcopy(data1, data2, data2sz);
+ }
+
+ aclp->z_acl_count++;
+ aclnode->z_ace_count++;
+ aclp->z_acl_bytes += aclnode->z_size;
+ newflags = aclp->z_ops->ace_flags_get(acep);
+
+ if (vdir)
+ aclp->z_hints |= ZFS_INHERIT_ACE;
+
+ if ((iflags & ACE_NO_PROPAGATE_INHERIT_ACE) || !vdir) {
+ newflags &= ~ALL_INHERIT;
+ aclp->z_ops->ace_flags_set(acep,
+ newflags|ACE_INHERITED_ACE);
+ zfs_restricted_update(zfsvfs, aclp, acep);
+ continue;
+ }
+
+ ASSERT(vdir);
+
+ /*
+ * If only FILE_INHERIT is set then turn on
+ * inherit_only
+ */
+ if ((iflags & (ACE_FILE_INHERIT_ACE |
+ ACE_DIRECTORY_INHERIT_ACE)) == ACE_FILE_INHERIT_ACE) {
+ newflags |= ACE_INHERIT_ONLY_ACE;
+ aclp->z_ops->ace_flags_set(acep,
+ newflags|ACE_INHERITED_ACE);
+ } else {
+ newflags &= ~ACE_INHERIT_ONLY_ACE;
+ aclp->z_ops->ace_flags_set(acep,
+ newflags|ACE_INHERITED_ACE);
+ }
+ }
+ return (aclp);
+}
+
+/*
+ * Create file system object initial permissions
+ * including inheritable ACEs.
+ */
+int
+zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr,
+ vsecattr_t *vsecp, zfs_acl_ids_t *acl_ids)
+{
+ int error;
+ zfsvfs_t *zfsvfs = ZTOZSB(dzp);
+ zfs_acl_t *paclp;
+ gid_t gid = vap->va_gid;
+ boolean_t need_chmod = B_TRUE;
+ boolean_t inherited = B_FALSE;
+
+ bzero(acl_ids, sizeof (zfs_acl_ids_t));
+ acl_ids->z_mode = vap->va_mode;
+
+ if (vsecp)
+ if ((error = zfs_vsec_2_aclp(zfsvfs, vap->va_mode, vsecp,
+ cr, &acl_ids->z_fuidp, &acl_ids->z_aclp)) != 0)
+ return (error);
+
+ acl_ids->z_fuid = vap->va_uid;
+ acl_ids->z_fgid = vap->va_gid;
+#ifdef HAVE_KSID
+ /*
+ * Determine uid and gid.
+ */
+ if ((flag & IS_ROOT_NODE) || zfsvfs->z_replay ||
+ ((flag & IS_XATTR) && (S_ISDIR(vap->va_mode)))) {
+ acl_ids->z_fuid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_uid,
+ cr, ZFS_OWNER, &acl_ids->z_fuidp);
+ acl_ids->z_fgid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid,
+ cr, ZFS_GROUP, &acl_ids->z_fuidp);
+ gid = vap->va_gid;
+ } else {
+ acl_ids->z_fuid = zfs_fuid_create_cred(zfsvfs, ZFS_OWNER,
+ cr, &acl_ids->z_fuidp);
+ acl_ids->z_fgid = 0;
+ if (vap->va_mask & AT_GID) {
+ acl_ids->z_fgid = zfs_fuid_create(zfsvfs,
+ (uint64_t)vap->va_gid,
+ cr, ZFS_GROUP, &acl_ids->z_fuidp);
+ gid = vap->va_gid;
+ if (acl_ids->z_fgid != KGID_TO_SGID(ZTOI(dzp)->i_gid) &&
+ !groupmember(vap->va_gid, cr) &&
+ secpolicy_vnode_create_gid(cr) != 0)
+ acl_ids->z_fgid = 0;
+ }
+ if (acl_ids->z_fgid == 0) {
+ if (dzp->z_mode & S_ISGID) {
+ char *domain;
+ uint32_t rid;
+
+ acl_ids->z_fgid = KGID_TO_SGID(
+ ZTOI(dzp)->i_gid);
+ gid = zfs_fuid_map_id(zfsvfs, acl_ids->z_fgid,
+ cr, ZFS_GROUP);
+
+ if (zfsvfs->z_use_fuids &&
+ IS_EPHEMERAL(acl_ids->z_fgid)) {
+ domain = zfs_fuid_idx_domain(
+ &zfsvfs->z_fuid_idx,
+ FUID_INDEX(acl_ids->z_fgid));
+ rid = FUID_RID(acl_ids->z_fgid);
+ zfs_fuid_node_add(&acl_ids->z_fuidp,
+ domain, rid,
+ FUID_INDEX(acl_ids->z_fgid),
+ acl_ids->z_fgid, ZFS_GROUP);
+ }
+ } else {
+ acl_ids->z_fgid = zfs_fuid_create_cred(zfsvfs,
+ ZFS_GROUP, cr, &acl_ids->z_fuidp);
+ gid = crgetgid(cr);
+ }
+ }
+ }
+#endif /* HAVE_KSID */
+
+ /*
+ * If we're creating a directory, and the parent directory has the
+	 * set-GID bit set, set it on the new directory.
+ * Otherwise, if the user is neither privileged nor a member of the
+ * file's new group, clear the file's set-GID bit.
+ */
+
+ if (!(flag & IS_ROOT_NODE) && (dzp->z_mode & S_ISGID) &&
+ (S_ISDIR(vap->va_mode))) {
+ acl_ids->z_mode |= S_ISGID;
+ } else {
+ if ((acl_ids->z_mode & S_ISGID) &&
+ secpolicy_vnode_setids_setgids(cr, gid) != 0)
+ acl_ids->z_mode &= ~S_ISGID;
+ }
+
+ if (acl_ids->z_aclp == NULL) {
+ mutex_enter(&dzp->z_acl_lock);
+ mutex_enter(&dzp->z_lock);
+ if (!(flag & IS_ROOT_NODE) && (S_ISDIR(ZTOI(dzp)->i_mode) &&
+ (dzp->z_pflags & ZFS_INHERIT_ACE)) &&
+ !(dzp->z_pflags & ZFS_XATTR)) {
+ VERIFY(0 == zfs_acl_node_read(dzp, B_TRUE,
+ &paclp, B_FALSE));
+ acl_ids->z_aclp = zfs_acl_inherit(zfsvfs,
+ vap->va_mode, paclp, acl_ids->z_mode, &need_chmod);
+ inherited = B_TRUE;
+ } else {
+ acl_ids->z_aclp =
+ zfs_acl_alloc(zfs_acl_version_zp(dzp));
+ acl_ids->z_aclp->z_hints |= ZFS_ACL_TRIVIAL;
+ }
+ mutex_exit(&dzp->z_lock);
+ mutex_exit(&dzp->z_acl_lock);
+ if (need_chmod) {
+ acl_ids->z_aclp->z_hints |= S_ISDIR(vap->va_mode) ?
+ ZFS_ACL_AUTO_INHERIT : 0;
+ zfs_acl_chmod(zfsvfs, acl_ids->z_mode, acl_ids->z_aclp);
+ }
+ }
+
+ if (inherited || vsecp) {
+ acl_ids->z_mode = zfs_mode_compute(acl_ids->z_mode,
+ acl_ids->z_aclp, &acl_ids->z_aclp->z_hints,
+ acl_ids->z_fuid, acl_ids->z_fgid);
+ if (ace_trivial_common(acl_ids->z_aclp, 0, zfs_ace_walk) == 0)
+ acl_ids->z_aclp->z_hints |= ZFS_ACL_TRIVIAL;
+ }
+
+ return (0);
+}
+
+/*
+ * Free ACL and fuid_infop, but not the acl_ids structure
+ */
+void
+zfs_acl_ids_free(zfs_acl_ids_t *acl_ids)
+{
+ if (acl_ids->z_aclp)
+ zfs_acl_free(acl_ids->z_aclp);
+ if (acl_ids->z_fuidp)
+ zfs_fuid_info_free(acl_ids->z_fuidp);
+ acl_ids->z_aclp = NULL;
+ acl_ids->z_fuidp = NULL;
+}
+
+boolean_t
+zfs_acl_ids_overquota(zfsvfs_t *zv, zfs_acl_ids_t *acl_ids, uint64_t projid)
+{
+ return (zfs_id_overquota(zv, DMU_USERUSED_OBJECT, acl_ids->z_fuid) ||
+ zfs_id_overquota(zv, DMU_GROUPUSED_OBJECT, acl_ids->z_fgid) ||
+ (projid != ZFS_DEFAULT_PROJID && projid != ZFS_INVALID_PROJID &&
+ zfs_id_overquota(zv, DMU_PROJECTUSED_OBJECT, projid)));
+}
+
+/*
+ * Retrieve a file's ACL
+ */
+int
+zfs_getacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr)
+{
+ zfs_acl_t *aclp;
+ ulong_t mask;
+ int error;
+ int count = 0;
+ int largeace = 0;
+
+ mask = vsecp->vsa_mask & (VSA_ACE | VSA_ACECNT |
+ VSA_ACE_ACLFLAGS | VSA_ACE_ALLTYPES);
+
+ if (mask == 0)
+ return (SET_ERROR(ENOSYS));
+
+ if ((error = zfs_zaccess(zp, ACE_READ_ACL, 0, skipaclchk, cr)))
+ return (error);
+
+ mutex_enter(&zp->z_acl_lock);
+
+ error = zfs_acl_node_read(zp, B_FALSE, &aclp, B_FALSE);
+ if (error != 0) {
+ mutex_exit(&zp->z_acl_lock);
+ return (error);
+ }
+
+ /*
+ * Scan ACL to determine number of ACEs
+ */
+ if ((zp->z_pflags & ZFS_ACL_OBJ_ACE) && !(mask & VSA_ACE_ALLTYPES)) {
+ void *zacep = NULL;
+ uint64_t who;
+ uint32_t access_mask;
+ uint16_t type, iflags;
+
+ while ((zacep = zfs_acl_next_ace(aclp, zacep,
+ &who, &access_mask, &iflags, &type))) {
+ switch (type) {
+ case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE:
+ case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE:
+ case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE:
+ case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE:
+ largeace++;
+ continue;
+ default:
+ count++;
+ }
+ }
+ vsecp->vsa_aclcnt = count;
+ } else
+ count = (int)aclp->z_acl_count;
+
+ if (mask & VSA_ACECNT) {
+ vsecp->vsa_aclcnt = count;
+ }
+
+ if (mask & VSA_ACE) {
+ size_t aclsz;
+
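+		/*
+		 * Object ACEs (counted in largeace above) use the larger
+		 * ace_object_t layout; all other ACE types use a plain
+		 * fixed-size ace_t.
+		 */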
+ aclsz = count * sizeof (ace_t) +
+ sizeof (ace_object_t) * largeace;
+
+ vsecp->vsa_aclentp = kmem_alloc(aclsz, KM_SLEEP);
+ vsecp->vsa_aclentsz = aclsz;
+
+ if (aclp->z_version == ZFS_ACL_VERSION_FUID)
+ zfs_copy_fuid_2_ace(ZTOZSB(zp), aclp, cr,
+ vsecp->vsa_aclentp, !(mask & VSA_ACE_ALLTYPES));
+ else {
+ zfs_acl_node_t *aclnode;
+ void *start = vsecp->vsa_aclentp;
+
+ for (aclnode = list_head(&aclp->z_acl); aclnode;
+ aclnode = list_next(&aclp->z_acl, aclnode)) {
+ bcopy(aclnode->z_acldata, start,
+ aclnode->z_size);
+ start = (caddr_t)start + aclnode->z_size;
+ }
+ ASSERT((caddr_t)start - (caddr_t)vsecp->vsa_aclentp ==
+ aclp->z_acl_bytes);
+ }
+ }
+ if (mask & VSA_ACE_ACLFLAGS) {
+ vsecp->vsa_aclflags = 0;
+ if (zp->z_pflags & ZFS_ACL_DEFAULTED)
+ vsecp->vsa_aclflags |= ACL_DEFAULTED;
+ if (zp->z_pflags & ZFS_ACL_PROTECTED)
+ vsecp->vsa_aclflags |= ACL_PROTECTED;
+ if (zp->z_pflags & ZFS_ACL_AUTO_INHERIT)
+ vsecp->vsa_aclflags |= ACL_AUTO_INHERIT;
+ }
+
+ mutex_exit(&zp->z_acl_lock);
+
+ return (0);
+}
+
+int
+zfs_vsec_2_aclp(zfsvfs_t *zfsvfs, umode_t obj_mode,
+ vsecattr_t *vsecp, cred_t *cr, zfs_fuid_info_t **fuidp, zfs_acl_t **zaclp)
+{
+ zfs_acl_t *aclp;
+ zfs_acl_node_t *aclnode;
+ int aclcnt = vsecp->vsa_aclcnt;
+ int error;
+
+ if (vsecp->vsa_aclcnt > MAX_ACL_ENTRIES || vsecp->vsa_aclcnt <= 0)
+ return (SET_ERROR(EINVAL));
+
+ aclp = zfs_acl_alloc(zfs_acl_version(zfsvfs->z_version));
+
+ aclp->z_hints = 0;
+ aclnode = zfs_acl_node_alloc(aclcnt * sizeof (zfs_object_ace_t));
+ if (aclp->z_version == ZFS_ACL_VERSION_INITIAL) {
+ if ((error = zfs_copy_ace_2_oldace(obj_mode, aclp,
+ (ace_t *)vsecp->vsa_aclentp, aclnode->z_acldata,
+ aclcnt, &aclnode->z_size)) != 0) {
+ zfs_acl_free(aclp);
+ zfs_acl_node_free(aclnode);
+ return (error);
+ }
+ } else {
+ if ((error = zfs_copy_ace_2_fuid(zfsvfs, obj_mode, aclp,
+ vsecp->vsa_aclentp, aclnode->z_acldata, aclcnt,
+ &aclnode->z_size, fuidp, cr)) != 0) {
+ zfs_acl_free(aclp);
+ zfs_acl_node_free(aclnode);
+ return (error);
+ }
+ }
+ aclp->z_acl_bytes = aclnode->z_size;
+ aclnode->z_ace_count = aclcnt;
+ aclp->z_acl_count = aclcnt;
+ list_insert_head(&aclp->z_acl, aclnode);
+
+ /*
+ * If flags are being set then add them to z_hints
+ */
+ if (vsecp->vsa_mask & VSA_ACE_ACLFLAGS) {
+ if (vsecp->vsa_aclflags & ACL_PROTECTED)
+ aclp->z_hints |= ZFS_ACL_PROTECTED;
+ if (vsecp->vsa_aclflags & ACL_DEFAULTED)
+ aclp->z_hints |= ZFS_ACL_DEFAULTED;
+ if (vsecp->vsa_aclflags & ACL_AUTO_INHERIT)
+ aclp->z_hints |= ZFS_ACL_AUTO_INHERIT;
+ }
+
+ *zaclp = aclp;
+
+ return (0);
+}
+
+/*
+ * Set a file's ACL
+ */
+int
+zfs_setacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr)
+{
+ zfsvfs_t *zfsvfs = ZTOZSB(zp);
+ zilog_t *zilog = zfsvfs->z_log;
+ ulong_t mask = vsecp->vsa_mask & (VSA_ACE | VSA_ACECNT);
+ dmu_tx_t *tx;
+ int error;
+ zfs_acl_t *aclp;
+ zfs_fuid_info_t *fuidp = NULL;
+ boolean_t fuid_dirtied;
+ uint64_t acl_obj;
+
+ if (mask == 0)
+ return (SET_ERROR(ENOSYS));
+
+ if (zp->z_pflags & ZFS_IMMUTABLE)
+ return (SET_ERROR(EPERM));
+
+ if ((error = zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr)))
+ return (error);
+
+ error = zfs_vsec_2_aclp(zfsvfs, ZTOI(zp)->i_mode, vsecp, cr, &fuidp,
+ &aclp);
+ if (error)
+ return (error);
+
+ /*
+ * If ACL wide flags aren't being set then preserve any
+ * existing flags.
+ */
+ if (!(vsecp->vsa_mask & VSA_ACE_ACLFLAGS)) {
+ aclp->z_hints |=
+ (zp->z_pflags & V4_ACL_WIDE_FLAGS);
+ }
+top:
+ mutex_enter(&zp->z_acl_lock);
+ mutex_enter(&zp->z_lock);
+
+ tx = dmu_tx_create(zfsvfs->z_os);
+
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
+
+ fuid_dirtied = zfsvfs->z_fuid_dirty;
+ if (fuid_dirtied)
+ zfs_fuid_txhold(zfsvfs, tx);
+
+ /*
+	 * If this is the old version, the ACL won't fit in the bonus
+	 * buffer, and we aren't upgrading, then take out the necessary
+	 * DMU holds.
+ */
+
+ if ((acl_obj = zfs_external_acl(zp)) != 0) {
+ if (zfsvfs->z_version >= ZPL_VERSION_FUID &&
+ zfs_znode_acl_version(zp) <= ZFS_ACL_VERSION_INITIAL) {
+ dmu_tx_hold_free(tx, acl_obj, 0,
+ DMU_OBJECT_END);
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
+ aclp->z_acl_bytes);
+ } else {
+ dmu_tx_hold_write(tx, acl_obj, 0, aclp->z_acl_bytes);
+ }
+ } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) {
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, aclp->z_acl_bytes);
+ }
+
+ zfs_sa_upgrade_txholds(tx, zp);
+ error = dmu_tx_assign(tx, TXG_NOWAIT);
+ if (error) {
+ mutex_exit(&zp->z_acl_lock);
+ mutex_exit(&zp->z_lock);
+
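+		/*
+		 * ERESTART means the transaction could not be assigned
+		 * to the currently open txg; wait for the next txg with
+		 * the locks dropped, then retry the whole operation.
+		 */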
+ if (error == ERESTART) {
+ dmu_tx_wait(tx);
+ dmu_tx_abort(tx);
+ goto top;
+ }
+ dmu_tx_abort(tx);
+ zfs_acl_free(aclp);
+ return (error);
+ }
+
+ error = zfs_aclset_common(zp, aclp, cr, tx);
+ ASSERT(error == 0);
+ ASSERT(zp->z_acl_cached == NULL);
+ zp->z_acl_cached = aclp;
+
+ if (fuid_dirtied)
+ zfs_fuid_sync(zfsvfs, tx);
+
+ zfs_log_acl(zilog, tx, zp, vsecp, fuidp);
+
+ if (fuidp)
+ zfs_fuid_info_free(fuidp);
+ dmu_tx_commit(tx);
+
+ mutex_exit(&zp->z_lock);
+ mutex_exit(&zp->z_acl_lock);
+
+ return (error);
+}
+
+/*
+ * Check accesses of interest (AoI) against attributes of the dataset
+ * such as read-only. Returns zero if no AoI conflict with dataset
+ * attributes, otherwise an appropriate errno is returned.
+ */
+static int
+zfs_zaccess_dataset_check(znode_t *zp, uint32_t v4_mode)
+{
+ if ((v4_mode & WRITE_MASK) && (zfs_is_readonly(ZTOZSB(zp))) &&
+ (!S_ISDEV(ZTOI(zp)->i_mode) ||
+ (S_ISDEV(ZTOI(zp)->i_mode) && (v4_mode & WRITE_MASK_ATTRS)))) {
+ return (SET_ERROR(EROFS));
+ }
+
+ /*
+ * Only check for READONLY on non-directories.
+ */
+ if ((v4_mode & WRITE_MASK_DATA) &&
+ ((!S_ISDIR(ZTOI(zp)->i_mode) &&
+ (zp->z_pflags & (ZFS_READONLY | ZFS_IMMUTABLE))) ||
+ (S_ISDIR(ZTOI(zp)->i_mode) &&
+ (zp->z_pflags & ZFS_IMMUTABLE)))) {
+ return (SET_ERROR(EPERM));
+ }
+
+ if ((v4_mode & (ACE_DELETE | ACE_DELETE_CHILD)) &&
+ (zp->z_pflags & ZFS_NOUNLINK)) {
+ return (SET_ERROR(EPERM));
+ }
+
+ if (((v4_mode & (ACE_READ_DATA|ACE_EXECUTE)) &&
+ (zp->z_pflags & ZFS_AV_QUARANTINED))) {
+ return (SET_ERROR(EACCES));
+ }
+
+ return (0);
+}
+
+/*
+ * The primary usage of this function is to loop through all of the
+ * ACEs in the znode, determining what accesses of interest (AoI) to
+ * the caller are allowed or denied. The AoI are expressed as bits in
+ * the working_mode parameter. As each ACE is processed, bits covered
+ * by that ACE are removed from the working_mode. This removal
+ * facilitates two things. The first is that when the working mode is
+ * empty (= 0), we know we've looked at all the AoI. The second is
+ * that the ACE interpretation rules don't allow a later ACE to undo
+ * something granted or denied by an earlier ACE. Removing the
+ * discovered access or denial enforces this rule. At the end of
+ * processing the ACEs, all AoI that were found to be denied are
+ * placed into the working_mode, giving the caller a mask of denied
+ * accesses. Returns:
+ * 0 if all AoI granted
+ * EACCES if the denied mask is non-zero
+ * other error if abnormal failure (e.g., IO error)
+ *
+ * A secondary usage of the function is to determine if any of the
+ * AoI are granted. If an ACE grants any access in
+ * the working_mode, we immediately short circuit out of the function.
+ * This mode is chosen by setting anyaccess to B_TRUE. The
+ * working_mode is not a denied access mask upon exit if the function
+ * is used in this manner.
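+ *
+ * Worked example (illustrative): a caller requesting ACE_READ_DATA |
+ * ACE_WRITE_DATA starts with both bits in the working_mode.  An early
+ * matching DENY ACE for write_data moves ACE_WRITE_DATA into the deny
+ * mask and clears it from the working_mode; a later matching ALLOW ACE
+ * for read_data clears ACE_READ_DATA.  The working_mode is now empty,
+ * so the loop exits, the deny mask is folded back in, and EACCES is
+ * returned with working_mode == ACE_WRITE_DATA.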
+ */
+static int
+zfs_zaccess_aces_check(znode_t *zp, uint32_t *working_mode,
+ boolean_t anyaccess, cred_t *cr)
+{
+ zfsvfs_t *zfsvfs = ZTOZSB(zp);
+ zfs_acl_t *aclp;
+ int error;
+ uid_t uid = crgetuid(cr);
+ uint64_t who;
+ uint16_t type, iflags;
+ uint16_t entry_type;
+ uint32_t access_mask;
+ uint32_t deny_mask = 0;
+ zfs_ace_hdr_t *acep = NULL;
+ boolean_t checkit;
+ uid_t gowner;
+ uid_t fowner;
+
+ zfs_fuid_map_ids(zp, cr, &fowner, &gowner);
+
+ mutex_enter(&zp->z_acl_lock);
+
+ error = zfs_acl_node_read(zp, B_FALSE, &aclp, B_FALSE);
+ if (error != 0) {
+ mutex_exit(&zp->z_acl_lock);
+ return (error);
+ }
+
+ ASSERT(zp->z_acl_cached);
+
+ while ((acep = zfs_acl_next_ace(aclp, acep, &who, &access_mask,
+ &iflags, &type))) {
+ uint32_t mask_matched;
+
+ if (!zfs_acl_valid_ace_type(type, iflags))
+ continue;
+
+ if (S_ISDIR(ZTOI(zp)->i_mode) &&
+ (iflags & ACE_INHERIT_ONLY_ACE))
+ continue;
+
+ /* Skip ACE if it does not affect any AoI */
+ mask_matched = (access_mask & *working_mode);
+ if (!mask_matched)
+ continue;
+
+ entry_type = (iflags & ACE_TYPE_FLAGS);
+
+ checkit = B_FALSE;
+
+ switch (entry_type) {
+ case ACE_OWNER:
+ if (uid == fowner)
+ checkit = B_TRUE;
+ break;
+ case OWNING_GROUP:
+ who = gowner;
+ /*FALLTHROUGH*/
+ case ACE_IDENTIFIER_GROUP:
+ checkit = zfs_groupmember(zfsvfs, who, cr);
+ break;
+ case ACE_EVERYONE:
+ checkit = B_TRUE;
+ break;
+
+ /* USER Entry */
+ default:
+ if (entry_type == 0) {
+ uid_t newid;
+
+ newid = zfs_fuid_map_id(zfsvfs, who, cr,
+ ZFS_ACE_USER);
+ if (newid != IDMAP_WK_CREATOR_OWNER_UID &&
+ uid == newid)
+ checkit = B_TRUE;
+ break;
+ } else {
+ mutex_exit(&zp->z_acl_lock);
+ return (SET_ERROR(EIO));
+ }
+ }
+
+ if (checkit) {
+ if (type == DENY) {
+ DTRACE_PROBE3(zfs__ace__denies,
+ znode_t *, zp,
+ zfs_ace_hdr_t *, acep,
+ uint32_t, mask_matched);
+ deny_mask |= mask_matched;
+ } else {
+ DTRACE_PROBE3(zfs__ace__allows,
+ znode_t *, zp,
+ zfs_ace_hdr_t *, acep,
+ uint32_t, mask_matched);
+ if (anyaccess) {
+ mutex_exit(&zp->z_acl_lock);
+ return (0);
+ }
+ }
+ *working_mode &= ~mask_matched;
+ }
+
+ /* Are we done? */
+ if (*working_mode == 0)
+ break;
+ }
+
+ mutex_exit(&zp->z_acl_lock);
+
+ /* Put the found 'denies' back on the working mode */
+ if (deny_mask) {
+ *working_mode |= deny_mask;
+ return (SET_ERROR(EACCES));
+ } else if (*working_mode) {
+ return (-1);
+ }
+
+ return (0);
+}
+
+/*
+ * Return true if any access whatsoever is granted; we don't actually
+ * care which access it is.
+ */
+boolean_t
+zfs_has_access(znode_t *zp, cred_t *cr)
+{
+ uint32_t have = ACE_ALL_PERMS;
+
+ if (zfs_zaccess_aces_check(zp, &have, B_TRUE, cr) != 0) {
+ uid_t owner;
+
+ owner = zfs_fuid_map_id(ZTOZSB(zp),
+ KUID_TO_SUID(ZTOI(zp)->i_uid), cr, ZFS_OWNER);
+ return (secpolicy_vnode_any_access(cr, ZTOI(zp), owner) == 0);
+ }
+ return (B_TRUE);
+}
+
+static int
+zfs_zaccess_common(znode_t *zp, uint32_t v4_mode, uint32_t *working_mode,
+ boolean_t *check_privs, boolean_t skipaclchk, cred_t *cr)
+{
+ zfsvfs_t *zfsvfs = ZTOZSB(zp);
+ int err;
+
+ *working_mode = v4_mode;
+ *check_privs = B_TRUE;
+
+ /*
+ * Short circuit empty requests
+ */
+ if (v4_mode == 0 || zfsvfs->z_replay) {
+ *working_mode = 0;
+ return (0);
+ }
+
+ if ((err = zfs_zaccess_dataset_check(zp, v4_mode)) != 0) {
+ *check_privs = B_FALSE;
+ return (err);
+ }
+
+ /*
+ * The caller requested that the ACL check be skipped. This
+ * would only happen if the caller checked VOP_ACCESS() with a
+	 * 32-bit ACE mask and already had the appropriate permissions.
+ */
+ if (skipaclchk) {
+ *working_mode = 0;
+ return (0);
+ }
+
+ return (zfs_zaccess_aces_check(zp, working_mode, B_FALSE, cr));
+}
+
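+/*
+ * For V_APPEND requests a denied ACE_WRITE_DATA alone is acceptable as
+ * long as ACE_APPEND_DATA is granted; any other denied bit left in the
+ * working mode still fails with EACCES.
+ */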
+static int
+zfs_zaccess_append(znode_t *zp, uint32_t *working_mode, boolean_t *check_privs,
+ cred_t *cr)
+{
+ if (*working_mode != ACE_WRITE_DATA)
+ return (SET_ERROR(EACCES));
+
+ return (zfs_zaccess_common(zp, ACE_APPEND_DATA, working_mode,
+ check_privs, B_FALSE, cr));
+}
+
+int
+zfs_fastaccesschk_execute(znode_t *zdp, cred_t *cr)
+{
+ boolean_t owner = B_FALSE;
+ boolean_t groupmbr = B_FALSE;
+ boolean_t is_attr;
+ uid_t uid = crgetuid(cr);
+ int error;
+
+ if (zdp->z_pflags & ZFS_AV_QUARANTINED)
+ return (SET_ERROR(EACCES));
+
+ is_attr = ((zdp->z_pflags & ZFS_XATTR) &&
+ (S_ISDIR(ZTOI(zdp)->i_mode)));
+ if (is_attr)
+ goto slow;
+
+
+ mutex_enter(&zdp->z_acl_lock);
+
+ if (zdp->z_pflags & ZFS_NO_EXECS_DENIED) {
+ mutex_exit(&zdp->z_acl_lock);
+ return (0);
+ }
+
+ if (KUID_TO_SUID(ZTOI(zdp)->i_uid) != 0 ||
+ KGID_TO_SGID(ZTOI(zdp)->i_gid) != 0) {
+ mutex_exit(&zdp->z_acl_lock);
+ goto slow;
+ }
+
+ if (uid == KUID_TO_SUID(ZTOI(zdp)->i_uid)) {
+ owner = B_TRUE;
+ if (zdp->z_mode & S_IXUSR) {
+ mutex_exit(&zdp->z_acl_lock);
+ return (0);
+ } else {
+ mutex_exit(&zdp->z_acl_lock);
+ goto slow;
+ }
+ }
+ if (groupmember(KGID_TO_SGID(ZTOI(zdp)->i_gid), cr)) {
+ groupmbr = B_TRUE;
+ if (zdp->z_mode & S_IXGRP) {
+ mutex_exit(&zdp->z_acl_lock);
+ return (0);
+ } else {
+ mutex_exit(&zdp->z_acl_lock);
+ goto slow;
+ }
+ }
+ if (!owner && !groupmbr) {
+ if (zdp->z_mode & S_IXOTH) {
+ mutex_exit(&zdp->z_acl_lock);
+ return (0);
+ }
+ }
+
+ mutex_exit(&zdp->z_acl_lock);
+
+slow:
+ DTRACE_PROBE(zfs__fastpath__execute__access__miss);
+ ZFS_ENTER(ZTOZSB(zdp));
+ error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr);
+ ZFS_EXIT(ZTOZSB(zdp));
+ return (error);
+}
+
+/*
+ * Determine whether access should be granted or denied.
+ *
+ * The least privilege subsystem is always consulted, as a basic
+ * privilege can define any form of access.
+ */
+int
+zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr)
+{
+ uint32_t working_mode;
+ int error;
+ int is_attr;
+ boolean_t check_privs;
+ znode_t *xzp;
+ znode_t *check_zp = zp;
+ mode_t needed_bits;
+ uid_t owner;
+
+ is_attr = ((zp->z_pflags & ZFS_XATTR) && S_ISDIR(ZTOI(zp)->i_mode));
+
+ /*
+ * If attribute then validate against base file
+ */
+ if (is_attr) {
+ if ((error = zfs_zget(ZTOZSB(zp),
+ zp->z_xattr_parent, &xzp)) != 0) {
+ return (error);
+ }
+
+ check_zp = xzp;
+
+ /*
+ * fixup mode to map to xattr perms
+ */
+
+ if (mode & (ACE_WRITE_DATA|ACE_APPEND_DATA)) {
+ mode &= ~(ACE_WRITE_DATA|ACE_APPEND_DATA);
+ mode |= ACE_WRITE_NAMED_ATTRS;
+ }
+
+ if (mode & (ACE_READ_DATA|ACE_EXECUTE)) {
+ mode &= ~(ACE_READ_DATA|ACE_EXECUTE);
+ mode |= ACE_READ_NAMED_ATTRS;
+ }
+ }
+
+ owner = zfs_fuid_map_id(ZTOZSB(zp), KUID_TO_SUID(ZTOI(zp)->i_uid),
+ cr, ZFS_OWNER);
+ /*
+	 * Map the requested v4 access bits to the standard mode bits
+	 * S_IRUSR|S_IWUSR|S_IXUSR in needed_bits, then call
+	 * secpolicy_vnode_access2() with (needed_bits & ~checkmode)
+	 * and needed_bits.
+ */
+ needed_bits = 0;
+
+ working_mode = mode;
+ if ((working_mode & (ACE_READ_ACL|ACE_READ_ATTRIBUTES)) &&
+ owner == crgetuid(cr))
+ working_mode &= ~(ACE_READ_ACL|ACE_READ_ATTRIBUTES);
+
+ if (working_mode & (ACE_READ_DATA|ACE_READ_NAMED_ATTRS|
+ ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_SYNCHRONIZE))
+ needed_bits |= S_IRUSR;
+ if (working_mode & (ACE_WRITE_DATA|ACE_WRITE_NAMED_ATTRS|
+ ACE_APPEND_DATA|ACE_WRITE_ATTRIBUTES|ACE_SYNCHRONIZE))
+ needed_bits |= S_IWUSR;
+ if (working_mode & ACE_EXECUTE)
+ needed_bits |= S_IXUSR;
+
+ if ((error = zfs_zaccess_common(check_zp, mode, &working_mode,
+ &check_privs, skipaclchk, cr)) == 0) {
+ if (is_attr)
+ iput(ZTOI(xzp));
+ return (secpolicy_vnode_access2(cr, ZTOI(zp), owner,
+ needed_bits, needed_bits));
+ }
+
+ if (error && !check_privs) {
+ if (is_attr)
+ iput(ZTOI(xzp));
+ return (error);
+ }
+
+ if (error && (flags & V_APPEND)) {
+ error = zfs_zaccess_append(zp, &working_mode, &check_privs, cr);
+ }
+
+ if (error && check_privs) {
+ mode_t checkmode = 0;
+
+ /*
+ * First check for implicit owner permission on
+ * read_acl/read_attributes
+ */
+
+ error = 0;
+ ASSERT(working_mode != 0);
+
+ if ((working_mode & (ACE_READ_ACL|ACE_READ_ATTRIBUTES) &&
+ owner == crgetuid(cr)))
+ working_mode &= ~(ACE_READ_ACL|ACE_READ_ATTRIBUTES);
+
+ if (working_mode & (ACE_READ_DATA|ACE_READ_NAMED_ATTRS|
+ ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_SYNCHRONIZE))
+ checkmode |= S_IRUSR;
+ if (working_mode & (ACE_WRITE_DATA|ACE_WRITE_NAMED_ATTRS|
+ ACE_APPEND_DATA|ACE_WRITE_ATTRIBUTES|ACE_SYNCHRONIZE))
+ checkmode |= S_IWUSR;
+ if (working_mode & ACE_EXECUTE)
+ checkmode |= S_IXUSR;
+
+ error = secpolicy_vnode_access2(cr, ZTOI(check_zp), owner,
+ needed_bits & ~checkmode, needed_bits);
+
+ if (error == 0 && (working_mode & ACE_WRITE_OWNER))
+ error = secpolicy_vnode_chown(cr, owner);
+ if (error == 0 && (working_mode & ACE_WRITE_ACL))
+ error = secpolicy_vnode_setdac(cr, owner);
+
+ if (error == 0 && (working_mode &
+ (ACE_DELETE|ACE_DELETE_CHILD)))
+ error = secpolicy_vnode_remove(cr);
+
+ if (error == 0 && (working_mode & ACE_SYNCHRONIZE)) {
+ error = secpolicy_vnode_chown(cr, owner);
+ }
+ if (error == 0) {
+ /*
+ * See if any bits other than those already checked
+ * for are still present. If so then return EACCES
+ */
+ if (working_mode & ~(ZFS_CHECKED_MASKS)) {
+ error = SET_ERROR(EACCES);
+ }
+ }
+ } else if (error == 0) {
+ error = secpolicy_vnode_access2(cr, ZTOI(zp), owner,
+ needed_bits, needed_bits);
+ }
+
+ if (is_attr)
+ iput(ZTOI(xzp));
+
+ return (error);
+}
+
+/*
+ * Translate traditional unix S_IRUSR/S_IWUSR/S_IXUSR mode into
+ * native ACL format and call zfs_zaccess()
+ */
+int
+zfs_zaccess_rwx(znode_t *zp, mode_t mode, int flags, cred_t *cr)
+{
+ return (zfs_zaccess(zp, zfs_unix_to_v4(mode >> 6), flags, B_FALSE, cr));
+}
+
+/*
+ * Access function for secpolicy_vnode_setattr
+ */
+int
+zfs_zaccess_unix(znode_t *zp, mode_t mode, cred_t *cr)
+{
+ int v4_mode = zfs_unix_to_v4(mode >> 6);
+
+ return (zfs_zaccess(zp, v4_mode, 0, B_FALSE, cr));
+}
+
+static int
+zfs_delete_final_check(znode_t *zp, znode_t *dzp,
+ mode_t available_perms, cred_t *cr)
+{
+ int error;
+ uid_t downer;
+
+ downer = zfs_fuid_map_id(ZTOZSB(dzp), KUID_TO_SUID(ZTOI(dzp)->i_uid),
+ cr, ZFS_OWNER);
+
+ error = secpolicy_vnode_access2(cr, ZTOI(dzp),
+ downer, available_perms, S_IWUSR|S_IXUSR);
+
+ if (error == 0)
+ error = zfs_sticky_remove_access(dzp, zp, cr);
+
+ return (error);
+}
+
+/*
+ * Determine whether access should be granted or denied, without
+ * consulting the least privilege subsystem.
+ *
+ * The following chart is the recommended NFSv4 enforcement for the
+ * ability to delete an object.
+ *
+ * -------------------------------------------------------
+ * |   Parent Dir  |           Target Object Permissions |
+ * |  permissions  |                                     |
+ * -------------------------------------------------------
+ * |               | ACL Allows | ACL Denies| Delete     |
+ * |               |  Delete    |  Delete   | unspecified|
+ * -------------------------------------------------------
+ * |  ACL Allows   | Permit     | Permit    | Permit     |
+ * |  DELETE_CHILD |                                     |
+ * -------------------------------------------------------
+ * |  ACL Denies   | Permit     | Deny      | Deny       |
+ * |  DELETE_CHILD |            |           |            |
+ * -------------------------------------------------------
+ * |  ACL specifies|            |           |            |
+ * |  only allow   | Permit     | Permit    | Permit     |
+ * |  write and    |            |           |            |
+ * |  execute      |            |           |            |
+ * -------------------------------------------------------
+ * |  ACL denies   |            |           |            |
+ * |  write and    | Permit     | Deny      | Deny       |
+ * |  execute      |            |           |            |
+ * -------------------------------------------------------
+ *    ^
+ *    |
+ *    No search privilege, can't even look up file?
+ *
+ */
+int
+zfs_zaccess_delete(znode_t *dzp, znode_t *zp, cred_t *cr)
+{
+ uint32_t dzp_working_mode = 0;
+ uint32_t zp_working_mode = 0;
+ int dzp_error, zp_error;
+ mode_t available_perms;
+ boolean_t dzpcheck_privs = B_TRUE;
+ boolean_t zpcheck_privs = B_TRUE;
+
+ /*
+ * We want specific DELETE permissions to
+ * take precedence over WRITE/EXECUTE. We don't
+ * want an ACL such as this to mess us up.
+ * user:joe:write_data:deny,user:joe:delete:allow
+ *
+ * However, deny permissions may ultimately be overridden
+ * by secpolicy_vnode_access().
+ *
+ * We will ask for all of the necessary permissions and then
+ * look at the working modes from the directory and target object
+ * to determine what was found.
+ */
+
+ if (zp->z_pflags & (ZFS_IMMUTABLE | ZFS_NOUNLINK))
+ return (SET_ERROR(EPERM));
+
+ /*
+ * First row
+ * If the directory permissions allow the delete, we are done.
+ */
+ if ((dzp_error = zfs_zaccess_common(dzp, ACE_DELETE_CHILD,
+ &dzp_working_mode, &dzpcheck_privs, B_FALSE, cr)) == 0)
+ return (0);
+
+ /*
+ * If target object has delete permission then we are done
+ */
+ if ((zp_error = zfs_zaccess_common(zp, ACE_DELETE, &zp_working_mode,
+ &zpcheck_privs, B_FALSE, cr)) == 0)
+ return (0);
+
+ ASSERT(dzp_error && zp_error);
+
+ if (!dzpcheck_privs)
+ return (dzp_error);
+ if (!zpcheck_privs)
+ return (zp_error);
+
+ /*
+ * Second row
+ *
+ * If directory returns EACCES then delete_child was denied
+ * due to deny delete_child. In this case send the request through
+ * secpolicy_vnode_remove(). We don't use zfs_delete_final_check()
+ * since that *could* allow the delete based on write/execute permission
+ * and we want delete permissions to override write/execute.
+ */
+
+ if (dzp_error == EACCES)
+ return (secpolicy_vnode_remove(cr));
+
+ /*
+ * Third Row
+ * only need to see if we have write/execute on directory.
+ */
+
+ dzp_error = zfs_zaccess_common(dzp, ACE_EXECUTE|ACE_WRITE_DATA,
+ &dzp_working_mode, &dzpcheck_privs, B_FALSE, cr);
+
+ if (dzp_error != 0 && !dzpcheck_privs)
+ return (dzp_error);
+
+ /*
+ * Fourth row
+ */
+
+ available_perms = (dzp_working_mode & ACE_WRITE_DATA) ? 0 : S_IWUSR;
+ available_perms |= (dzp_working_mode & ACE_EXECUTE) ? 0 : S_IXUSR;
+
+ return (zfs_delete_final_check(zp, dzp, available_perms, cr));
+
+}
+
+int
+zfs_zaccess_rename(znode_t *sdzp, znode_t *szp, znode_t *tdzp,
+ znode_t *tzp, cred_t *cr)
+{
+ int add_perm;
+ int error;
+
+ if (szp->z_pflags & ZFS_AV_QUARANTINED)
+ return (SET_ERROR(EACCES));
+
+ add_perm = S_ISDIR(ZTOI(szp)->i_mode) ?
+ ACE_ADD_SUBDIRECTORY : ACE_ADD_FILE;
+
+ /*
+	 * Rename permission is a combination of delete permission and
+	 * add file/subdir permission.
+ */
+
+ /*
+	 * First make sure the delete portion is allowed.
+	 *
+	 * If that succeeds, then check for add_file/add_subdir permissions.
+ */
+
+ if ((error = zfs_zaccess_delete(sdzp, szp, cr)))
+ return (error);
+
+ /*
+	 * If we have a tzp, make sure we can delete it.
+ */
+ if (tzp) {
+ if ((error = zfs_zaccess_delete(tdzp, tzp, cr)))
+ return (error);
+ }
+
+ /*
+ * Now check for add permissions
+ */
+ error = zfs_zaccess(tdzp, add_perm, 0, B_FALSE, cr);
+
+ return (error);
+}
diff --git a/module/os/linux/zfs/zfs_ctldir.c b/module/os/linux/zfs/zfs_ctldir.c
new file mode 100644
index 000000000..1e61ef06d
--- /dev/null
+++ b/module/os/linux/zfs/zfs_ctldir.c
@@ -0,0 +1,1240 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ *
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (C) 2011 Lawrence Livermore National Security, LLC.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * LLNL-CODE-403049.
+ * Rewritten for Linux by:
+ * Rohan Puri <[email protected]>
+ * Brian Behlendorf <[email protected]>
+ * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright 2015, OmniTI Computer Consulting, Inc. All rights reserved.
+ * Copyright (c) 2018 George Melikov. All Rights Reserved.
+ * Copyright (c) 2019 Datto, Inc. All rights reserved.
+ */
+
+/*
+ * ZFS control directory (a.k.a. ".zfs")
+ *
+ * This directory provides a common location for all ZFS meta-objects.
+ * Currently, this is only the 'snapshot' and 'shares' directory, but this may
+ * expand in the future. The elements are built dynamically, as the hierarchy
+ * does not actually exist on disk.
+ *
+ * For 'snapshot', we don't want to have all snapshots always mounted, because
+ * this would take up a huge amount of space in /etc/mnttab. We have three
+ * types of objects:
+ *
+ * ctldir ------> snapshotdir -------> snapshot
+ * |
+ * |
+ * V
+ * mounted fs
+ *
+ * The 'snapshot' node contains just enough information to lookup '..' and act
+ * as a mountpoint for the snapshot. Whenever we lookup a specific snapshot, we
+ * perform an automount of the underlying filesystem and return the
+ * corresponding inode.
+ *
+ * All mounts are handled automatically by a user mode helper which invokes
+ * the mount procedure. Unmounts are handled by allowing the mount
+ * point to expire so the kernel may automatically unmount it.
+ *
+ * The '.zfs', '.zfs/snapshot', and all directories created under
+ * '.zfs/snapshot' (ie: '.zfs/snapshot/<snapname>') all share the same
+ * zfsvfs_t as the head filesystem (what '.zfs' lives under).
+ *
+ * File systems mounted on top of the '.zfs/snapshot/<snapname>' paths
+ * (ie: snapshots) are complete ZFS filesystems and have their own unique
+ * zfsvfs_t. However, the fsid reported by these mounts will be the same
+ * as that used by the parent zfsvfs_t to make NFS happy.
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/time.h>
+#include <sys/sysmacros.h>
+#include <sys/pathname.h>
+#include <sys/vfs.h>
+#include <sys/zfs_ctldir.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zfs_vfsops.h>
+#include <sys/zfs_vnops.h>
+#include <sys/stat.h>
+#include <sys/dmu.h>
+#include <sys/dmu_objset.h>
+#include <sys/dsl_destroy.h>
+#include <sys/dsl_deleg.h>
+#include <sys/zpl.h>
+#include <sys/mntent.h>
+#include "zfs_namecheck.h"
+
+/*
+ * Two AVL trees are maintained which contain all currently automounted
+ * snapshots. Every automounted snapshot maps to a single zfs_snapentry_t
+ * entry which MUST:
+ *
+ * - be attached to both trees, and
+ * - be unique, no duplicate entries are allowed.
+ *
+ * The zfs_snapshots_by_name tree is indexed by the full dataset name
+ * while the zfs_snapshots_by_objsetid tree is indexed by the unique
+ * objsetid. This allows for fast lookups either by name or objsetid.
+ */
+static avl_tree_t zfs_snapshots_by_name;
+static avl_tree_t zfs_snapshots_by_objsetid;
+static krwlock_t zfs_snapshot_lock;
+
+/*
+ * Control Directory Tunables (.zfs)
+ */
+int zfs_expire_snapshot = ZFSCTL_EXPIRE_SNAPSHOT;
+int zfs_admin_snapshot = 0;
+
+typedef struct {
+ char *se_name; /* full snapshot name */
+ char *se_path; /* full mount path */
+ spa_t *se_spa; /* pool spa */
+ uint64_t se_objsetid; /* snapshot objset id */
+ struct dentry *se_root_dentry; /* snapshot root dentry */
+ taskqid_t se_taskqid; /* scheduled unmount taskqid */
+ avl_node_t se_node_name; /* zfs_snapshots_by_name link */
+ avl_node_t se_node_objsetid; /* zfs_snapshots_by_objsetid link */
+ zfs_refcount_t se_refcount; /* reference count */
+} zfs_snapentry_t;
+
+static void zfsctl_snapshot_unmount_delay_impl(zfs_snapentry_t *se, int delay);
+
+/*
+ * Allocate a new zfs_snapentry_t, being careful to make a copy of
+ * the snapshot name and provided mount point.  No reference is taken.
+ */
+static zfs_snapentry_t *
+zfsctl_snapshot_alloc(char *full_name, char *full_path, spa_t *spa,
+ uint64_t objsetid, struct dentry *root_dentry)
+{
+ zfs_snapentry_t *se;
+
+ se = kmem_zalloc(sizeof (zfs_snapentry_t), KM_SLEEP);
+
+ se->se_name = strdup(full_name);
+ se->se_path = strdup(full_path);
+ se->se_spa = spa;
+ se->se_objsetid = objsetid;
+ se->se_root_dentry = root_dentry;
+ se->se_taskqid = TASKQID_INVALID;
+
+ zfs_refcount_create(&se->se_refcount);
+
+ return (se);
+}
+
+/*
+ * Free a zfs_snapentry_t; the caller must ensure there are no active
+ * references.
+ */
+static void
+zfsctl_snapshot_free(zfs_snapentry_t *se)
+{
+ zfs_refcount_destroy(&se->se_refcount);
+ strfree(se->se_name);
+ strfree(se->se_path);
+
+ kmem_free(se, sizeof (zfs_snapentry_t));
+}
+
+/*
+ * Hold a reference on the zfs_snapentry_t.
+ */
+static void
+zfsctl_snapshot_hold(zfs_snapentry_t *se)
+{
+ zfs_refcount_add(&se->se_refcount, NULL);
+}
+
+/*
+ * Release a reference on the zfs_snapentry_t. When the number of
+ * references drops to zero the structure will be freed.
+ */
+static void
+zfsctl_snapshot_rele(zfs_snapentry_t *se)
+{
+ if (zfs_refcount_remove(&se->se_refcount, NULL) == 0)
+ zfsctl_snapshot_free(se);
+}
+
+/*
+ * Add a zfs_snapentry_t to both the zfs_snapshots_by_name and
+ * zfs_snapshots_by_objsetid trees. While the zfs_snapentry_t is part
+ * of the trees a reference is held.
+ */
+static void
+zfsctl_snapshot_add(zfs_snapentry_t *se)
+{
+ ASSERT(RW_WRITE_HELD(&zfs_snapshot_lock));
+ zfsctl_snapshot_hold(se);
+ avl_add(&zfs_snapshots_by_name, se);
+ avl_add(&zfs_snapshots_by_objsetid, se);
+}
+
+/*
+ * Remove a zfs_snapentry_t from both the zfs_snapshots_by_name and
+ * zfs_snapshots_by_objsetid trees. Upon removal a reference is dropped,
+ * this can result in the structure being freed if that was the last
+ * remaining reference.
+ */
+static void
+zfsctl_snapshot_remove(zfs_snapentry_t *se)
+{
+ ASSERT(RW_WRITE_HELD(&zfs_snapshot_lock));
+ avl_remove(&zfs_snapshots_by_name, se);
+ avl_remove(&zfs_snapshots_by_objsetid, se);
+ zfsctl_snapshot_rele(se);
+}
+
+/*
+ * Snapshot name comparison function for the zfs_snapshots_by_name tree.
+ */
+static int
+snapentry_compare_by_name(const void *a, const void *b)
+{
+ const zfs_snapentry_t *se_a = a;
+ const zfs_snapentry_t *se_b = b;
+ int ret;
+
+ ret = strcmp(se_a->se_name, se_b->se_name);
+
+ if (ret < 0)
+ return (-1);
+ else if (ret > 0)
+ return (1);
+ else
+ return (0);
+}
+
+/*
+ * Snapshot objsetid comparison function for the zfs_snapshots_by_objsetid
+ * tree.
+ */
+static int
+snapentry_compare_by_objsetid(const void *a, const void *b)
+{
+ const zfs_snapentry_t *se_a = a;
+ const zfs_snapentry_t *se_b = b;
+
+ if (se_a->se_spa != se_b->se_spa)
+ return ((ulong_t)se_a->se_spa < (ulong_t)se_b->se_spa ? -1 : 1);
+
+ if (se_a->se_objsetid < se_b->se_objsetid)
+ return (-1);
+ else if (se_a->se_objsetid > se_b->se_objsetid)
+ return (1);
+ else
+ return (0);
+}
+
+/*
+ * Find a zfs_snapentry_t in zfs_snapshots_by_name. If the snapname
+ * is found a pointer to the zfs_snapentry_t is returned and a reference
+ * taken on the structure. The caller is responsible for dropping the
+ * reference with zfsctl_snapshot_rele(). If the snapname is not found
+ * NULL will be returned.
+ */
+static zfs_snapentry_t *
+zfsctl_snapshot_find_by_name(char *snapname)
+{
+ zfs_snapentry_t *se, search;
+
+ ASSERT(RW_LOCK_HELD(&zfs_snapshot_lock));
+
+ search.se_name = snapname;
+ se = avl_find(&zfs_snapshots_by_name, &search, NULL);
+ if (se)
+ zfsctl_snapshot_hold(se);
+
+ return (se);
+}
+
+/*
+ * Find a zfs_snapentry_t in zfs_snapshots_by_objsetid given the objset id
+ * rather than the snapname. In all other respects it behaves the same
+ * as zfsctl_snapshot_find_by_name().
+ */
+static zfs_snapentry_t *
+zfsctl_snapshot_find_by_objsetid(spa_t *spa, uint64_t objsetid)
+{
+ zfs_snapentry_t *se, search;
+
+ ASSERT(RW_LOCK_HELD(&zfs_snapshot_lock));
+
+ search.se_spa = spa;
+ search.se_objsetid = objsetid;
+ se = avl_find(&zfs_snapshots_by_objsetid, &search, NULL);
+ if (se)
+ zfsctl_snapshot_hold(se);
+
+ return (se);
+}
+
+/*
+ * Rename a zfs_snapentry_t in the zfs_snapshots_by_name. The structure is
+ * removed, renamed, and added back to the new correct location in the tree.
+ */
+static int
+zfsctl_snapshot_rename(char *old_snapname, char *new_snapname)
+{
+ zfs_snapentry_t *se;
+
+ ASSERT(RW_WRITE_HELD(&zfs_snapshot_lock));
+
+ se = zfsctl_snapshot_find_by_name(old_snapname);
+ if (se == NULL)
+ return (SET_ERROR(ENOENT));
+
+ zfsctl_snapshot_remove(se);
+ strfree(se->se_name);
+ se->se_name = strdup(new_snapname);
+ zfsctl_snapshot_add(se);
+ zfsctl_snapshot_rele(se);
+
+ return (0);
+}
+
+/*
+ * Delayed task responsible for unmounting an expired automounted snapshot.
+ */
+static void
+snapentry_expire(void *data)
+{
+ zfs_snapentry_t *se = (zfs_snapentry_t *)data;
+ spa_t *spa = se->se_spa;
+ uint64_t objsetid = se->se_objsetid;
+
+ if (zfs_expire_snapshot <= 0) {
+ zfsctl_snapshot_rele(se);
+ return;
+ }
+
+ se->se_taskqid = TASKQID_INVALID;
+ (void) zfsctl_snapshot_unmount(se->se_name, MNT_EXPIRE);
+ zfsctl_snapshot_rele(se);
+
+ /*
+ * Reschedule the unmount if the zfs_snapentry_t wasn't removed.
+ * This can occur when the snapshot is busy.
+ */
+ rw_enter(&zfs_snapshot_lock, RW_READER);
+ if ((se = zfsctl_snapshot_find_by_objsetid(spa, objsetid)) != NULL) {
+ zfsctl_snapshot_unmount_delay_impl(se, zfs_expire_snapshot);
+ zfsctl_snapshot_rele(se);
+ }
+ rw_exit(&zfs_snapshot_lock);
+}
+
+/*
+ * Cancel an automatic unmount of a snapname. This callback is responsible
+ * for dropping the reference on the zfs_snapentry_t which was taken
+ * during dispatch.
+ */
+static void
+zfsctl_snapshot_unmount_cancel(zfs_snapentry_t *se)
+{
+ if (taskq_cancel_id(system_delay_taskq, se->se_taskqid) == 0) {
+ se->se_taskqid = TASKQID_INVALID;
+ zfsctl_snapshot_rele(se);
+ }
+}
+
+/*
+ * Dispatch the unmount task for delayed handling with a hold protecting it.
+ */
+static void
+zfsctl_snapshot_unmount_delay_impl(zfs_snapentry_t *se, int delay)
+{
+ ASSERT3S(se->se_taskqid, ==, TASKQID_INVALID);
+
+ if (delay <= 0)
+ return;
+
+ zfsctl_snapshot_hold(se);
+ se->se_taskqid = taskq_dispatch_delay(system_delay_taskq,
+ snapentry_expire, se, TQ_SLEEP, ddi_get_lbolt() + delay * HZ);
+}
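+
+/*
+ * For example: with delay = 300 seconds, the deadline passed to
+ * taskq_dispatch_delay() above is ddi_get_lbolt() + 300 * HZ clock ticks,
+ * i.e. five minutes from now.
+ */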
+
+/*
+ * Schedule an automatic unmount of objset id to occur in delay seconds from
+ * now. Any previous delayed unmount will be cancelled in favor of the
+ * updated deadline. A reference is taken by
+ * zfsctl_snapshot_find_by_objsetid() and held until the outstanding task is
+ * handled or cancelled.
+ */
+int
+zfsctl_snapshot_unmount_delay(spa_t *spa, uint64_t objsetid, int delay)
+{
+ zfs_snapentry_t *se;
+ int error = ENOENT;
+
+ rw_enter(&zfs_snapshot_lock, RW_READER);
+ if ((se = zfsctl_snapshot_find_by_objsetid(spa, objsetid)) != NULL) {
+ zfsctl_snapshot_unmount_cancel(se);
+ zfsctl_snapshot_unmount_delay_impl(se, delay);
+ zfsctl_snapshot_rele(se);
+ error = 0;
+ }
+ rw_exit(&zfs_snapshot_lock);
+
+ return (error);
+}
+
+/*
+ * Check if snapname is currently mounted. Returns B_TRUE when mounted
+ * and B_FALSE when unmounted.
+ */
+static boolean_t
+zfsctl_snapshot_ismounted(char *snapname)
+{
+ zfs_snapentry_t *se;
+ boolean_t ismounted = B_FALSE;
+
+ rw_enter(&zfs_snapshot_lock, RW_READER);
+ if ((se = zfsctl_snapshot_find_by_name(snapname)) != NULL) {
+ zfsctl_snapshot_rele(se);
+ ismounted = B_TRUE;
+ }
+ rw_exit(&zfs_snapshot_lock);
+
+ return (ismounted);
+}
+
+/*
+ * Check if the given inode is a part of the virtual .zfs directory.
+ */
+boolean_t
+zfsctl_is_node(struct inode *ip)
+{
+ return (ITOZ(ip)->z_is_ctldir);
+}
+
+/*
+ * Check if the given inode is a '.zfs/snapshot/<snapname>' directory.
+ */
+boolean_t
+zfsctl_is_snapdir(struct inode *ip)
+{
+ return (zfsctl_is_node(ip) && (ip->i_ino <= ZFSCTL_INO_SNAPDIRS));
+}
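+
+/*
+ * Illustrative note: snapshot directory inodes are numbered down from
+ * ZFSCTL_INO_SNAPDIRS, i.e. the snapshot with objsetid N is assigned inode
+ * number ZFSCTL_INO_SNAPDIRS - N (see zfsctl_snapdir_lookup()). Any ctldir
+ * inode at or below ZFSCTL_INO_SNAPDIRS is therefore a snapshot directory,
+ * and the objsetid can be recovered by subtraction, as done in
+ * zfsctl_snapdir_fid() and zfsctl_snapdir_vget().
+ */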
+
+/*
+ * Allocate a new inode with the passed id and ops.
+ */
+static struct inode *
+zfsctl_inode_alloc(zfsvfs_t *zfsvfs, uint64_t id,
+ const struct file_operations *fops, const struct inode_operations *ops)
+{
+ inode_timespec_t now;
+ struct inode *ip;
+ znode_t *zp;
+
+ ip = new_inode(zfsvfs->z_sb);
+ if (ip == NULL)
+ return (NULL);
+
+ now = current_time(ip);
+ zp = ITOZ(ip);
+ ASSERT3P(zp->z_dirlocks, ==, NULL);
+ ASSERT3P(zp->z_acl_cached, ==, NULL);
+ ASSERT3P(zp->z_xattr_cached, ==, NULL);
+ zp->z_id = id;
+ zp->z_unlinked = B_FALSE;
+ zp->z_atime_dirty = B_FALSE;
+ zp->z_zn_prefetch = B_FALSE;
+ zp->z_moved = B_FALSE;
+ zp->z_is_sa = B_FALSE;
+ zp->z_is_mapped = B_FALSE;
+ zp->z_is_ctldir = B_TRUE;
+ zp->z_is_stale = B_FALSE;
+ zp->z_sa_hdl = NULL;
+ zp->z_blksz = 0;
+ zp->z_seq = 0;
+ zp->z_mapcnt = 0;
+ zp->z_size = 0;
+ zp->z_pflags = 0;
+ zp->z_mode = 0;
+ zp->z_sync_cnt = 0;
+ ip->i_generation = 0;
+ ip->i_ino = id;
+ ip->i_mode = (S_IFDIR | S_IRWXUGO);
+ ip->i_uid = SUID_TO_KUID(0);
+ ip->i_gid = SGID_TO_KGID(0);
+ ip->i_blkbits = SPA_MINBLOCKSHIFT;
+ ip->i_atime = now;
+ ip->i_mtime = now;
+ ip->i_ctime = now;
+ ip->i_fop = fops;
+ ip->i_op = ops;
+#if defined(IOP_XATTR)
+ ip->i_opflags &= ~IOP_XATTR;
+#endif
+
+ if (insert_inode_locked(ip)) {
+ unlock_new_inode(ip);
+ iput(ip);
+ return (NULL);
+ }
+
+ mutex_enter(&zfsvfs->z_znodes_lock);
+ list_insert_tail(&zfsvfs->z_all_znodes, zp);
+ zfsvfs->z_nr_znodes++;
+ membar_producer();
+ mutex_exit(&zfsvfs->z_znodes_lock);
+
+ unlock_new_inode(ip);
+
+ return (ip);
+}
+
+/*
+ * Look up the inode with the given id; it will be allocated if needed.
+ */
+static struct inode *
+zfsctl_inode_lookup(zfsvfs_t *zfsvfs, uint64_t id,
+ const struct file_operations *fops, const struct inode_operations *ops)
+{
+ struct inode *ip = NULL;
+
+ while (ip == NULL) {
+ ip = ilookup(zfsvfs->z_sb, (unsigned long)id);
+ if (ip)
+ break;
+
+ /* May fail due to concurrent zfsctl_inode_alloc() */
+ ip = zfsctl_inode_alloc(zfsvfs, id, fops, ops);
+ }
+
+ return (ip);
+}
+
+/*
+ * Create the '.zfs' directory. This directory is cached as part of the VFS
+ * structure. This results in a hold on the zfsvfs_t. The code in zfs_umount()
+ * therefore checks against a vfs_count of 2 instead of 1. This reference
+ * is removed when the ctldir is destroyed in the unmount. All other entities
+ * under the '.zfs' directory are created dynamically as needed.
+ *
+ * Because the dynamically created '.zfs' directory entries assume the use
+ * of 64-bit inode numbers this support must be disabled on 32-bit systems.
+ */
+int
+zfsctl_create(zfsvfs_t *zfsvfs)
+{
+ ASSERT(zfsvfs->z_ctldir == NULL);
+
+ zfsvfs->z_ctldir = zfsctl_inode_alloc(zfsvfs, ZFSCTL_INO_ROOT,
+ &zpl_fops_root, &zpl_ops_root);
+ if (zfsvfs->z_ctldir == NULL)
+ return (SET_ERROR(ENOENT));
+
+ return (0);
+}
+
+/*
+ * Destroy the '.zfs' directory or, when unmounting a snapshot, remove its
+ * zfs_snapentry_t from both snapshot trees. Only called when the filesystem
+ * is unmounted.
+ */
+void
+zfsctl_destroy(zfsvfs_t *zfsvfs)
+{
+ if (zfsvfs->z_issnap) {
+ zfs_snapentry_t *se;
+ spa_t *spa = zfsvfs->z_os->os_spa;
+ uint64_t objsetid = dmu_objset_id(zfsvfs->z_os);
+
+ rw_enter(&zfs_snapshot_lock, RW_WRITER);
+ se = zfsctl_snapshot_find_by_objsetid(spa, objsetid);
+ if (se != NULL)
+ zfsctl_snapshot_remove(se);
+ rw_exit(&zfs_snapshot_lock);
+ if (se != NULL) {
+ zfsctl_snapshot_unmount_cancel(se);
+ zfsctl_snapshot_rele(se);
+ }
+ } else if (zfsvfs->z_ctldir) {
+ iput(zfsvfs->z_ctldir);
+ zfsvfs->z_ctldir = NULL;
+ }
+}
+
+/*
+ * Given a root znode, retrieve the associated .zfs directory.
+ * Add a hold to the vnode and return it.
+ */
+struct inode *
+zfsctl_root(znode_t *zp)
+{
+ ASSERT(zfs_has_ctldir(zp));
+ igrab(ZTOZSB(zp)->z_ctldir);
+ return (ZTOZSB(zp)->z_ctldir);
+}
+
+/*
+ * Generate a long fid to indicate a snapdir. We encode whether the snapdir
+ * is already mounted in the gen field. We do this because an nfsd lookup
+ * will not trigger the automount. The next time nfsd does fh_to_dentry, we
+ * will notice this, do the automount, and return ESTALE to force nfsd to
+ * revalidate and follow the mount.
+ */
+static int
+zfsctl_snapdir_fid(struct inode *ip, fid_t *fidp)
+{
+ zfid_short_t *zfid = (zfid_short_t *)fidp;
+ zfid_long_t *zlfid = (zfid_long_t *)fidp;
+ uint32_t gen = 0;
+ uint64_t object;
+ uint64_t objsetid;
+ int i;
+ struct dentry *dentry;
+
+ if (fidp->fid_len < LONG_FID_LEN) {
+ fidp->fid_len = LONG_FID_LEN;
+ return (SET_ERROR(ENOSPC));
+ }
+
+ object = ip->i_ino;
+ objsetid = ZFSCTL_INO_SNAPDIRS - ip->i_ino;
+ zfid->zf_len = LONG_FID_LEN;
+
+ dentry = d_obtain_alias(igrab(ip));
+ if (!IS_ERR(dentry)) {
+ gen = !!d_mountpoint(dentry);
+ dput(dentry);
+ }
+
+ for (i = 0; i < sizeof (zfid->zf_object); i++)
+ zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
+
+ for (i = 0; i < sizeof (zfid->zf_gen); i++)
+ zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));
+
+ for (i = 0; i < sizeof (zlfid->zf_setid); i++)
+ zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i));
+
+ for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
+ zlfid->zf_setgen[i] = 0;
+
+ return (0);
+}
+
+/*
+ * Generate an appropriate fid for an entry in the .zfs directory.
+ */
+int
+zfsctl_fid(struct inode *ip, fid_t *fidp)
+{
+ znode_t *zp = ITOZ(ip);
+ zfsvfs_t *zfsvfs = ITOZSB(ip);
+ uint64_t object = zp->z_id;
+ zfid_short_t *zfid;
+ int i;
+
+ ZFS_ENTER(zfsvfs);
+
+ if (zfsctl_is_snapdir(ip)) {
+ ZFS_EXIT(zfsvfs);
+ return (zfsctl_snapdir_fid(ip, fidp));
+ }
+
+ if (fidp->fid_len < SHORT_FID_LEN) {
+ fidp->fid_len = SHORT_FID_LEN;
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(ENOSPC));
+ }
+
+ zfid = (zfid_short_t *)fidp;
+
+ zfid->zf_len = SHORT_FID_LEN;
+
+ for (i = 0; i < sizeof (zfid->zf_object); i++)
+ zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
+
+ /* .zfs znodes always have a generation number of 0 */
+ for (i = 0; i < sizeof (zfid->zf_gen); i++)
+ zfid->zf_gen[i] = 0;
+
+ ZFS_EXIT(zfsvfs);
+ return (0);
+}
+
+/*
+ * Construct a full dataset name in full_name: "pool/dataset@snap_name"
+ */
+static int
+zfsctl_snapshot_name(zfsvfs_t *zfsvfs, const char *snap_name, int len,
+ char *full_name)
+{
+ objset_t *os = zfsvfs->z_os;
+
+ if (zfs_component_namecheck(snap_name, NULL, NULL) != 0)
+ return (SET_ERROR(EILSEQ));
+
+ dmu_objset_name(os, full_name);
+ if ((strlen(full_name) + 1 + strlen(snap_name)) >= len)
+ return (SET_ERROR(ENAMETOOLONG));
+
+ (void) strcat(full_name, "@");
+ (void) strcat(full_name, snap_name);
+
+ return (0);
+}
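+
+/*
+ * For example (names illustrative): if the objset is 'tank/fs' and snap_name
+ * is 'monday', full_name becomes 'tank/fs@monday'. ENAMETOOLONG is returned
+ * when the combined name would not fit in len bytes.
+ */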
+
+/*
+ * Returns the full path in full_path: "/pool/dataset/.zfs/snapshot/snap_name"
+ */
+static int
+zfsctl_snapshot_path_objset(zfsvfs_t *zfsvfs, uint64_t objsetid,
+ int path_len, char *full_path)
+{
+ objset_t *os = zfsvfs->z_os;
+ fstrans_cookie_t cookie;
+ char *snapname;
+ boolean_t case_conflict;
+ uint64_t id, pos = 0;
+ int error = 0;
+
+ if (zfsvfs->z_vfs->vfs_mntpoint == NULL)
+ return (SET_ERROR(ENOENT));
+
+ cookie = spl_fstrans_mark();
+ snapname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
+
+ while (error == 0) {
+ dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
+ error = dmu_snapshot_list_next(zfsvfs->z_os,
+ ZFS_MAX_DATASET_NAME_LEN, snapname, &id, &pos,
+ &case_conflict);
+ dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
+ if (error)
+ goto out;
+
+ if (id == objsetid)
+ break;
+ }
+
+ snprintf(full_path, path_len, "%s/.zfs/snapshot/%s",
+ zfsvfs->z_vfs->vfs_mntpoint, snapname);
+out:
+ kmem_free(snapname, ZFS_MAX_DATASET_NAME_LEN);
+ spl_fstrans_unmark(cookie);
+
+ return (error);
+}
+
+/*
+ * Lookup entry point for the '.zfs' control directory. The '..', 'snapshot',
+ * and 'shares' entries are handled as special cases.
+ */
+int
+zfsctl_root_lookup(struct inode *dip, char *name, struct inode **ipp,
+ int flags, cred_t *cr, int *direntflags, pathname_t *realpnp)
+{
+ zfsvfs_t *zfsvfs = ITOZSB(dip);
+ int error = 0;
+
+ ZFS_ENTER(zfsvfs);
+
+ if (strcmp(name, "..") == 0) {
+ *ipp = dip->i_sb->s_root->d_inode;
+ } else if (strcmp(name, ZFS_SNAPDIR_NAME) == 0) {
+ *ipp = zfsctl_inode_lookup(zfsvfs, ZFSCTL_INO_SNAPDIR,
+ &zpl_fops_snapdir, &zpl_ops_snapdir);
+ } else if (strcmp(name, ZFS_SHAREDIR_NAME) == 0) {
+ *ipp = zfsctl_inode_lookup(zfsvfs, ZFSCTL_INO_SHARES,
+ &zpl_fops_shares, &zpl_ops_shares);
+ } else {
+ *ipp = NULL;
+ }
+
+ if (*ipp == NULL)
+ error = SET_ERROR(ENOENT);
+
+ ZFS_EXIT(zfsvfs);
+
+ return (error);
+}
+
+/*
+ * Lookup entry point for the 'snapshot' directory. Try to open the
+ * snapshot if it exists, creating the pseudo filesystem inode as necessary.
+ */
+int
+zfsctl_snapdir_lookup(struct inode *dip, char *name, struct inode **ipp,
+ int flags, cred_t *cr, int *direntflags, pathname_t *realpnp)
+{
+ zfsvfs_t *zfsvfs = ITOZSB(dip);
+ uint64_t id;
+ int error;
+
+ ZFS_ENTER(zfsvfs);
+
+ error = dmu_snapshot_lookup(zfsvfs->z_os, name, &id);
+ if (error) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ *ipp = zfsctl_inode_lookup(zfsvfs, ZFSCTL_INO_SNAPDIRS - id,
+ &simple_dir_operations, &simple_dir_inode_operations);
+ if (*ipp == NULL)
+ error = SET_ERROR(ENOENT);
+
+ ZFS_EXIT(zfsvfs);
+
+ return (error);
+}
+
+/*
+ * Renaming a directory under '.zfs/snapshot' will automatically trigger
+ * a rename of the snapshot to the new given name. The rename is confined
+ * to the '.zfs/snapshot' directory; snapshots cannot be moved elsewhere.
+ */
+int
+zfsctl_snapdir_rename(struct inode *sdip, char *snm,
+ struct inode *tdip, char *tnm, cred_t *cr, int flags)
+{
+ zfsvfs_t *zfsvfs = ITOZSB(sdip);
+ char *to, *from, *real, *fsname;
+ int error;
+
+ if (!zfs_admin_snapshot)
+ return (SET_ERROR(EACCES));
+
+ ZFS_ENTER(zfsvfs);
+
+ to = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
+ from = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
+ real = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
+ fsname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
+
+ if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) {
+ error = dmu_snapshot_realname(zfsvfs->z_os, snm, real,
+ ZFS_MAX_DATASET_NAME_LEN, NULL);
+ if (error == 0) {
+ snm = real;
+ } else if (error != ENOTSUP) {
+ goto out;
+ }
+ }
+
+ dmu_objset_name(zfsvfs->z_os, fsname);
+
+ error = zfsctl_snapshot_name(ITOZSB(sdip), snm,
+ ZFS_MAX_DATASET_NAME_LEN, from);
+ if (error == 0)
+ error = zfsctl_snapshot_name(ITOZSB(tdip), tnm,
+ ZFS_MAX_DATASET_NAME_LEN, to);
+ if (error == 0)
+ error = zfs_secpolicy_rename_perms(from, to, cr);
+ if (error != 0)
+ goto out;
+
+ /*
+ * Cannot move snapshots out of the snapdir.
+ */
+ if (sdip != tdip) {
+ error = SET_ERROR(EINVAL);
+ goto out;
+ }
+
+ /*
+ * No-op when names are identical.
+ */
+ if (strcmp(snm, tnm) == 0) {
+ error = 0;
+ goto out;
+ }
+
+ rw_enter(&zfs_snapshot_lock, RW_WRITER);
+
+ error = dsl_dataset_rename_snapshot(fsname, snm, tnm, B_FALSE);
+ if (error == 0)
+ (void) zfsctl_snapshot_rename(snm, tnm);
+
+ rw_exit(&zfs_snapshot_lock);
+out:
+ kmem_free(from, ZFS_MAX_DATASET_NAME_LEN);
+ kmem_free(to, ZFS_MAX_DATASET_NAME_LEN);
+ kmem_free(real, ZFS_MAX_DATASET_NAME_LEN);
+ kmem_free(fsname, ZFS_MAX_DATASET_NAME_LEN);
+
+ ZFS_EXIT(zfsvfs);
+
+ return (error);
+}
+
+/*
+ * Removing a directory under '.zfs/snapshot' will automatically trigger
+ * the removal of the snapshot with the given name.
+ */
+int
+zfsctl_snapdir_remove(struct inode *dip, char *name, cred_t *cr, int flags)
+{
+ zfsvfs_t *zfsvfs = ITOZSB(dip);
+ char *snapname, *real;
+ int error;
+
+ if (!zfs_admin_snapshot)
+ return (SET_ERROR(EACCES));
+
+ ZFS_ENTER(zfsvfs);
+
+ snapname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
+ real = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
+
+ if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) {
+ error = dmu_snapshot_realname(zfsvfs->z_os, name, real,
+ ZFS_MAX_DATASET_NAME_LEN, NULL);
+ if (error == 0) {
+ name = real;
+ } else if (error != ENOTSUP) {
+ goto out;
+ }
+ }
+
+ error = zfsctl_snapshot_name(ITOZSB(dip), name,
+ ZFS_MAX_DATASET_NAME_LEN, snapname);
+ if (error == 0)
+ error = zfs_secpolicy_destroy_perms(snapname, cr);
+ if (error != 0)
+ goto out;
+
+ error = zfsctl_snapshot_unmount(snapname, MNT_FORCE);
+ if ((error == 0) || (error == ENOENT))
+ error = dsl_destroy_snapshot(snapname, B_FALSE);
+out:
+ kmem_free(snapname, ZFS_MAX_DATASET_NAME_LEN);
+ kmem_free(real, ZFS_MAX_DATASET_NAME_LEN);
+
+ ZFS_EXIT(zfsvfs);
+
+ return (error);
+}
+
+/*
+ * Creating a directory under '.zfs/snapshot' will automatically trigger
+ * the creation of a new snapshot with the given name.
+ */
+int
+zfsctl_snapdir_mkdir(struct inode *dip, char *dirname, vattr_t *vap,
+ struct inode **ipp, cred_t *cr, int flags)
+{
+ zfsvfs_t *zfsvfs = ITOZSB(dip);
+ char *dsname;
+ int error;
+
+ if (!zfs_admin_snapshot)
+ return (SET_ERROR(EACCES));
+
+ dsname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
+
+ if (zfs_component_namecheck(dirname, NULL, NULL) != 0) {
+ error = SET_ERROR(EILSEQ);
+ goto out;
+ }
+
+ dmu_objset_name(zfsvfs->z_os, dsname);
+
+ error = zfs_secpolicy_snapshot_perms(dsname, cr);
+ if (error != 0)
+ goto out;
+
+	error = dmu_objset_snapshot_one(dsname, dirname);
+	if (error != 0)
+		goto out;
+
+	error = zfsctl_snapdir_lookup(dip, dirname, ipp, 0, cr, NULL, NULL);
+out:
+ kmem_free(dsname, ZFS_MAX_DATASET_NAME_LEN);
+
+ return (error);
+}
+
+/*
+ * Attempt to unmount a snapshot by making a call to user space.
+ * There is no assurance that this can or will succeed; it is just a
+ * best effort. In the case where it does fail, perhaps because
+ * it's in use, the unmount will fail harmlessly.
+ */
+int
+zfsctl_snapshot_unmount(char *snapname, int flags)
+{
+ char *argv[] = { "/usr/bin/env", "umount", "-t", "zfs", "-n", NULL,
+ NULL };
+ char *envp[] = { NULL };
+ zfs_snapentry_t *se;
+ int error;
+
+ rw_enter(&zfs_snapshot_lock, RW_READER);
+ if ((se = zfsctl_snapshot_find_by_name(snapname)) == NULL) {
+ rw_exit(&zfs_snapshot_lock);
+ return (SET_ERROR(ENOENT));
+ }
+ rw_exit(&zfs_snapshot_lock);
+
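+	/*
+	 * The assembled command line is, e.g. (path illustrative):
+	 *
+	 *   /usr/bin/env umount -t zfs -n /tank/fs/.zfs/snapshot/monday
+	 *
+	 * with "-n" becoming "-fn" when MNT_FORCE is set.
+	 */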
+ if (flags & MNT_FORCE)
+ argv[4] = "-fn";
+ argv[5] = se->se_path;
+ dprintf("unmount; path=%s\n", se->se_path);
+ error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
+ zfsctl_snapshot_rele(se);
+
+	/*
+	 * The umount system utility will return 256 on error (exit code 1,
+	 * as encoded by call_usermodehelper()). We must assume this error
+	 * is because the file system is busy, so it is converted to the
+	 * more sensible EBUSY.
+	 */
+ if (error)
+ error = SET_ERROR(EBUSY);
+
+ return (error);
+}
+
+int
+zfsctl_snapshot_mount(struct path *path, int flags)
+{
+ struct dentry *dentry = path->dentry;
+ struct inode *ip = dentry->d_inode;
+ zfsvfs_t *zfsvfs;
+ zfsvfs_t *snap_zfsvfs;
+ zfs_snapentry_t *se;
+ char *full_name, *full_path;
+ char *argv[] = { "/usr/bin/env", "mount", "-t", "zfs", "-n", NULL, NULL,
+ NULL };
+ char *envp[] = { NULL };
+ int error;
+ struct path spath;
+
+ if (ip == NULL)
+ return (SET_ERROR(EISDIR));
+
+ zfsvfs = ITOZSB(ip);
+ ZFS_ENTER(zfsvfs);
+
+ full_name = kmem_zalloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
+ full_path = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
+
+ error = zfsctl_snapshot_name(zfsvfs, dname(dentry),
+ ZFS_MAX_DATASET_NAME_LEN, full_name);
+ if (error)
+ goto error;
+
+ /*
+ * Construct a mount point path from sb of the ctldir inode and dirent
+ * name, instead of from d_path(), so that chroot'd process doesn't fail
+ * on mount.zfs(8).
+ */
+ snprintf(full_path, MAXPATHLEN, "%s/.zfs/snapshot/%s",
+ zfsvfs->z_vfs->vfs_mntpoint, dname(dentry));
+
+ /*
+ * Multiple concurrent automounts of a snapshot are never allowed.
+ * The snapshot may be manually mounted as many times as desired.
+ */
+ if (zfsctl_snapshot_ismounted(full_name)) {
+ error = 0;
+ goto error;
+ }
+
+ /*
+ * Attempt to mount the snapshot from user space. Normally this
+ * would be done using the vfs_kern_mount() function, however that
+	 * function is marked GPL-only and cannot be used. On error we are
+	 * careful to log the real error to the console and return EISDIR
+ * to safely abort the automount. This should be very rare.
+ *
+ * If the user mode helper happens to return EBUSY, a concurrent
+ * mount is already in progress in which case the error is ignored.
+ * Take note that if the program was executed successfully the return
+	 * value from call_usermodehelper() will be ((exitcode << 8) + signal).
+ */
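+	/*
+	 * For example (illustrative): if the helper exits with the
+	 * MOUNT_BUSY code, call_usermodehelper() returns MOUNT_BUSY << 8,
+	 * which the (error & MOUNT_BUSY << 8) test below matches.
+	 */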
+ dprintf("mount; name=%s path=%s\n", full_name, full_path);
+ argv[5] = full_name;
+ argv[6] = full_path;
+ error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
+ if (error) {
+ if (!(error & MOUNT_BUSY << 8)) {
+ zfs_dbgmsg("Unable to automount %s error=%d",
+ full_path, error);
+ error = SET_ERROR(EISDIR);
+ } else {
+ /*
+			 * EBUSY, this could mean a concurrent mount, or the
+			 * snapshot has already been mounted at a completely
+			 * different place. We return 0 so the VFS will retry.
+			 * In the latter case the VFS will retry several times
+			 * and return ELOOP, which is probably not a very good
+			 * behavior.
+ */
+ error = 0;
+ }
+ goto error;
+ }
+
+ /*
+ * Follow down in to the mounted snapshot and set MNT_SHRINKABLE
+ * to identify this as an automounted filesystem.
+ */
+ spath = *path;
+ path_get(&spath);
+ if (zpl_follow_down_one(&spath)) {
+ snap_zfsvfs = ITOZSB(spath.dentry->d_inode);
+ snap_zfsvfs->z_parent = zfsvfs;
+ dentry = spath.dentry;
+ spath.mnt->mnt_flags |= MNT_SHRINKABLE;
+
+ rw_enter(&zfs_snapshot_lock, RW_WRITER);
+ se = zfsctl_snapshot_alloc(full_name, full_path,
+ snap_zfsvfs->z_os->os_spa, dmu_objset_id(snap_zfsvfs->z_os),
+ dentry);
+ zfsctl_snapshot_add(se);
+ zfsctl_snapshot_unmount_delay_impl(se, zfs_expire_snapshot);
+ rw_exit(&zfs_snapshot_lock);
+ }
+ path_put(&spath);
+error:
+ kmem_free(full_name, ZFS_MAX_DATASET_NAME_LEN);
+ kmem_free(full_path, MAXPATHLEN);
+
+ ZFS_EXIT(zfsvfs);
+
+ return (error);
+}
+
+/*
+ * Get the snapdir inode from a fid.
+ */
+int
+zfsctl_snapdir_vget(struct super_block *sb, uint64_t objsetid, int gen,
+ struct inode **ipp)
+{
+ int error;
+ struct path path;
+ char *mnt;
+ struct dentry *dentry;
+
+ mnt = kmem_alloc(MAXPATHLEN, KM_SLEEP);
+
+ error = zfsctl_snapshot_path_objset(sb->s_fs_info, objsetid,
+ MAXPATHLEN, mnt);
+ if (error)
+ goto out;
+
+ /* Trigger automount */
+ error = -kern_path(mnt, LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &path);
+ if (error)
+ goto out;
+
+ path_put(&path);
+ /*
+ * Get the snapdir inode. Note, we don't want to use the above
+ * path because it contains the root of the snapshot rather
+ * than the snapdir.
+ */
+ *ipp = ilookup(sb, ZFSCTL_INO_SNAPDIRS - objsetid);
+ if (*ipp == NULL) {
+ error = SET_ERROR(ENOENT);
+ goto out;
+ }
+
+ /* check gen, see zfsctl_snapdir_fid */
+ dentry = d_obtain_alias(igrab(*ipp));
+ if (gen != (!IS_ERR(dentry) && d_mountpoint(dentry))) {
+ iput(*ipp);
+ *ipp = NULL;
+ error = SET_ERROR(ENOENT);
+ }
+ if (!IS_ERR(dentry))
+ dput(dentry);
+out:
+ kmem_free(mnt, MAXPATHLEN);
+ return (error);
+}
+
+int
+zfsctl_shares_lookup(struct inode *dip, char *name, struct inode **ipp,
+ int flags, cred_t *cr, int *direntflags, pathname_t *realpnp)
+{
+ zfsvfs_t *zfsvfs = ITOZSB(dip);
+ struct inode *ip;
+ znode_t *dzp;
+ int error;
+
+ ZFS_ENTER(zfsvfs);
+
+ if (zfsvfs->z_shares_dir == 0) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(ENOTSUP));
+ }
+
+ if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp)) == 0) {
+ error = zfs_lookup(ZTOI(dzp), name, &ip, 0, cr, NULL, NULL);
+ iput(ZTOI(dzp));
+ }
+
+ ZFS_EXIT(zfsvfs);
+
+ return (error);
+}
+
+/*
+ * Initialize the various pieces we'll need to create and manipulate .zfs
+ * directories.
+ */
+void
+zfsctl_init(void)
+{
+ avl_create(&zfs_snapshots_by_name, snapentry_compare_by_name,
+ sizeof (zfs_snapentry_t), offsetof(zfs_snapentry_t,
+ se_node_name));
+ avl_create(&zfs_snapshots_by_objsetid, snapentry_compare_by_objsetid,
+ sizeof (zfs_snapentry_t), offsetof(zfs_snapentry_t,
+ se_node_objsetid));
+ rw_init(&zfs_snapshot_lock, NULL, RW_DEFAULT, NULL);
+}
+
+/*
+ * Clean up the various pieces we needed for .zfs directories. In particular
+ * ensure the expiry timer is canceled safely.
+ */
+void
+zfsctl_fini(void)
+{
+ avl_destroy(&zfs_snapshots_by_name);
+ avl_destroy(&zfs_snapshots_by_objsetid);
+ rw_destroy(&zfs_snapshot_lock);
+}
+
+module_param(zfs_admin_snapshot, int, 0644);
+MODULE_PARM_DESC(zfs_admin_snapshot, "Enable mkdir/rmdir/mv in .zfs/snapshot");
+
+module_param(zfs_expire_snapshot, int, 0644);
+MODULE_PARM_DESC(zfs_expire_snapshot, "Seconds to expire .zfs/snapshot");
diff --git a/module/os/linux/zfs/zfs_debug.c b/module/os/linux/zfs/zfs_debug.c
new file mode 100644
index 000000000..538533d27
--- /dev/null
+++ b/module/os/linux/zfs/zfs_debug.c
@@ -0,0 +1,253 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+
+typedef struct zfs_dbgmsg {
+ procfs_list_node_t zdm_node;
+ time_t zdm_timestamp;
+ int zdm_size;
+ char zdm_msg[1]; /* variable length allocation */
+} zfs_dbgmsg_t;
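+
+/*
+ * Note: zdm_msg[1] is an over-allocated trailing array. __zfs_dbgmsg()
+ * below allocates sizeof (zfs_dbgmsg_t) + strlen(msg) bytes, so the one
+ * declared byte of zdm_msg leaves exactly enough room for the message and
+ * its terminating NUL.
+ */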
+
+procfs_list_t zfs_dbgmsgs;
+int zfs_dbgmsg_size = 0;
+int zfs_dbgmsg_maxsize = 4<<20; /* 4MB */
+
+/*
+ * Internal ZFS debug messages are enabled by default.
+ *
+ * # Print debug messages
+ * cat /proc/spl/kstat/zfs/dbgmsg
+ *
+ * # Disable the kernel debug message log.
+ * echo 0 > /sys/module/zfs/parameters/zfs_dbgmsg_enable
+ *
+ * # Clear the kernel debug message log.
+ * echo 0 >/proc/spl/kstat/zfs/dbgmsg
+ */
+int zfs_dbgmsg_enable = 1;
+
+static int
+zfs_dbgmsg_show_header(struct seq_file *f)
+{
+ seq_printf(f, "%-12s %-8s\n", "timestamp", "message");
+ return (0);
+}
+
+static int
+zfs_dbgmsg_show(struct seq_file *f, void *p)
+{
+ zfs_dbgmsg_t *zdm = (zfs_dbgmsg_t *)p;
+ seq_printf(f, "%-12llu %-s\n",
+ (u_longlong_t)zdm->zdm_timestamp, zdm->zdm_msg);
+ return (0);
+}
+
+static void
+zfs_dbgmsg_purge(int max_size)
+{
+ while (zfs_dbgmsg_size > max_size) {
+ zfs_dbgmsg_t *zdm = list_remove_head(&zfs_dbgmsgs.pl_list);
+ if (zdm == NULL)
+ return;
+
+ int size = zdm->zdm_size;
+ kmem_free(zdm, size);
+ zfs_dbgmsg_size -= size;
+ }
+}
+
+static int
+zfs_dbgmsg_clear(procfs_list_t *procfs_list)
+{
+ mutex_enter(&zfs_dbgmsgs.pl_lock);
+ zfs_dbgmsg_purge(0);
+ mutex_exit(&zfs_dbgmsgs.pl_lock);
+ return (0);
+}
+
+void
+zfs_dbgmsg_init(void)
+{
+ procfs_list_install("zfs",
+ "dbgmsg",
+ 0600,
+ &zfs_dbgmsgs,
+ zfs_dbgmsg_show,
+ zfs_dbgmsg_show_header,
+ zfs_dbgmsg_clear,
+ offsetof(zfs_dbgmsg_t, zdm_node));
+}
+
+void
+zfs_dbgmsg_fini(void)
+{
+ procfs_list_uninstall(&zfs_dbgmsgs);
+ zfs_dbgmsg_purge(0);
+
+ /*
+ * TODO - decide how to make this permanent
+ */
+#ifdef _KERNEL
+ procfs_list_destroy(&zfs_dbgmsgs);
+#endif
+}
+
+void
+__set_error(const char *file, const char *func, int line, int err)
+{
+ /*
+ * To enable this:
+ *
+ * $ echo 512 >/sys/module/zfs/parameters/zfs_flags
+ */
+ if (zfs_flags & ZFS_DEBUG_SET_ERROR)
+		__dprintf(B_FALSE, file, func, line, "error %d", err);
+}
+
+void
+__zfs_dbgmsg(char *buf)
+{
+ int size = sizeof (zfs_dbgmsg_t) + strlen(buf);
+ zfs_dbgmsg_t *zdm = kmem_zalloc(size, KM_SLEEP);
+ zdm->zdm_size = size;
+ zdm->zdm_timestamp = gethrestime_sec();
+ strcpy(zdm->zdm_msg, buf);
+
+ mutex_enter(&zfs_dbgmsgs.pl_lock);
+ procfs_list_add(&zfs_dbgmsgs, zdm);
+ zfs_dbgmsg_size += size;
+ zfs_dbgmsg_purge(MAX(zfs_dbgmsg_maxsize, 0));
+ mutex_exit(&zfs_dbgmsgs.pl_lock);
+}
+
+#ifdef _KERNEL
+
+void
+__dprintf(boolean_t dprint, const char *file, const char *func,
+ int line, const char *fmt, ...)
+{
+ const char *newfile;
+ va_list adx;
+ size_t size;
+ char *buf;
+ char *nl;
+ int i;
+ char *prefix = (dprint) ? "dprintf: " : "";
+
+ size = 1024;
+ buf = kmem_alloc(size, KM_SLEEP);
+
+ /*
+ * Get rid of annoying prefix to filename.
+ */
+ newfile = strrchr(file, '/');
+ if (newfile != NULL) {
+ newfile = newfile + 1; /* Get rid of leading / */
+ } else {
+ newfile = file;
+ }
+
+ i = snprintf(buf, size, "%s%s:%d:%s(): ", prefix, newfile, line, func);
+
+ if (i < size) {
+ va_start(adx, fmt);
+ (void) vsnprintf(buf + i, size - i, fmt, adx);
+ va_end(adx);
+ }
+
+ /*
+ * Get rid of trailing newline for dprintf logs.
+ */
+ if (dprint && buf[0] != '\0') {
+ nl = &buf[strlen(buf) - 1];
+ if (*nl == '\n')
+ *nl = '\0';
+ }
+
+ /*
+ * To get this data enable the zfs__dprintf trace point as shown:
+ *
+ * # Enable zfs__dprintf tracepoint, clear the tracepoint ring buffer
+ * $ echo 1 > /sys/kernel/debug/tracing/events/zfs/enable
+ * $ echo 0 > /sys/kernel/debug/tracing/trace
+ *
+ * # Dump the ring buffer.
+ * $ cat /sys/kernel/debug/tracing/trace
+ */
+ DTRACE_PROBE1(zfs__dprintf, char *, buf);
+
+ /*
+ * To get this data:
+ *
+ * $ cat /proc/spl/kstat/zfs/dbgmsg
+ *
+ * To clear the buffer:
+ * $ echo 0 > /proc/spl/kstat/zfs/dbgmsg
+ */
+ __zfs_dbgmsg(buf);
+
+ kmem_free(buf, size);
+}
+
+#else
+
+void
+zfs_dbgmsg_print(const char *tag)
+{
+ ssize_t ret __attribute__((unused));
+
+ /*
+ * We use write() in this function instead of printf()
+ * so it is safe to call from a signal handler.
+ */
+ ret = write(STDOUT_FILENO, "ZFS_DBGMSG(", 11);
+ ret = write(STDOUT_FILENO, tag, strlen(tag));
+ ret = write(STDOUT_FILENO, ") START:\n", 9);
+
+ mutex_enter(&zfs_dbgmsgs.pl_lock);
+ for (zfs_dbgmsg_t *zdm = list_head(&zfs_dbgmsgs.pl_list); zdm != NULL;
+ zdm = list_next(&zfs_dbgmsgs.pl_list, zdm)) {
+ ret = write(STDOUT_FILENO, zdm->zdm_msg,
+ strlen(zdm->zdm_msg));
+ ret = write(STDOUT_FILENO, "\n", 1);
+ }
+
+ ret = write(STDOUT_FILENO, "ZFS_DBGMSG(", 11);
+ ret = write(STDOUT_FILENO, tag, strlen(tag));
+ ret = write(STDOUT_FILENO, ") END\n", 6);
+
+ mutex_exit(&zfs_dbgmsgs.pl_lock);
+}
+#endif /* _KERNEL */
+
+#ifdef _KERNEL
+module_param(zfs_dbgmsg_enable, int, 0644);
+MODULE_PARM_DESC(zfs_dbgmsg_enable, "Enable ZFS debug message log");
+
+module_param(zfs_dbgmsg_maxsize, int, 0644);
+MODULE_PARM_DESC(zfs_dbgmsg_maxsize, "Maximum ZFS debug log size");
+#endif
diff --git a/module/os/linux/zfs/zfs_dir.c b/module/os/linux/zfs/zfs_dir.c
new file mode 100644
index 000000000..6bdad737c
--- /dev/null
+++ b/module/os/linux/zfs/zfs_dir.c
@@ -0,0 +1,1205 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013, 2016 by Delphix. All rights reserved.
+ * Copyright 2017 Nexenta Systems, Inc.
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/time.h>
+#include <sys/sysmacros.h>
+#include <sys/vfs.h>
+#include <sys/vnode.h>
+#include <sys/file.h>
+#include <sys/mode.h>
+#include <sys/kmem.h>
+#include <sys/uio.h>
+#include <sys/pathname.h>
+#include <sys/cmn_err.h>
+#include <sys/errno.h>
+#include <sys/stat.h>
+#include <sys/sunddi.h>
+#include <sys/random.h>
+#include <sys/policy.h>
+#include <sys/zfs_dir.h>
+#include <sys/zfs_acl.h>
+#include <sys/zfs_vnops.h>
+#include <sys/fs/zfs.h>
+#include <sys/zap.h>
+#include <sys/dmu.h>
+#include <sys/atomic.h>
+#include <sys/zfs_ctldir.h>
+#include <sys/zfs_fuid.h>
+#include <sys/sa.h>
+#include <sys/zfs_sa.h>
+
+/*
+ * zfs_match_find() is used by zfs_dirent_lock() to perform zap lookups
+ * of names after deciding which is the appropriate lookup interface.
+ */
+static int
+zfs_match_find(zfsvfs_t *zfsvfs, znode_t *dzp, char *name, matchtype_t mt,
+ boolean_t update, int *deflags, pathname_t *rpnp, uint64_t *zoid)
+{
+ boolean_t conflict = B_FALSE;
+ int error;
+
+ if (zfsvfs->z_norm) {
+ size_t bufsz = 0;
+ char *buf = NULL;
+
+ if (rpnp) {
+ buf = rpnp->pn_buf;
+ bufsz = rpnp->pn_bufsize;
+ }
+
+ /*
+ * In the non-mixed case we only expect there would ever
+ * be one match, but we need to use the normalizing lookup.
+ */
+ error = zap_lookup_norm(zfsvfs->z_os, dzp->z_id, name, 8, 1,
+ zoid, mt, buf, bufsz, &conflict);
+ } else {
+ error = zap_lookup(zfsvfs->z_os, dzp->z_id, name, 8, 1, zoid);
+ }
+
+ /*
+ * Allow multiple entries provided the first entry is
+ * the object id. Non-zpl consumers may safely make
+ * use of the additional space.
+ *
+ * XXX: This should be a feature flag for compatibility
+ */
+ if (error == EOVERFLOW)
+ error = 0;
+
+ if (zfsvfs->z_norm && !error && deflags)
+ *deflags = conflict ? ED_CASE_CONFLICT : 0;
+
+ *zoid = ZFS_DIRENT_OBJ(*zoid);
+
+ return (error);
+}
+
+/*
+ * Lock a directory entry. A dirlock on <dzp, name> protects that name
+ * in dzp's directory zap object. As long as you hold a dirlock, you can
+ * assume two things: (1) dzp cannot be reaped, and (2) no other thread
+ * can change the zap entry for (i.e. link or unlink) this name.
+ *
+ * Input arguments:
+ * dzp - znode for directory
+ * name - name of entry to lock
+ * flag - ZNEW: if the entry already exists, fail with EEXIST.
+ * ZEXISTS: if the entry does not exist, fail with ENOENT.
+ * ZSHARED: allow concurrent access with other ZSHARED callers.
+ * ZXATTR: we want dzp's xattr directory
+ * ZCILOOK: On a mixed sensitivity file system,
+ * this lookup should be case-insensitive.
+ * ZCIEXACT: On a purely case-insensitive file system,
+ * this lookup should be case-sensitive.
+ * ZRENAMING: we are locking for renaming, force narrow locks
+ * ZHAVELOCK: Don't grab the z_name_lock for this call. The
+ * current thread already holds it.
+ *
+ * Output arguments:
+ * zpp - pointer to the znode for the entry (NULL if there isn't one)
+ * dlpp - pointer to the dirlock for this entry (NULL on error)
+ * direntflags - (case-insensitive lookup only)
+ * flags if multiple case-sensitive matches exist in directory
+ * realpnp - (case-insensitive lookup only)
+ * actual name matched within the directory
+ *
+ * Return value: 0 on success or errno on failure.
+ *
+ * NOTE: Always checks for, and rejects, '.' and '..'.
+ * NOTE: For case-insensitive file systems we take wide locks (see below),
+ * but return znode pointers to a single match.
+ */
+int
+zfs_dirent_lock(zfs_dirlock_t **dlpp, znode_t *dzp, char *name, znode_t **zpp,
+ int flag, int *direntflags, pathname_t *realpnp)
+{
+ zfsvfs_t *zfsvfs = ZTOZSB(dzp);
+ zfs_dirlock_t *dl;
+ boolean_t update;
+ matchtype_t mt = 0;
+ uint64_t zoid;
+ int error = 0;
+ int cmpflags;
+
+ *zpp = NULL;
+ *dlpp = NULL;
+
+ /*
+ * Verify that we are not trying to lock '.', '..', or '.zfs'
+ */
+ if ((name[0] == '.' &&
+ (name[1] == '\0' || (name[1] == '.' && name[2] == '\0'))) ||
+ (zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0))
+ return (SET_ERROR(EEXIST));
+
+ /*
+ * Case sensitivity and normalization preferences are set when
+ * the file system is created. These are stored in the
+ * zfsvfs->z_case and zfsvfs->z_norm fields. These choices
+ * affect what vnodes can be cached in the DNLC, how we
+ * perform zap lookups, and the "width" of our dirlocks.
+ *
+ * A normal dirlock locks a single name. Note that with
+ * normalization a name can be composed multiple ways, but
+ * when normalized, these names all compare equal. A wide
+ * dirlock locks multiple names. We need these when the file
+ * system is supporting mixed-mode access. It is sometimes
+ * necessary to lock all case permutations of file name at
+ * once so that simultaneous case-insensitive/case-sensitive
+ * behaves as rationally as possible.
+ */
+
+ /*
+ * When matching we may need to normalize & change case according to
+ * FS settings.
+ *
+ * Note that a normalized match is necessary for a case insensitive
+ * filesystem when the lookup request is not exact because normalization
+	 * can fold case independently of normalizing code point sequences.
+ *
+ * See the table above zfs_dropname().
+ */
+ if (zfsvfs->z_norm != 0) {
+ mt = MT_NORMALIZE;
+
+ /*
+ * Determine if the match needs to honor the case specified in
+ * lookup, and if so keep track of that so that during
+ * normalization we don't fold case.
+ */
+ if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE &&
+ (flag & ZCIEXACT)) ||
+ (zfsvfs->z_case == ZFS_CASE_MIXED && !(flag & ZCILOOK))) {
+ mt |= MT_MATCH_CASE;
+ }
+ }
+
+ /*
+ * Only look in or update the DNLC if we are looking for the
+ * name on a file system that does not require normalization
+ * or case folding. We can also look there if we happen to be
+ * on a non-normalizing, mixed sensitivity file system IF we
+ * are looking for the exact name.
+ *
+	 * Maybe we could add a TO-UPPERed version of the name to the DNLC
+	 * in the CI-only case as a performance improvement?
+ */
+ update = !zfsvfs->z_norm ||
+ (zfsvfs->z_case == ZFS_CASE_MIXED &&
+ !(zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER) && !(flag & ZCILOOK));
+
+ /*
+ * ZRENAMING indicates we are in a situation where we should
+ * take narrow locks regardless of the file system's
+ * preferences for normalizing and case folding. This will
+ * prevent us deadlocking trying to grab the same wide lock
+ * twice if the two names happen to be case-insensitive
+ * matches.
+ */
+ if (flag & ZRENAMING)
+ cmpflags = 0;
+ else
+ cmpflags = zfsvfs->z_norm;
+
+ /*
+ * Wait until there are no locks on this name.
+ *
+ * Don't grab the lock if it is already held. However, cannot
+ * have both ZSHARED and ZHAVELOCK together.
+ */
+ ASSERT(!(flag & ZSHARED) || !(flag & ZHAVELOCK));
+ if (!(flag & ZHAVELOCK))
+ rw_enter(&dzp->z_name_lock, RW_READER);
+
+ mutex_enter(&dzp->z_lock);
+ for (;;) {
+ if (dzp->z_unlinked && !(flag & ZXATTR)) {
+ mutex_exit(&dzp->z_lock);
+ if (!(flag & ZHAVELOCK))
+ rw_exit(&dzp->z_name_lock);
+ return (SET_ERROR(ENOENT));
+ }
+ for (dl = dzp->z_dirlocks; dl != NULL; dl = dl->dl_next) {
+ if ((u8_strcmp(name, dl->dl_name, 0, cmpflags,
+ U8_UNICODE_LATEST, &error) == 0) || error != 0)
+ break;
+ }
+ if (error != 0) {
+ mutex_exit(&dzp->z_lock);
+ if (!(flag & ZHAVELOCK))
+ rw_exit(&dzp->z_name_lock);
+ return (SET_ERROR(ENOENT));
+ }
+ if (dl == NULL) {
+ /*
+ * Allocate a new dirlock and add it to the list.
+ */
+ dl = kmem_alloc(sizeof (zfs_dirlock_t), KM_SLEEP);
+ cv_init(&dl->dl_cv, NULL, CV_DEFAULT, NULL);
+ dl->dl_name = name;
+ dl->dl_sharecnt = 0;
+ dl->dl_namelock = 0;
+ dl->dl_namesize = 0;
+ dl->dl_dzp = dzp;
+ dl->dl_next = dzp->z_dirlocks;
+ dzp->z_dirlocks = dl;
+ break;
+ }
+ if ((flag & ZSHARED) && dl->dl_sharecnt != 0)
+ break;
+ cv_wait(&dl->dl_cv, &dzp->z_lock);
+ }
+
+ /*
+	 * If the caller already held the z_name_lock (ZHAVELOCK), record
+	 * that so zfs_dirent_unlock() will not drop it.
+ */
+ if (flag & ZHAVELOCK)
+ dl->dl_namelock = 1;
+
+ if ((flag & ZSHARED) && ++dl->dl_sharecnt > 1 && dl->dl_namesize == 0) {
+ /*
+ * We're the second shared reference to dl. Make a copy of
+ * dl_name in case the first thread goes away before we do.
+ * Note that we initialize the new name before storing its
+ * pointer into dl_name, because the first thread may load
+ * dl->dl_name at any time. It'll either see the old value,
+ * which belongs to it, or the new shared copy; either is OK.
+ */
+ dl->dl_namesize = strlen(dl->dl_name) + 1;
+ name = kmem_alloc(dl->dl_namesize, KM_SLEEP);
+ bcopy(dl->dl_name, name, dl->dl_namesize);
+ dl->dl_name = name;
+ }
+
+ mutex_exit(&dzp->z_lock);
+
+ /*
+ * We have a dirlock on the name. (Note that it is the dirlock,
+ * not the dzp's z_lock, that protects the name in the zap object.)
+ * See if there's an object by this name; if so, put a hold on it.
+ */
+ if (flag & ZXATTR) {
+ error = sa_lookup(dzp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &zoid,
+ sizeof (zoid));
+ if (error == 0)
+ error = (zoid == 0 ? SET_ERROR(ENOENT) : 0);
+ } else {
+ error = zfs_match_find(zfsvfs, dzp, name, mt,
+ update, direntflags, realpnp, &zoid);
+ }
+ if (error) {
+ if (error != ENOENT || (flag & ZEXISTS)) {
+ zfs_dirent_unlock(dl);
+ return (error);
+ }
+ } else {
+ if (flag & ZNEW) {
+ zfs_dirent_unlock(dl);
+ return (SET_ERROR(EEXIST));
+ }
+ error = zfs_zget(zfsvfs, zoid, zpp);
+ if (error) {
+ zfs_dirent_unlock(dl);
+ return (error);
+ }
+ }
+
+ *dlpp = dl;
+
+ return (0);
+}
+
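+/*
+ * Example usage (illustrative sketch): a lookup-style caller holds the
+ * dirlock only long enough to resolve and hold the znode:
+ *
+ *	zfs_dirlock_t *dl;
+ *	znode_t *zp;
+ *
+ *	error = zfs_dirent_lock(&dl, dzp, name, &zp, ZEXISTS | ZSHARED,
+ *	    NULL, NULL);
+ *	if (error == 0) {
+ *		(use zp, which is held)
+ *		zfs_dirent_unlock(dl);
+ *		iput(ZTOI(zp));
+ *	}
+ *
+ * This is the ZEXISTS | ZSHARED pattern used by zfs_dirlook() below.
+ */
+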
+/*
+ * Unlock this directory entry and wake anyone who was waiting for it.
+ */
+void
+zfs_dirent_unlock(zfs_dirlock_t *dl)
+{
+ znode_t *dzp = dl->dl_dzp;
+ zfs_dirlock_t **prev_dl, *cur_dl;
+
+ mutex_enter(&dzp->z_lock);
+
+ if (!dl->dl_namelock)
+ rw_exit(&dzp->z_name_lock);
+
+ if (dl->dl_sharecnt > 1) {
+ dl->dl_sharecnt--;
+ mutex_exit(&dzp->z_lock);
+ return;
+ }
+ prev_dl = &dzp->z_dirlocks;
+ while ((cur_dl = *prev_dl) != dl)
+ prev_dl = &cur_dl->dl_next;
+ *prev_dl = dl->dl_next;
+ cv_broadcast(&dl->dl_cv);
+ mutex_exit(&dzp->z_lock);
+
+ if (dl->dl_namesize != 0)
+ kmem_free(dl->dl_name, dl->dl_namesize);
+ cv_destroy(&dl->dl_cv);
+ kmem_free(dl, sizeof (*dl));
+}
+
+/*
+ * Look up an entry in a directory.
+ *
+ * NOTE: '.' and '..' are handled as special cases because
+ * no directory entries are actually stored for them. If this is
+ * the root of a filesystem, then '.zfs' is also treated as a
+ * special pseudo-directory.
+ */
+int
+zfs_dirlook(znode_t *dzp, char *name, struct inode **ipp, int flags,
+ int *deflg, pathname_t *rpnp)
+{
+ zfs_dirlock_t *dl;
+ znode_t *zp;
+ int error = 0;
+ uint64_t parent;
+
+ if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) {
+ *ipp = ZTOI(dzp);
+ igrab(*ipp);
+ } else if (name[0] == '.' && name[1] == '.' && name[2] == 0) {
+ zfsvfs_t *zfsvfs = ZTOZSB(dzp);
+
+ /*
+ * If we are a snapshot mounted under .zfs, return
+ * the inode pointer for the snapshot directory.
+ */
+ if ((error = sa_lookup(dzp->z_sa_hdl,
+ SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0)
+ return (error);
+
+ if (parent == dzp->z_id && zfsvfs->z_parent != zfsvfs) {
+ error = zfsctl_root_lookup(zfsvfs->z_parent->z_ctldir,
+ "snapshot", ipp, 0, kcred, NULL, NULL);
+ return (error);
+ }
+ rw_enter(&dzp->z_parent_lock, RW_READER);
+ error = zfs_zget(zfsvfs, parent, &zp);
+ if (error == 0)
+ *ipp = ZTOI(zp);
+ rw_exit(&dzp->z_parent_lock);
+ } else if (zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0) {
+ *ipp = zfsctl_root(dzp);
+ } else {
+ int zf;
+
+ zf = ZEXISTS | ZSHARED;
+ if (flags & FIGNORECASE)
+ zf |= ZCILOOK;
+
+ error = zfs_dirent_lock(&dl, dzp, name, &zp, zf, deflg, rpnp);
+ if (error == 0) {
+ *ipp = ZTOI(zp);
+ zfs_dirent_unlock(dl);
+ dzp->z_zn_prefetch = B_TRUE; /* enable prefetching */
+ }
+ rpnp = NULL;
+ }
+
+ if ((flags & FIGNORECASE) && rpnp && !error)
+ (void) strlcpy(rpnp->pn_buf, name, rpnp->pn_bufsize);
+
+ return (error);
+}
+
+/*
+ * Unlinked Set (formerly known as the "delete queue") Error Handling
+ *
+ * When dealing with the unlinked set, we dmu_tx_hold_zap(), but we
+ * don't specify the name of the entry that we will be manipulating. We
+ * also fib and say that we won't be adding any new entries to the
+ * unlinked set, even though we might (this is to lower the minimum file
+ * size that can be deleted in a full filesystem). So on the small
+ * chance that the nlink list is using a fat zap (ie. has more than
+ * 2000 entries), we *may* not pre-read a block that's needed.
+ * Therefore it is remotely possible for some of the assertions
+ * regarding the unlinked set below to fail due to i/o error. On a
+ * nondebug system, this will result in the space being leaked.
+ */
+void
+zfs_unlinked_add(znode_t *zp, dmu_tx_t *tx)
+{
+ zfsvfs_t *zfsvfs = ZTOZSB(zp);
+
+ ASSERT(zp->z_unlinked);
+ ASSERT(ZTOI(zp)->i_nlink == 0);
+
+ VERIFY3U(0, ==,
+ zap_add_int(zfsvfs->z_os, zfsvfs->z_unlinkedobj, zp->z_id, tx));
+
+ dataset_kstats_update_nunlinks_kstat(&zfsvfs->z_kstat, 1);
+}
+
+/*
+ * Clean up any znodes that had no links when we either crashed or
+ * (force) umounted the file system.
+ */
+static void
+zfs_unlinked_drain_task(void *arg)
+{
+ zfsvfs_t *zfsvfs = arg;
+ zap_cursor_t zc;
+ zap_attribute_t zap;
+ dmu_object_info_t doi;
+ znode_t *zp;
+ int error;
+
+ ASSERT3B(zfsvfs->z_draining, ==, B_TRUE);
+
+ /*
+ * Iterate over the contents of the unlinked set.
+ */
+ for (zap_cursor_init(&zc, zfsvfs->z_os, zfsvfs->z_unlinkedobj);
+ zap_cursor_retrieve(&zc, &zap) == 0 && !zfsvfs->z_drain_cancel;
+ zap_cursor_advance(&zc)) {
+
+ /*
+		 * See what kind of object we have in the list.
+ */
+
+ error = dmu_object_info(zfsvfs->z_os,
+ zap.za_first_integer, &doi);
+ if (error != 0)
+ continue;
+
+ ASSERT((doi.doi_type == DMU_OT_PLAIN_FILE_CONTENTS) ||
+ (doi.doi_type == DMU_OT_DIRECTORY_CONTENTS));
+ /*
+ * We need to re-mark these list entries for deletion,
+ * so we pull them back into core and set zp->z_unlinked.
+ */
+ error = zfs_zget(zfsvfs, zap.za_first_integer, &zp);
+
+ /*
+ * We may pick up znodes that are already marked for deletion.
+ * This could happen during the purge of an extended attribute
+ * directory. All we need to do is skip over them, since they
+ * are already in the system marked z_unlinked.
+ */
+ if (error != 0)
+ continue;
+
+ zp->z_unlinked = B_TRUE;
+
+ /*
+ * iput() is Linux's equivalent to illumos' VN_RELE(). It will
+ * decrement the inode's ref count and may cause the inode to be
+ * synchronously freed. We interrupt freeing of this inode, by
+ * checking the return value of dmu_objset_zfs_unmounting() in
+ * dmu_free_long_range(), when an unmount is requested.
+ */
+ iput(ZTOI(zp));
+ ASSERT3B(zfsvfs->z_unmounted, ==, B_FALSE);
+ }
+ zap_cursor_fini(&zc);
+
+ zfsvfs->z_draining = B_FALSE;
+ zfsvfs->z_drain_task = TASKQID_INVALID;
+}
+
+/*
+ * Sets z_draining then tries to dispatch async unlinked drain.
+ * If that fails executes synchronous unlinked drain.
+ */
+void
+zfs_unlinked_drain(zfsvfs_t *zfsvfs)
+{
+ ASSERT3B(zfsvfs->z_unmounted, ==, B_FALSE);
+ ASSERT3B(zfsvfs->z_draining, ==, B_FALSE);
+
+ zfsvfs->z_draining = B_TRUE;
+ zfsvfs->z_drain_cancel = B_FALSE;
+
+ zfsvfs->z_drain_task = taskq_dispatch(
+ dsl_pool_unlinked_drain_taskq(dmu_objset_pool(zfsvfs->z_os)),
+ zfs_unlinked_drain_task, zfsvfs, TQ_SLEEP);
+ if (zfsvfs->z_drain_task == TASKQID_INVALID) {
+ zfs_dbgmsg("async zfs_unlinked_drain dispatch failed");
+ zfs_unlinked_drain_task(zfsvfs);
+ }
+}
+
+/*
+ * Wait for the unlinked drain taskq task to stop. This will interrupt the
+ * unlinked set processing if it is in progress.
+ */
+void
+zfs_unlinked_drain_stop_wait(zfsvfs_t *zfsvfs)
+{
+ ASSERT3B(zfsvfs->z_unmounted, ==, B_FALSE);
+
+ if (zfsvfs->z_draining) {
+ zfsvfs->z_drain_cancel = B_TRUE;
+ taskq_cancel_id(dsl_pool_unlinked_drain_taskq(
+ dmu_objset_pool(zfsvfs->z_os)), zfsvfs->z_drain_task);
+ zfsvfs->z_drain_task = TASKQID_INVALID;
+ zfsvfs->z_draining = B_FALSE;
+ }
+}
+
+/*
+ * Delete the entire contents of a directory. Return a count
+ * of the number of entries that could not be deleted. If we encounter
+ * an error, return a count of at least one so that the directory stays
+ * in the unlinked set.
+ *
+ * NOTE: this function assumes that the directory is inactive,
+ * so there is no need to lock its entries before deletion.
+ * Also, it assumes the directory contents are *only* regular
+ * files and symlinks.
+ */
+static int
+zfs_purgedir(znode_t *dzp)
+{
+ zap_cursor_t zc;
+ zap_attribute_t zap;
+ znode_t *xzp;
+ dmu_tx_t *tx;
+ zfsvfs_t *zfsvfs = ZTOZSB(dzp);
+ zfs_dirlock_t dl;
+ int skipped = 0;
+ int error;
+
+ for (zap_cursor_init(&zc, zfsvfs->z_os, dzp->z_id);
+ (error = zap_cursor_retrieve(&zc, &zap)) == 0;
+ zap_cursor_advance(&zc)) {
+ error = zfs_zget(zfsvfs,
+ ZFS_DIRENT_OBJ(zap.za_first_integer), &xzp);
+ if (error) {
+ skipped += 1;
+ continue;
+ }
+
+ ASSERT(S_ISREG(ZTOI(xzp)->i_mode) ||
+ S_ISLNK(ZTOI(xzp)->i_mode));
+
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
+ dmu_tx_hold_zap(tx, dzp->z_id, FALSE, zap.za_name);
+ dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
+ dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
+ /* Is this really needed ? */
+ zfs_sa_upgrade_txholds(tx, xzp);
+ dmu_tx_mark_netfree(tx);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ dmu_tx_abort(tx);
+ zfs_iput_async(ZTOI(xzp));
+ skipped += 1;
+ continue;
+ }
+ bzero(&dl, sizeof (dl));
+ dl.dl_dzp = dzp;
+ dl.dl_name = zap.za_name;
+
+ error = zfs_link_destroy(&dl, xzp, tx, 0, NULL);
+ if (error)
+ skipped += 1;
+ dmu_tx_commit(tx);
+
+ zfs_iput_async(ZTOI(xzp));
+ }
+ zap_cursor_fini(&zc);
+ if (error != ENOENT)
+ skipped += 1;
+ return (skipped);
+}
+
+void
+zfs_rmnode(znode_t *zp)
+{
+ zfsvfs_t *zfsvfs = ZTOZSB(zp);
+ objset_t *os = zfsvfs->z_os;
+ znode_t *xzp = NULL;
+ dmu_tx_t *tx;
+ uint64_t acl_obj;
+ uint64_t xattr_obj;
+ uint64_t links;
+ int error;
+
+ ASSERT(ZTOI(zp)->i_nlink == 0);
+ ASSERT(atomic_read(&ZTOI(zp)->i_count) == 0);
+
+ /*
+ * If this is an attribute directory, purge its contents.
+ */
+ if (S_ISDIR(ZTOI(zp)->i_mode) && (zp->z_pflags & ZFS_XATTR)) {
+ if (zfs_purgedir(zp) != 0) {
+ /*
+ * Not enough space to delete some xattrs.
+ * Leave it in the unlinked set.
+ */
+ zfs_znode_dmu_fini(zp);
+
+ return;
+ }
+ }
+
+ /*
+ * Free up all the data in the file. We don't do this for directories
+ * because we need truncate and remove to be in the same tx, like in
+ * zfs_znode_delete(). Otherwise, if we crash here we'll end up with
+ * an inconsistent truncated zap object in the delete queue. Note a
+ * truncated file is harmless since it only contains user data.
+ */
+ if (S_ISREG(ZTOI(zp)->i_mode)) {
+ error = dmu_free_long_range(os, zp->z_id, 0, DMU_OBJECT_END);
+ if (error) {
+ /*
+ * Not enough space or we were interrupted by unmount.
+ * Leave the file in the unlinked set.
+ */
+ zfs_znode_dmu_fini(zp);
+ return;
+ }
+ }
+
+ /*
+ * If the file has extended attributes, we're going to unlink
+ * the xattr dir.
+ */
+ error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
+ &xattr_obj, sizeof (xattr_obj));
+ if (error == 0 && xattr_obj) {
+ error = zfs_zget(zfsvfs, xattr_obj, &xzp);
+ ASSERT(error == 0);
+ }
+
+ acl_obj = zfs_external_acl(zp);
+
+ /*
+ * Set up the final transaction.
+ */
+ tx = dmu_tx_create(os);
+ dmu_tx_hold_free(tx, zp->z_id, 0, DMU_OBJECT_END);
+ dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
+ if (xzp) {
+ dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, TRUE, NULL);
+ dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
+ }
+ if (acl_obj)
+ dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
+
+ zfs_sa_upgrade_txholds(tx, zp);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ /*
+ * Not enough space to delete the file. Leave it in the
+ * unlinked set, leaking it until the fs is remounted (at
+ * which point we'll call zfs_unlinked_drain() to process it).
+ */
+ dmu_tx_abort(tx);
+ zfs_znode_dmu_fini(zp);
+ goto out;
+ }
+
+ if (xzp) {
+ ASSERT(error == 0);
+ mutex_enter(&xzp->z_lock);
+ xzp->z_unlinked = B_TRUE; /* mark xzp for deletion */
+ clear_nlink(ZTOI(xzp)); /* no more links to it */
+ links = 0;
+ VERIFY(0 == sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs),
+ &links, sizeof (links), tx));
+ mutex_exit(&xzp->z_lock);
+ zfs_unlinked_add(xzp, tx);
+ }
+
+ /* Remove this znode from the unlinked set */
+ VERIFY3U(0, ==,
+ zap_remove_int(zfsvfs->z_os, zfsvfs->z_unlinkedobj, zp->z_id, tx));
+
+ dataset_kstats_update_nunlinked_kstat(&zfsvfs->z_kstat, 1);
+
+ zfs_znode_delete(zp, tx);
+
+ dmu_tx_commit(tx);
+out:
+ if (xzp)
+ zfs_iput_async(ZTOI(xzp));
+}
+
+static uint64_t
+zfs_dirent(znode_t *zp, uint64_t mode)
+{
+ uint64_t de = zp->z_id;
+
+ if (ZTOZSB(zp)->z_version >= ZPL_VERSION_DIRENT_TYPE)
+ de |= IFTODT(mode) << 60;
+ return (de);
+}
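+
+/*
+ * For example (illustrative): a regular file with object id 0x1234 on a
+ * ZPL_VERSION_DIRENT_TYPE filesystem yields the directory entry value
+ * ((uint64_t)DT_REG << 60) | 0x1234, since IFTODT(S_IFREG) == DT_REG (8);
+ * ZFS_DIRENT_OBJ() later masks off the type bits to recover the object id.
+ */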
+
+/*
+ * Link zp into dl. Can fail in the following cases:
+ * - if zp has been unlinked.
+ * - if the number of entries with the same hash (aka. colliding entries)
+ *   exceeds the capacity of a leaf-block of fatzap and splitting of the
+ *   leaf-block does not help.
+ */
+int
+zfs_link_create(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag)
+{
+ znode_t *dzp = dl->dl_dzp;
+ zfsvfs_t *zfsvfs = ZTOZSB(zp);
+ uint64_t value;
+ int zp_is_dir = S_ISDIR(ZTOI(zp)->i_mode);
+ sa_bulk_attr_t bulk[5];
+ uint64_t mtime[2], ctime[2];
+ uint64_t links;
+ int count = 0;
+ int error;
+
+ mutex_enter(&zp->z_lock);
+
+ if (!(flag & ZRENAMING)) {
+ if (zp->z_unlinked) { /* no new links to unlinked zp */
+ ASSERT(!(flag & (ZNEW | ZEXISTS)));
+ mutex_exit(&zp->z_lock);
+ return (SET_ERROR(ENOENT));
+ }
+ if (!(flag & ZNEW)) {
+ /*
+ * ZNEW nodes come from zfs_mknode() where the link
+ * count has already been initialised
+ */
+ inc_nlink(ZTOI(zp));
+ links = ZTOI(zp)->i_nlink;
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs),
+ NULL, &links, sizeof (links));
+ }
+ }
+
+ value = zfs_dirent(zp, zp->z_mode);
+ error = zap_add(ZTOZSB(zp)->z_os, dzp->z_id, dl->dl_name, 8, 1,
+ &value, tx);
+
+ /*
+ * zap_add could fail to add the entry if it exceeds the capacity of the
+ * leaf-block and zap_leaf_split() failed to help.
+	 * The caller of this routine is responsible for failing the
+	 * transaction, which will roll back the SA updates done above.
+ */
+ if (error != 0) {
+ if (!(flag & ZRENAMING) && !(flag & ZNEW))
+ drop_nlink(ZTOI(zp));
+ mutex_exit(&zp->z_lock);
+ return (error);
+ }
+
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL,
+ &dzp->z_id, sizeof (dzp->z_id));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+ &zp->z_pflags, sizeof (zp->z_pflags));
+
+ if (!(flag & ZNEW)) {
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
+ ctime, sizeof (ctime));
+ zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime,
+ ctime);
+ }
+ error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
+ ASSERT(error == 0);
+
+ mutex_exit(&zp->z_lock);
+
+ mutex_enter(&dzp->z_lock);
+ dzp->z_size++;
+ if (zp_is_dir)
+ inc_nlink(ZTOI(dzp));
+ links = ZTOI(dzp)->i_nlink;
+ count = 0;
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
+ &dzp->z_size, sizeof (dzp->z_size));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL,
+ &links, sizeof (links));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
+ mtime, sizeof (mtime));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
+ ctime, sizeof (ctime));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+ &dzp->z_pflags, sizeof (dzp->z_pflags));
+ zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime);
+ error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx);
+ ASSERT(error == 0);
+ mutex_exit(&dzp->z_lock);
+
+ return (0);
+}
+
+/*
+ * The match type in the code for this function should conform to:
+ *
+ * ------------------------------------------------------------------------
+ * fs type | z_norm | lookup type | match type
+ * ---------|-------------|-------------|----------------------------------
+ * CS !norm | 0 | 0 | 0 (exact)
+ * CS norm | formX | 0 | MT_NORMALIZE
+ * CI !norm | upper | !ZCIEXACT | MT_NORMALIZE
+ * CI !norm | upper | ZCIEXACT | MT_NORMALIZE | MT_MATCH_CASE
+ * CI norm | upper|formX | !ZCIEXACT | MT_NORMALIZE
+ * CI norm | upper|formX | ZCIEXACT | MT_NORMALIZE | MT_MATCH_CASE
+ * CM !norm | upper | !ZCILOOK | MT_NORMALIZE | MT_MATCH_CASE
+ * CM !norm | upper | ZCILOOK | MT_NORMALIZE
+ * CM norm | upper|formX | !ZCILOOK | MT_NORMALIZE | MT_MATCH_CASE
+ * CM norm | upper|formX | ZCILOOK | MT_NORMALIZE
+ *
+ * Abbreviations:
+ * CS = Case Sensitive, CI = Case Insensitive, CM = Case Mixed
+ * upper = case folding set by fs type on creation (U8_TEXTPREP_TOUPPER)
+ * formX = unicode normalization form set on fs creation
+ */
+static int
+zfs_dropname(zfs_dirlock_t *dl, znode_t *zp, znode_t *dzp, dmu_tx_t *tx,
+ int flag)
+{
+ int error;
+
+ if (ZTOZSB(zp)->z_norm) {
+ matchtype_t mt = MT_NORMALIZE;
+
+ if ((ZTOZSB(zp)->z_case == ZFS_CASE_INSENSITIVE &&
+ (flag & ZCIEXACT)) ||
+ (ZTOZSB(zp)->z_case == ZFS_CASE_MIXED &&
+ !(flag & ZCILOOK))) {
+ mt |= MT_MATCH_CASE;
+ }
+
+ error = zap_remove_norm(ZTOZSB(zp)->z_os, dzp->z_id,
+ dl->dl_name, mt, tx);
+ } else {
+ error = zap_remove(ZTOZSB(zp)->z_os, dzp->z_id, dl->dl_name,
+ tx);
+ }
+
+ return (error);
+}
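+
+/*
+ * Worked example (sketch, not part of the original change), following the
+ * match-type table above: on a case-insensitive filesystem
+ * (z_case == ZFS_CASE_INSENSITIVE, z_norm set), a removal without ZCIEXACT
+ * uses mt == MT_NORMALIZE, while passing ZCIEXACT yields
+ * mt == (MT_NORMALIZE | MT_MATCH_CASE) so only the exact-case name is
+ * dropped.
+ */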
+
+/*
+ * Unlink zp from dl, and mark zp for deletion if this was the last link. Can
+ * fail if zp is a mount point (EBUSY) or a non-empty directory (ENOTEMPTY).
+ * If 'unlinkedp' is NULL, we put unlinked znodes on the unlinked list.
+ * If it's non-NULL, we use it to indicate whether the znode needs deletion,
+ * and it's the caller's job to do it.
+ */
+int
+zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag,
+ boolean_t *unlinkedp)
+{
+ znode_t *dzp = dl->dl_dzp;
+ zfsvfs_t *zfsvfs = ZTOZSB(dzp);
+ int zp_is_dir = S_ISDIR(ZTOI(zp)->i_mode);
+ boolean_t unlinked = B_FALSE;
+ sa_bulk_attr_t bulk[5];
+ uint64_t mtime[2], ctime[2];
+ uint64_t links;
+ int count = 0;
+ int error;
+
+ if (!(flag & ZRENAMING)) {
+ mutex_enter(&zp->z_lock);
+
+ if (zp_is_dir && !zfs_dirempty(zp)) {
+ mutex_exit(&zp->z_lock);
+ return (SET_ERROR(ENOTEMPTY));
+ }
+
+ /*
+ * If we get here, we are going to try to remove the object.
+ * First try removing the name from the directory; if that
+ * fails, return the error.
+ */
+ error = zfs_dropname(dl, zp, dzp, tx, flag);
+ if (error != 0) {
+ mutex_exit(&zp->z_lock);
+ return (error);
+ }
+
+ if (ZTOI(zp)->i_nlink <= zp_is_dir) {
+ zfs_panic_recover("zfs: link count on %lu is %u, "
+ "should be at least %u", zp->z_id,
+ (int)ZTOI(zp)->i_nlink, zp_is_dir + 1);
+ set_nlink(ZTOI(zp), zp_is_dir + 1);
+ }
+ drop_nlink(ZTOI(zp));
+ if (ZTOI(zp)->i_nlink == zp_is_dir) {
+ zp->z_unlinked = B_TRUE;
+ clear_nlink(ZTOI(zp));
+ unlinked = B_TRUE;
+ } else {
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs),
+ NULL, &ctime, sizeof (ctime));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
+ NULL, &zp->z_pflags, sizeof (zp->z_pflags));
+ zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime,
+ ctime);
+ }
+ links = ZTOI(zp)->i_nlink;
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs),
+ NULL, &links, sizeof (links));
+ error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
+ count = 0;
+ ASSERT(error == 0);
+ mutex_exit(&zp->z_lock);
+ } else {
+ error = zfs_dropname(dl, zp, dzp, tx, flag);
+ if (error != 0)
+ return (error);
+ }
+
+ mutex_enter(&dzp->z_lock);
+ dzp->z_size--; /* one dirent removed */
+ if (zp_is_dir)
+ drop_nlink(ZTOI(dzp)); /* ".." link from zp */
+ links = ZTOI(dzp)->i_nlink;
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs),
+ NULL, &links, sizeof (links));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs),
+ NULL, &dzp->z_size, sizeof (dzp->z_size));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs),
+ NULL, ctime, sizeof (ctime));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs),
+ NULL, mtime, sizeof (mtime));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
+ NULL, &dzp->z_pflags, sizeof (dzp->z_pflags));
+ zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime);
+ error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx);
+ ASSERT(error == 0);
+ mutex_exit(&dzp->z_lock);
+
+ if (unlinkedp != NULL)
+ *unlinkedp = unlinked;
+ else if (unlinked)
+ zfs_unlinked_add(zp, tx);
+
+ return (0);
+}
+
+/*
+ * Indicate whether the directory is empty. Works with or without z_lock
+ * held, but can only be considered a hint in the latter case. Returns true
+ * if only "." and ".." remain and there's no work in progress.
+ *
+ * The internal ZAP size, rather than zp->z_size, needs to be checked since
+ * some consumers (Lustre) do not strictly maintain an accurate SA_ZPL_SIZE.
+ */
+boolean_t
+zfs_dirempty(znode_t *dzp)
+{
+ zfsvfs_t *zfsvfs = ZTOZSB(dzp);
+ uint64_t count;
+ int error;
+
+ if (dzp->z_dirlocks != NULL)
+ return (B_FALSE);
+
+ error = zap_count(zfsvfs->z_os, dzp->z_id, &count);
+ if (error != 0 || count != 0)
+ return (B_FALSE);
+
+ return (B_TRUE);
+}
+
+int
+zfs_make_xattrdir(znode_t *zp, vattr_t *vap, struct inode **xipp, cred_t *cr)
+{
+ zfsvfs_t *zfsvfs = ZTOZSB(zp);
+ znode_t *xzp;
+ dmu_tx_t *tx;
+ int error;
+ zfs_acl_ids_t acl_ids;
+ boolean_t fuid_dirtied;
+#ifdef DEBUG
+ uint64_t parent;
+#endif
+
+ *xipp = NULL;
+
+ if ((error = zfs_zaccess(zp, ACE_WRITE_NAMED_ATTRS, 0, B_FALSE, cr)))
+ return (error);
+
+ if ((error = zfs_acl_ids_create(zp, IS_XATTR, vap, cr, NULL,
+ &acl_ids)) != 0)
+ return (error);
+ if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, zp->z_projid)) {
+ zfs_acl_ids_free(&acl_ids);
+ return (SET_ERROR(EDQUOT));
+ }
+
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
+ ZFS_SA_BASE_ATTR_SIZE);
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
+ dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
+ fuid_dirtied = zfsvfs->z_fuid_dirty;
+ if (fuid_dirtied)
+ zfs_fuid_txhold(zfsvfs, tx);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ zfs_acl_ids_free(&acl_ids);
+ dmu_tx_abort(tx);
+ return (error);
+ }
+ zfs_mknode(zp, vap, tx, cr, IS_XATTR, &xzp, &acl_ids);
+
+ if (fuid_dirtied)
+ zfs_fuid_sync(zfsvfs, tx);
+
+#ifdef DEBUG
+ error = sa_lookup(xzp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
+ &parent, sizeof (parent));
+ ASSERT(error == 0 && parent == zp->z_id);
+#endif
+
+ VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xzp->z_id,
+ sizeof (xzp->z_id), tx));
+
+ if (!zp->z_unlinked)
+ (void) zfs_log_create(zfsvfs->z_log, tx, TX_MKXATTR, zp,
+ xzp, "", NULL, acl_ids.z_fuidp, vap);
+
+ zfs_acl_ids_free(&acl_ids);
+ dmu_tx_commit(tx);
+
+ *xipp = ZTOI(xzp);
+
+ return (0);
+}
+
+/*
+ * Return a znode for the extended attribute directory for zp.
+ * ** If the directory does not already exist, it is created **
+ *
+ * IN: zp - znode to obtain attribute directory from
+ * cr - credentials of caller
+ * flags - flags from the VOP_LOOKUP call
+ *
+ * OUT: xipp - pointer to extended attribute znode
+ *
+ * RETURN: 0 on success
+ * error number on failure
+ */
+int
+zfs_get_xattrdir(znode_t *zp, struct inode **xipp, cred_t *cr, int flags)
+{
+ zfsvfs_t *zfsvfs = ZTOZSB(zp);
+ znode_t *xzp;
+ zfs_dirlock_t *dl;
+ vattr_t va;
+ int error;
+top:
+ error = zfs_dirent_lock(&dl, zp, "", &xzp, ZXATTR, NULL, NULL);
+ if (error)
+ return (error);
+
+ if (xzp != NULL) {
+ *xipp = ZTOI(xzp);
+ zfs_dirent_unlock(dl);
+ return (0);
+ }
+
+ if (!(flags & CREATE_XATTR_DIR)) {
+ zfs_dirent_unlock(dl);
+ return (SET_ERROR(ENOENT));
+ }
+
+ if (zfs_is_readonly(zfsvfs)) {
+ zfs_dirent_unlock(dl);
+ return (SET_ERROR(EROFS));
+ }
+
+ /*
+ * The ability to 'create' files in an attribute
+ * directory comes from the write_xattr permission on the base file.
+ *
+ * The ability to 'search' an attribute directory requires
+ * read_xattr permission on the base file.
+ *
+ * Once in a directory the ability to read/write attributes
+ * is controlled by the permissions on the attribute file.
+ */
+ va.va_mask = ATTR_MODE | ATTR_UID | ATTR_GID;
+ va.va_mode = S_IFDIR | S_ISVTX | 0777;
+ zfs_fuid_map_ids(zp, cr, &va.va_uid, &va.va_gid);
+
+ va.va_dentry = NULL;
+ error = zfs_make_xattrdir(zp, &va, xipp, cr);
+ zfs_dirent_unlock(dl);
+
+ if (error == ERESTART) {
+ /* NB: we already did dmu_tx_wait() if necessary */
+ goto top;
+ }
+
+ return (error);
+}
+
+/*
+ * Decide whether it is okay to remove within a sticky directory.
+ *
+ * In sticky directories, write access is not sufficient;
+ * you can remove entries from a directory only if:
+ *
+ * you own the directory,
+ * you own the entry,
+ * you have write access to the entry,
+ * or you are privileged (checked in secpolicy...).
+ *
+ * The function returns 0 if remove access is granted.
+ */
+int
+zfs_sticky_remove_access(znode_t *zdp, znode_t *zp, cred_t *cr)
+{
+ uid_t uid;
+ uid_t downer;
+ uid_t fowner;
+ zfsvfs_t *zfsvfs = ZTOZSB(zdp);
+
+ if (zfsvfs->z_replay)
+ return (0);
+
+ if ((zdp->z_mode & S_ISVTX) == 0)
+ return (0);
+
+ downer = zfs_fuid_map_id(zfsvfs, KUID_TO_SUID(ZTOI(zdp)->i_uid),
+ cr, ZFS_OWNER);
+ fowner = zfs_fuid_map_id(zfsvfs, KUID_TO_SUID(ZTOI(zp)->i_uid),
+ cr, ZFS_OWNER);
+
+ if ((uid = crgetuid(cr)) == downer || uid == fowner ||
+ zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr) == 0)
+ return (0);
+ else
+ return (secpolicy_vnode_remove(cr));
+}
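+
+/*
+ * Example (sketch, not part of the original change): in a mode-01777
+ * directory such as /tmp, a caller who owns neither the directory nor the
+ * entry and lacks write access to the entry falls through to
+ * secpolicy_vnode_remove(), which typically denies the removal; the
+ * directory owner or entry owner is granted removal with a 0 return.
+ */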
diff --git a/module/os/linux/zfs/zfs_sysfs.c b/module/os/linux/zfs/zfs_sysfs.c
new file mode 100644
index 000000000..bb7f3b69a
--- /dev/null
+++ b/module/os/linux/zfs/zfs_sysfs.c
@@ -0,0 +1,661 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2018, 2019 by Delphix. All rights reserved.
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/zfeature.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zfs_sysfs.h>
+#include <sys/kmem.h>
+#include <sys/fs/zfs.h>
+#include <linux/kobject.h>
+
+#include "zfs_prop.h"
+
+#if !defined(_KERNEL)
+#error kernel builds only
+#endif
+
+/*
+ * ZFS Module sysfs support
+ *
+ * This extends our sysfs '/sys/module/zfs' entry to include feature
+ * and property attributes. The primary consumers of this information
+ * are user processes, like the zfs CLI, that need to know what the
+ * currently loaded ZFS module supports. The libzfs binary will consult
+ * this information when instantiating the zfs|zpool property tables
+ * and the pool features table.
+ *
+ * The added top-level directories are:
+ * /sys/module/zfs
+ * ├── features.kernel
+ * ├── features.pool
+ * ├── properties.dataset
+ * └── properties.pool
+ *
+ * The local interface for the zfs kobjects includes:
+ * zfs_kobj_init()
+ * zfs_kobj_add()
+ * zfs_kobj_release()
+ * zfs_kobj_add_attr()
+ * zfs_kobj_fini()
+ */
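+
+/*
+ * Typical lifecycle of these helpers (sketch, not part of the original
+ * change; error handling elided, my_show_func and parent are assumed to
+ * exist):
+ *
+ *	zfs_mod_kobj_t zkobj;
+ *	(void) zfs_kobj_init(&zkobj, 1, 0, my_show_func);
+ *	zfs_kobj_add_attr(&zkobj, 0, "supported");
+ *	(void) zfs_kobj_add(&zkobj, parent, "example");
+ *	zfs_kobj_fini(&zkobj);
+ */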
+
+/*
+ * A zfs_mod_kobj_t represents a zfs kobject under '/sys/module/zfs'
+ */
+struct zfs_mod_kobj;
+typedef struct zfs_mod_kobj zfs_mod_kobj_t;
+
+struct zfs_mod_kobj {
+ struct kobject zko_kobj;
+ struct kobj_type zko_kobj_type;
+ struct sysfs_ops zko_sysfs_ops;
+ size_t zko_attr_count;
+ struct attribute *zko_attr_list; /* allocated */
+ struct attribute **zko_default_attrs; /* allocated */
+ size_t zko_child_count;
+ zfs_mod_kobj_t *zko_children; /* allocated */
+};
+
+#define ATTR_TABLE_SIZE(cnt) (sizeof (struct attribute) * (cnt))
+/* Note +1 for NULL terminator slot */
+#define DEFAULT_ATTR_SIZE(cnt) (sizeof (struct attribute *) * (cnt + 1))
+#define CHILD_TABLE_SIZE(cnt) (sizeof (zfs_mod_kobj_t) * (cnt))
+
+/*
+ * These are the top-level kobjects under '/sys/module/zfs/'
+ */
+static zfs_mod_kobj_t kernel_features_kobj;
+static zfs_mod_kobj_t pool_features_kobj;
+static zfs_mod_kobj_t dataset_props_kobj;
+static zfs_mod_kobj_t pool_props_kobj;
+
+/*
+ * The show function is used to format the content
+ * of an attribute into a PAGE_SIZE buffer.
+ */
+typedef ssize_t (*sysfs_show_func)(struct kobject *, struct attribute *,
+ char *);
+
+static void
+zfs_kobj_fini(zfs_mod_kobj_t *zkobj)
+{
+ /* finalize any child kobjects */
+ if (zkobj->zko_child_count != 0) {
+ ASSERT(zkobj->zko_children);
+ for (int i = 0; i < zkobj->zko_child_count; i++)
+ zfs_kobj_fini(&zkobj->zko_children[i]);
+ }
+
+ /* kobject_put() will call zfs_kobj_release() to release memory */
+ kobject_del(&zkobj->zko_kobj);
+ kobject_put(&zkobj->zko_kobj);
+}
+
+static void
+zfs_kobj_release(struct kobject *kobj)
+{
+ zfs_mod_kobj_t *zkobj = container_of(kobj, zfs_mod_kobj_t, zko_kobj);
+
+ if (zkobj->zko_attr_list != NULL) {
+ ASSERT3S(zkobj->zko_attr_count, !=, 0);
+ kmem_free(zkobj->zko_attr_list,
+ ATTR_TABLE_SIZE(zkobj->zko_attr_count));
+ zkobj->zko_attr_list = NULL;
+ }
+
+ if (zkobj->zko_default_attrs != NULL) {
+ kmem_free(zkobj->zko_default_attrs,
+ DEFAULT_ATTR_SIZE(zkobj->zko_attr_count));
+ zkobj->zko_default_attrs = NULL;
+ }
+
+ if (zkobj->zko_child_count != 0) {
+ ASSERT(zkobj->zko_children);
+
+ kmem_free(zkobj->zko_children,
+ CHILD_TABLE_SIZE(zkobj->zko_child_count));
+ zkobj->zko_child_count = 0;
+ zkobj->zko_children = NULL;
+ }
+
+ zkobj->zko_attr_count = 0;
+}
+
+#ifndef sysfs_attr_init
+#define sysfs_attr_init(attr) do {} while (0)
+#endif
+
+static void
+zfs_kobj_add_attr(zfs_mod_kobj_t *zkobj, int attr_num, const char *attr_name)
+{
+ VERIFY3U(attr_num, <, zkobj->zko_attr_count);
+ ASSERT(zkobj->zko_attr_list);
+ ASSERT(zkobj->zko_default_attrs);
+
+ zkobj->zko_attr_list[attr_num].name = attr_name;
+ zkobj->zko_attr_list[attr_num].mode = 0444;
+ zkobj->zko_default_attrs[attr_num] = &zkobj->zko_attr_list[attr_num];
+ sysfs_attr_init(&zkobj->zko_attr_list[attr_num]);
+}
+
+static int
+zfs_kobj_init(zfs_mod_kobj_t *zkobj, int attr_cnt, int child_cnt,
+ sysfs_show_func show_func)
+{
+ /*
+ * Initialize object's attributes. Count can be zero.
+ */
+ if (attr_cnt > 0) {
+ zkobj->zko_attr_list = kmem_zalloc(ATTR_TABLE_SIZE(attr_cnt),
+ KM_SLEEP);
+ if (zkobj->zko_attr_list == NULL)
+ return (ENOMEM);
+ }
+ /* this will always have at least one slot for NULL termination */
+ zkobj->zko_default_attrs = kmem_zalloc(DEFAULT_ATTR_SIZE(attr_cnt),
+ KM_SLEEP);
+ if (zkobj->zko_default_attrs == NULL) {
+ if (zkobj->zko_attr_list != NULL) {
+ kmem_free(zkobj->zko_attr_list,
+ ATTR_TABLE_SIZE(attr_cnt));
+ }
+ return (ENOMEM);
+ }
+ zkobj->zko_attr_count = attr_cnt;
+ zkobj->zko_kobj_type.default_attrs = zkobj->zko_default_attrs;
+
+ if (child_cnt > 0) {
+ zkobj->zko_children = kmem_zalloc(CHILD_TABLE_SIZE(child_cnt),
+ KM_SLEEP);
+ if (zkobj->zko_children == NULL) {
+ if (zkobj->zko_default_attrs != NULL) {
+ kmem_free(zkobj->zko_default_attrs,
+ DEFAULT_ATTR_SIZE(attr_cnt));
+ }
+ if (zkobj->zko_attr_list != NULL) {
+ kmem_free(zkobj->zko_attr_list,
+ ATTR_TABLE_SIZE(attr_cnt));
+ }
+ return (ENOMEM);
+ }
+ zkobj->zko_child_count = child_cnt;
+ }
+
+ zkobj->zko_sysfs_ops.show = show_func;
+ zkobj->zko_kobj_type.sysfs_ops = &zkobj->zko_sysfs_ops;
+ zkobj->zko_kobj_type.release = zfs_kobj_release;
+
+ return (0);
+}
+
+static int
+zfs_kobj_add(zfs_mod_kobj_t *zkobj, struct kobject *parent, const char *name)
+{
+ /* zko_default_attrs must be NULL terminated */
+ ASSERT(zkobj->zko_default_attrs != NULL);
+ ASSERT(zkobj->zko_default_attrs[zkobj->zko_attr_count] == NULL);
+
+ kobject_init(&zkobj->zko_kobj, &zkobj->zko_kobj_type);
+ return (kobject_add(&zkobj->zko_kobj, parent, name));
+}
+
+/*
+ * Each zfs property has these common attributes
+ */
+static const char *zprop_attrs[] = {
+ "type",
+ "readonly",
+ "setonce",
+ "visible",
+ "values",
+ "default",
+ "datasets" /* zfs properties only */
+};
+
+#define ZFS_PROP_ATTR_COUNT ARRAY_SIZE(zprop_attrs)
+#define ZPOOL_PROP_ATTR_COUNT (ZFS_PROP_ATTR_COUNT - 1)
+
+static const char *zprop_types[] = {
+ "number",
+ "string",
+ "index",
+};
+
+typedef struct zfs_type_map {
+ zfs_type_t ztm_type;
+ const char *ztm_name;
+} zfs_type_map_t;
+
+static zfs_type_map_t type_map[] = {
+ {ZFS_TYPE_FILESYSTEM, "filesystem"},
+ {ZFS_TYPE_SNAPSHOT, "snapshot"},
+ {ZFS_TYPE_VOLUME, "volume"},
+ {ZFS_TYPE_BOOKMARK, "bookmark"}
+};
+
+/*
+ * Show the content for a zfs property attribute
+ */
+static ssize_t
+zprop_sysfs_show(const char *attr_name, const zprop_desc_t *property,
+ char *buf, size_t buflen)
+{
+ const char *show_str;
+ char number[32];
+
+ /* For dataset properties list the dataset types that apply */
+ if (strcmp(attr_name, "datasets") == 0 &&
+ property->pd_types != ZFS_TYPE_POOL) {
+ int len = 0;
+
+ for (int i = 0; i < ARRAY_SIZE(type_map); i++) {
+ if (type_map[i].ztm_type & property->pd_types) {
+ len += snprintf(buf + len, buflen - len, "%s ",
+ type_map[i].ztm_name);
+ }
+ }
+ len += snprintf(buf + len, buflen - len, "\n");
+ return (len);
+ }
+
+ if (strcmp(attr_name, "type") == 0) {
+ show_str = zprop_types[property->pd_proptype];
+ } else if (strcmp(attr_name, "readonly") == 0) {
+ show_str = property->pd_attr == PROP_READONLY ? "1" : "0";
+ } else if (strcmp(attr_name, "setonce") == 0) {
+ show_str = property->pd_attr == PROP_ONETIME ? "1" : "0";
+ } else if (strcmp(attr_name, "visible") == 0) {
+ show_str = property->pd_visible ? "1" : "0";
+ } else if (strcmp(attr_name, "values") == 0) {
+ show_str = property->pd_values ? property->pd_values : "";
+ } else if (strcmp(attr_name, "default") == 0) {
+ switch (property->pd_proptype) {
+ case PROP_TYPE_NUMBER:
+ (void) snprintf(number, sizeof (number), "%llu",
+ (u_longlong_t)property->pd_numdefault);
+ show_str = number;
+ break;
+ case PROP_TYPE_STRING:
+ show_str = property->pd_strdefault ?
+ property->pd_strdefault : "";
+ break;
+ case PROP_TYPE_INDEX:
+ if (zprop_index_to_string(property->pd_propnum,
+ property->pd_numdefault, &show_str,
+ property->pd_types) != 0) {
+ show_str = "";
+ }
+ break;
+ default:
+ return (0);
+ }
+ } else {
+ return (0);
+ }
+
+ return (snprintf(buf, buflen, "%s\n", show_str));
+}
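+
+/*
+ * Illustrative output (not part of the original change): reading the
+ * "default" attribute of a number-typed property such as recordsize
+ * returns its pd_numdefault as a decimal string ("131072\n"), and reading
+ * "datasets" returns the space-separated dataset types the property
+ * applies to (e.g. "filesystem").
+ */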
+
+static ssize_t
+dataset_property_show(struct kobject *kobj, struct attribute *attr, char *buf)
+{
+ zfs_prop_t prop = zfs_name_to_prop(kobject_name(kobj));
+ zprop_desc_t *prop_tbl = zfs_prop_get_table();
+ ssize_t len;
+
+ ASSERT3U(prop, <, ZFS_NUM_PROPS);
+
+ len = zprop_sysfs_show(attr->name, &prop_tbl[prop], buf, PAGE_SIZE);
+
+ return (len);
+}
+
+static ssize_t
+pool_property_show(struct kobject *kobj, struct attribute *attr, char *buf)
+{
+ zpool_prop_t prop = zpool_name_to_prop(kobject_name(kobj));
+ zprop_desc_t *prop_tbl = zpool_prop_get_table();
+ ssize_t len;
+
+ ASSERT3U(prop, <, ZPOOL_NUM_PROPS);
+
+ len = zprop_sysfs_show(attr->name, &prop_tbl[prop], buf, PAGE_SIZE);
+
+ return (len);
+}
+
+/*
+ * ZFS kernel feature attributes for '/sys/module/zfs/features.kernel'
+ *
+ * This list is intended for kernel features that don't have a pool feature
+ * association or that extend existing user/kernel interfaces.
+ *
+ * A user process can easily check whether the running zfs kernel module
+ * supports a new feature.
+ */
+static const char *zfs_kernel_features[] = {
+ /* --> Add new kernel features here */
+ "com.delphix:vdev_initialize",
+ "org.zfsonlinux:vdev_trim",
+};
+
+#define KERNEL_FEATURE_COUNT ARRAY_SIZE(zfs_kernel_features)
+
+static ssize_t
+kernel_feature_show(struct kobject *kobj, struct attribute *attr, char *buf)
+{
+ if (strcmp(attr->name, "supported") == 0)
+ return (snprintf(buf, PAGE_SIZE, "yes\n"));
+ return (0);
+}
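+
+/*
+ * From user space (illustrative, not part of the original change),
+ * checking for a kernel feature reduces to reading a single file, e.g.:
+ *
+ *	$ cat /sys/module/zfs/features.kernel/org.zfsonlinux:vdev_trim/supported
+ *	yes
+ */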
+
+static void
+kernel_feature_to_kobj(zfs_mod_kobj_t *parent, int slot, const char *name)
+{
+ zfs_mod_kobj_t *zfs_kobj = &parent->zko_children[slot];
+
+ ASSERT3U(slot, <, KERNEL_FEATURE_COUNT);
+ ASSERT(name);
+
+ int err = zfs_kobj_init(zfs_kobj, 1, 0, kernel_feature_show);
+ if (err)
+ return;
+
+ zfs_kobj_add_attr(zfs_kobj, 0, "supported");
+
+ err = zfs_kobj_add(zfs_kobj, &parent->zko_kobj, name);
+ if (err)
+ zfs_kobj_release(&zfs_kobj->zko_kobj);
+}
+
+static int
+zfs_kernel_features_init(zfs_mod_kobj_t *zfs_kobj, struct kobject *parent)
+{
+ /*
+ * Create a parent kobject to host kernel features.
+ *
+ * '/sys/module/zfs/features.kernel'
+ */
+ int err = zfs_kobj_init(zfs_kobj, 0, KERNEL_FEATURE_COUNT,
+ kernel_feature_show);
+ if (err)
+ return (err);
+ err = zfs_kobj_add(zfs_kobj, parent, ZFS_SYSFS_KERNEL_FEATURES);
+ if (err) {
+ zfs_kobj_release(&zfs_kobj->zko_kobj);
+ return (err);
+ }
+
+ /*
+ * Now create a kobject for each feature.
+ *
+ * '/sys/module/zfs/features.kernel/<feature>'
+ */
+ for (int f = 0; f < KERNEL_FEATURE_COUNT; f++)
+ kernel_feature_to_kobj(zfs_kobj, f, zfs_kernel_features[f]);
+
+ return (0);
+}
+
+/*
+ * Each pool feature has these common attributes
+ */
+static const char *pool_feature_attrs[] = {
+ "description",
+ "guid",
+ "uname",
+ "readonly_compatible",
+ "required_for_mos",
+ "activate_on_enable",
+ "per_dataset"
+};
+
+#define ZPOOL_FEATURE_ATTR_COUNT ARRAY_SIZE(pool_feature_attrs)
+
+/*
+ * Show the content for the given zfs pool feature attribute
+ */
+static ssize_t
+pool_feature_show(struct kobject *kobj, struct attribute *attr, char *buf)
+{
+ spa_feature_t fid;
+
+ if (zfeature_lookup_guid(kobject_name(kobj), &fid) != 0)
+ return (0);
+
+ ASSERT3U(fid, <, SPA_FEATURES);
+
+ zfeature_flags_t flags = spa_feature_table[fid].fi_flags;
+ const char *show_str = NULL;
+
+ if (strcmp(attr->name, "description") == 0) {
+ show_str = spa_feature_table[fid].fi_desc;
+ } else if (strcmp(attr->name, "guid") == 0) {
+ show_str = spa_feature_table[fid].fi_guid;
+ } else if (strcmp(attr->name, "uname") == 0) {
+ show_str = spa_feature_table[fid].fi_uname;
+ } else if (strcmp(attr->name, "readonly_compatible") == 0) {
+ show_str = flags & ZFEATURE_FLAG_READONLY_COMPAT ? "1" : "0";
+ } else if (strcmp(attr->name, "required_for_mos") == 0) {
+ show_str = flags & ZFEATURE_FLAG_MOS ? "1" : "0";
+ } else if (strcmp(attr->name, "activate_on_enable") == 0) {
+ show_str = flags & ZFEATURE_FLAG_ACTIVATE_ON_ENABLE ? "1" : "0";
+ } else if (strcmp(attr->name, "per_dataset") == 0) {
+ show_str = flags & ZFEATURE_FLAG_PER_DATASET ? "1" : "0";
+ }
+ if (show_str == NULL)
+ return (0);
+
+ return (snprintf(buf, PAGE_SIZE, "%s\n", show_str));
+}
+
+static void
+pool_feature_to_kobj(zfs_mod_kobj_t *parent, spa_feature_t fid,
+ const char *name)
+{
+ zfs_mod_kobj_t *zfs_kobj = &parent->zko_children[fid];
+
+ ASSERT3U(fid, <, SPA_FEATURES);
+ ASSERT(name);
+
+ int err = zfs_kobj_init(zfs_kobj, ZPOOL_FEATURE_ATTR_COUNT, 0,
+ pool_feature_show);
+ if (err)
+ return;
+
+ for (int i = 0; i < ZPOOL_FEATURE_ATTR_COUNT; i++)
+ zfs_kobj_add_attr(zfs_kobj, i, pool_feature_attrs[i]);
+
+ err = zfs_kobj_add(zfs_kobj, &parent->zko_kobj, name);
+ if (err)
+ zfs_kobj_release(&zfs_kobj->zko_kobj);
+}
+
+static int
+zfs_pool_features_init(zfs_mod_kobj_t *zfs_kobj, struct kobject *parent)
+{
+ /*
+ * Create a parent kobject to host pool features.
+ *
+ * '/sys/module/zfs/features.pool'
+ */
+ int err = zfs_kobj_init(zfs_kobj, 0, SPA_FEATURES, pool_feature_show);
+ if (err)
+ return (err);
+ err = zfs_kobj_add(zfs_kobj, parent, ZFS_SYSFS_POOL_FEATURES);
+ if (err) {
+ zfs_kobj_release(&zfs_kobj->zko_kobj);
+ return (err);
+ }
+
+ /*
+ * Now create a kobject for each feature.
+ *
+ * '/sys/module/zfs/features.pool/<feature>'
+ */
+ for (spa_feature_t i = 0; i < SPA_FEATURES; i++)
+ pool_feature_to_kobj(zfs_kobj, i, spa_feature_table[i].fi_guid);
+
+ return (0);
+}
+
+typedef struct prop_to_kobj_arg {
+ zprop_desc_t *p2k_table;
+ zfs_mod_kobj_t *p2k_parent;
+ sysfs_show_func p2k_show_func;
+ int p2k_attr_count;
+} prop_to_kobj_arg_t;
+
+static int
+zprop_to_kobj(int prop, void *args)
+{
+ prop_to_kobj_arg_t *data = args;
+ zfs_mod_kobj_t *parent = data->p2k_parent;
+ zfs_mod_kobj_t *zfs_kobj = &parent->zko_children[prop];
+ const char *name = data->p2k_table[prop].pd_name;
+ int err;
+
+ ASSERT(name);
+
+ err = zfs_kobj_init(zfs_kobj, data->p2k_attr_count, 0,
+ data->p2k_show_func);
+ if (err)
+ return (ZPROP_CONT);
+
+ for (int i = 0; i < data->p2k_attr_count; i++)
+ zfs_kobj_add_attr(zfs_kobj, i, zprop_attrs[i]);
+
+ err = zfs_kobj_add(zfs_kobj, &parent->zko_kobj, name);
+ if (err)
+ zfs_kobj_release(&zfs_kobj->zko_kobj);
+
+ return (ZPROP_CONT);
+}
+
+static int
+zfs_sysfs_properties_init(zfs_mod_kobj_t *zfs_kobj, struct kobject *parent,
+ zfs_type_t type)
+{
+ prop_to_kobj_arg_t context;
+ const char *name;
+ int err;
+
+ /*
+ * Create a parent kobject to host properties.
+ *
+ * '/sys/module/zfs/properties.<type>'
+ */
+ if (type == ZFS_TYPE_POOL) {
+ name = ZFS_SYSFS_POOL_PROPERTIES;
+ context.p2k_table = zpool_prop_get_table();
+ context.p2k_attr_count = ZPOOL_PROP_ATTR_COUNT;
+ context.p2k_parent = zfs_kobj;
+ context.p2k_show_func = pool_property_show;
+ err = zfs_kobj_init(zfs_kobj, 0, ZPOOL_NUM_PROPS,
+ pool_property_show);
+ } else {
+ name = ZFS_SYSFS_DATASET_PROPERTIES;
+ context.p2k_table = zfs_prop_get_table();
+ context.p2k_attr_count = ZFS_PROP_ATTR_COUNT;
+ context.p2k_parent = zfs_kobj;
+ context.p2k_show_func = dataset_property_show;
+ err = zfs_kobj_init(zfs_kobj, 0, ZFS_NUM_PROPS,
+ dataset_property_show);
+ }
+
+ if (err)
+ return (err);
+
+ err = zfs_kobj_add(zfs_kobj, parent, name);
+ if (err) {
+ zfs_kobj_release(&zfs_kobj->zko_kobj);
+ return (err);
+ }
+
+ /*
+ * Create a kobject for each property.
+ *
+ * '/sys/module/zfs/properties.<type>/<property>'
+ */
+ (void) zprop_iter_common(zprop_to_kobj, &context, B_TRUE,
+ B_FALSE, type);
+
+ return (err);
+}
+
+void
+zfs_sysfs_init(void)
+{
+ struct kobject *parent;
+#if defined(CONFIG_ZFS) && !defined(CONFIG_ZFS_MODULE)
+ parent = kobject_create_and_add("zfs", fs_kobj);
+#else
+ parent = &(((struct module *)(THIS_MODULE))->mkobj).kobj;
+#endif
+ int err;
+
+ if (parent == NULL)
+ return;
+
+ err = zfs_kernel_features_init(&kernel_features_kobj, parent);
+ if (err)
+ return;
+
+ err = zfs_pool_features_init(&pool_features_kobj, parent);
+ if (err) {
+ zfs_kobj_fini(&kernel_features_kobj);
+ return;
+ }
+
+ err = zfs_sysfs_properties_init(&pool_props_kobj, parent,
+ ZFS_TYPE_POOL);
+ if (err) {
+ zfs_kobj_fini(&kernel_features_kobj);
+ zfs_kobj_fini(&pool_features_kobj);
+ return;
+ }
+
+ err = zfs_sysfs_properties_init(&dataset_props_kobj, parent,
+ ZFS_TYPE_FILESYSTEM);
+ if (err) {
+ zfs_kobj_fini(&kernel_features_kobj);
+ zfs_kobj_fini(&pool_features_kobj);
+ zfs_kobj_fini(&pool_props_kobj);
+ return;
+ }
+}
+
+void
+zfs_sysfs_fini(void)
+{
+ /*
+ * Remove top-level kobjects; each will remove any child kobjects.
+ */
+ zfs_kobj_fini(&kernel_features_kobj);
+ zfs_kobj_fini(&pool_features_kobj);
+ zfs_kobj_fini(&dataset_props_kobj);
+ zfs_kobj_fini(&pool_props_kobj);
+}
diff --git a/module/os/linux/zfs/zfs_vfsops.c b/module/os/linux/zfs/zfs_vfsops.c
new file mode 100644
index 000000000..0914e4b7d
--- /dev/null
+++ b/module/os/linux/zfs/zfs_vfsops.c
@@ -0,0 +1,2562 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
+ */
+
+/* Portions Copyright 2010 Robert Milkowski */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/sysmacros.h>
+#include <sys/kmem.h>
+#include <sys/pathname.h>
+#include <sys/vnode.h>
+#include <sys/vfs.h>
+#include <sys/mntent.h>
+#include <sys/cmn_err.h>
+#include <sys/zfs_znode.h>
+#include <sys/zfs_vnops.h>
+#include <sys/zfs_dir.h>
+#include <sys/zil.h>
+#include <sys/fs/zfs.h>
+#include <sys/dmu.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_deleg.h>
+#include <sys/spa.h>
+#include <sys/zap.h>
+#include <sys/sa.h>
+#include <sys/sa_impl.h>
+#include <sys/policy.h>
+#include <sys/atomic.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zfs_ctldir.h>
+#include <sys/zfs_fuid.h>
+#include <sys/sunddi.h>
+#include <sys/dmu_objset.h>
+#include <sys/spa_boot.h>
+#include <sys/objlist.h>
+#include <sys/zpl.h>
+#include <linux/vfs_compat.h>
+#include "zfs_comutil.h"
+
+enum {
+ TOKEN_RO,
+ TOKEN_RW,
+ TOKEN_SETUID,
+ TOKEN_NOSETUID,
+ TOKEN_EXEC,
+ TOKEN_NOEXEC,
+ TOKEN_DEVICES,
+ TOKEN_NODEVICES,
+ TOKEN_DIRXATTR,
+ TOKEN_SAXATTR,
+ TOKEN_XATTR,
+ TOKEN_NOXATTR,
+ TOKEN_ATIME,
+ TOKEN_NOATIME,
+ TOKEN_RELATIME,
+ TOKEN_NORELATIME,
+ TOKEN_NBMAND,
+ TOKEN_NONBMAND,
+ TOKEN_MNTPOINT,
+ TOKEN_LAST,
+};
+
+static const match_table_t zpl_tokens = {
+ { TOKEN_RO, MNTOPT_RO },
+ { TOKEN_RW, MNTOPT_RW },
+ { TOKEN_SETUID, MNTOPT_SETUID },
+ { TOKEN_NOSETUID, MNTOPT_NOSETUID },
+ { TOKEN_EXEC, MNTOPT_EXEC },
+ { TOKEN_NOEXEC, MNTOPT_NOEXEC },
+ { TOKEN_DEVICES, MNTOPT_DEVICES },
+ { TOKEN_NODEVICES, MNTOPT_NODEVICES },
+ { TOKEN_DIRXATTR, MNTOPT_DIRXATTR },
+ { TOKEN_SAXATTR, MNTOPT_SAXATTR },
+ { TOKEN_XATTR, MNTOPT_XATTR },
+ { TOKEN_NOXATTR, MNTOPT_NOXATTR },
+ { TOKEN_ATIME, MNTOPT_ATIME },
+ { TOKEN_NOATIME, MNTOPT_NOATIME },
+ { TOKEN_RELATIME, MNTOPT_RELATIME },
+ { TOKEN_NORELATIME, MNTOPT_NORELATIME },
+ { TOKEN_NBMAND, MNTOPT_NBMAND },
+ { TOKEN_NONBMAND, MNTOPT_NONBMAND },
+ { TOKEN_MNTPOINT, MNTOPT_MNTPOINT "=%s" },
+ { TOKEN_LAST, NULL },
+};
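+
+/*
+ * Example (sketch, not part of the original change): a raw option string
+ * such as "ro,noatime" is split on commas in zfsvfs_parse_options() below;
+ * match_token() maps each piece to a TOKEN_* value (TOKEN_RO and
+ * TOKEN_NOATIME here), and zfsvfs_parse_option() records both the option
+ * value and the fact that it was explicitly set (the vfs_do_* flags).
+ */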
+
+static void
+zfsvfs_vfs_free(vfs_t *vfsp)
+{
+ if (vfsp != NULL) {
+ if (vfsp->vfs_mntpoint != NULL)
+ strfree(vfsp->vfs_mntpoint);
+
+ kmem_free(vfsp, sizeof (vfs_t));
+ }
+}
+
+static int
+zfsvfs_parse_option(char *option, int token, substring_t *args, vfs_t *vfsp)
+{
+ switch (token) {
+ case TOKEN_RO:
+ vfsp->vfs_readonly = B_TRUE;
+ vfsp->vfs_do_readonly = B_TRUE;
+ break;
+ case TOKEN_RW:
+ vfsp->vfs_readonly = B_FALSE;
+ vfsp->vfs_do_readonly = B_TRUE;
+ break;
+ case TOKEN_SETUID:
+ vfsp->vfs_setuid = B_TRUE;
+ vfsp->vfs_do_setuid = B_TRUE;
+ break;
+ case TOKEN_NOSETUID:
+ vfsp->vfs_setuid = B_FALSE;
+ vfsp->vfs_do_setuid = B_TRUE;
+ break;
+ case TOKEN_EXEC:
+ vfsp->vfs_exec = B_TRUE;
+ vfsp->vfs_do_exec = B_TRUE;
+ break;
+ case TOKEN_NOEXEC:
+ vfsp->vfs_exec = B_FALSE;
+ vfsp->vfs_do_exec = B_TRUE;
+ break;
+ case TOKEN_DEVICES:
+ vfsp->vfs_devices = B_TRUE;
+ vfsp->vfs_do_devices = B_TRUE;
+ break;
+ case TOKEN_NODEVICES:
+ vfsp->vfs_devices = B_FALSE;
+ vfsp->vfs_do_devices = B_TRUE;
+ break;
+ case TOKEN_DIRXATTR:
+ vfsp->vfs_xattr = ZFS_XATTR_DIR;
+ vfsp->vfs_do_xattr = B_TRUE;
+ break;
+ case TOKEN_SAXATTR:
+ vfsp->vfs_xattr = ZFS_XATTR_SA;
+ vfsp->vfs_do_xattr = B_TRUE;
+ break;
+ case TOKEN_XATTR:
+ vfsp->vfs_xattr = ZFS_XATTR_DIR;
+ vfsp->vfs_do_xattr = B_TRUE;
+ break;
+ case TOKEN_NOXATTR:
+ vfsp->vfs_xattr = ZFS_XATTR_OFF;
+ vfsp->vfs_do_xattr = B_TRUE;
+ break;
+ case TOKEN_ATIME:
+ vfsp->vfs_atime = B_TRUE;
+ vfsp->vfs_do_atime = B_TRUE;
+ break;
+ case TOKEN_NOATIME:
+ vfsp->vfs_atime = B_FALSE;
+ vfsp->vfs_do_atime = B_TRUE;
+ break;
+ case TOKEN_RELATIME:
+ vfsp->vfs_relatime = B_TRUE;
+ vfsp->vfs_do_relatime = B_TRUE;
+ break;
+ case TOKEN_NORELATIME:
+ vfsp->vfs_relatime = B_FALSE;
+ vfsp->vfs_do_relatime = B_TRUE;
+ break;
+ case TOKEN_NBMAND:
+ vfsp->vfs_nbmand = B_TRUE;
+ vfsp->vfs_do_nbmand = B_TRUE;
+ break;
+ case TOKEN_NONBMAND:
+ vfsp->vfs_nbmand = B_FALSE;
+ vfsp->vfs_do_nbmand = B_TRUE;
+ break;
+ case TOKEN_MNTPOINT:
+ vfsp->vfs_mntpoint = match_strdup(&args[0]);
+ if (vfsp->vfs_mntpoint == NULL)
+ return (SET_ERROR(ENOMEM));
+
+ break;
+ default:
+ break;
+ }
+
+ return (0);
+}
+
+/*
+ * Parse the raw mntopts and return a vfs_t describing the options.
+ */
+static int
+zfsvfs_parse_options(char *mntopts, vfs_t **vfsp)
+{
+ vfs_t *tmp_vfsp;
+ int error;
+
+ tmp_vfsp = kmem_zalloc(sizeof (vfs_t), KM_SLEEP);
+
+ if (mntopts != NULL) {
+ substring_t args[MAX_OPT_ARGS];
+ char *tmp_mntopts, *p, *t;
+ int token;
+
+ tmp_mntopts = t = strdup(mntopts);
+ if (tmp_mntopts == NULL) {
+ /* don't leak the vfs_t allocated above */
+ zfsvfs_vfs_free(tmp_vfsp);
+ return (SET_ERROR(ENOMEM));
+ }
+
+ while ((p = strsep(&t, ",")) != NULL) {
+ if (!*p)
+ continue;
+
+ args[0].to = args[0].from = NULL;
+ token = match_token(p, zpl_tokens, args);
+ error = zfsvfs_parse_option(p, token, args, tmp_vfsp);
+ if (error) {
+ strfree(tmp_mntopts);
+ zfsvfs_vfs_free(tmp_vfsp);
+ return (error);
+ }
+ }
+
+ strfree(tmp_mntopts);
+ }
+
+ *vfsp = tmp_vfsp;
+
+ return (0);
+}
+
+boolean_t
+zfs_is_readonly(zfsvfs_t *zfsvfs)
+{
+ return (!!(zfsvfs->z_sb->s_flags & SB_RDONLY));
+}
+
+/*ARGSUSED*/
+int
+zfs_sync(struct super_block *sb, int wait, cred_t *cr)
+{
+ zfsvfs_t *zfsvfs = sb->s_fs_info;
+
+ /*
+ * Semantically, the only requirement is that the sync be initiated.
+ * The DMU syncs out txgs frequently, so there's nothing to do.
+ */
+ if (!wait)
+ return (0);
+
+ if (zfsvfs != NULL) {
+ /*
+ * Sync a specific filesystem.
+ */
+ dsl_pool_t *dp;
+
+ ZFS_ENTER(zfsvfs);
+ dp = dmu_objset_pool(zfsvfs->z_os);
+
+ /*
+ * If the system is shutting down, then skip any
+ * filesystems which may exist on a suspended pool.
+ */
+ if (spa_suspended(dp->dp_spa)) {
+ ZFS_EXIT(zfsvfs);
+ return (0);
+ }
+
+ if (zfsvfs->z_log != NULL)
+ zil_commit(zfsvfs->z_log, 0);
+
+ ZFS_EXIT(zfsvfs);
+ } else {
+ /*
+ * Sync all ZFS filesystems. This is what happens when you
+ * run sync(1M). Unlike other filesystems, ZFS honors the
+ * request by waiting for all pools to commit all dirty data.
+ */
+ spa_sync_allpools();
+ }
+
+ return (0);
+}
+
+static void
+atime_changed_cb(void *arg, uint64_t newval)
+{
+ zfsvfs_t *zfsvfs = arg;
+ struct super_block *sb = zfsvfs->z_sb;
+
+ if (sb == NULL)
+ return;
+ /*
+ * Update the SB_NOATIME bit in the VFS super block. Since atime updates
+ * are determined by atime_needs_update(), it needs to return false if
+ * atime is turned off, and not unconditionally return false if atime
+ * is turned on.
+ */
+ if (newval)
+ sb->s_flags &= ~SB_NOATIME;
+ else
+ sb->s_flags |= SB_NOATIME;
+}
+
+static void
+relatime_changed_cb(void *arg, uint64_t newval)
+{
+ ((zfsvfs_t *)arg)->z_relatime = newval;
+}
+
+static void
+xattr_changed_cb(void *arg, uint64_t newval)
+{
+ zfsvfs_t *zfsvfs = arg;
+
+ if (newval == ZFS_XATTR_OFF) {
+ zfsvfs->z_flags &= ~ZSB_XATTR;
+ } else {
+ zfsvfs->z_flags |= ZSB_XATTR;
+
+ if (newval == ZFS_XATTR_SA)
+ zfsvfs->z_xattr_sa = B_TRUE;
+ else
+ zfsvfs->z_xattr_sa = B_FALSE;
+ }
+}
+
+static void
+acltype_changed_cb(void *arg, uint64_t newval)
+{
+ zfsvfs_t *zfsvfs = arg;
+
+ switch (newval) {
+ case ZFS_ACLTYPE_OFF:
+ zfsvfs->z_acl_type = ZFS_ACLTYPE_OFF;
+ zfsvfs->z_sb->s_flags &= ~SB_POSIXACL;
+ break;
+ case ZFS_ACLTYPE_POSIXACL:
+#ifdef CONFIG_FS_POSIX_ACL
+ zfsvfs->z_acl_type = ZFS_ACLTYPE_POSIXACL;
+ zfsvfs->z_sb->s_flags |= SB_POSIXACL;
+#else
+ zfsvfs->z_acl_type = ZFS_ACLTYPE_OFF;
+ zfsvfs->z_sb->s_flags &= ~SB_POSIXACL;
+#endif /* CONFIG_FS_POSIX_ACL */
+ break;
+ default:
+ break;
+ }
+}
+
+static void
+blksz_changed_cb(void *arg, uint64_t newval)
+{
+ zfsvfs_t *zfsvfs = arg;
+ ASSERT3U(newval, <=, spa_maxblocksize(dmu_objset_spa(zfsvfs->z_os)));
+ ASSERT3U(newval, >=, SPA_MINBLOCKSIZE);
+ ASSERT(ISP2(newval));
+
+ zfsvfs->z_max_blksz = newval;
+}
+
+static void
+readonly_changed_cb(void *arg, uint64_t newval)
+{
+ zfsvfs_t *zfsvfs = arg;
+ struct super_block *sb = zfsvfs->z_sb;
+
+ if (sb == NULL)
+ return;
+
+ if (newval)
+ sb->s_flags |= SB_RDONLY;
+ else
+ sb->s_flags &= ~SB_RDONLY;
+}
+
+static void
+devices_changed_cb(void *arg, uint64_t newval)
+{
+}
+
+static void
+setuid_changed_cb(void *arg, uint64_t newval)
+{
+}
+
+static void
+exec_changed_cb(void *arg, uint64_t newval)
+{
+}
+
+static void
+nbmand_changed_cb(void *arg, uint64_t newval)
+{
+ zfsvfs_t *zfsvfs = arg;
+ struct super_block *sb = zfsvfs->z_sb;
+
+ if (sb == NULL)
+ return;
+
+ if (newval == TRUE)
+ sb->s_flags |= SB_MANDLOCK;
+ else
+ sb->s_flags &= ~SB_MANDLOCK;
+}
+
+static void
+snapdir_changed_cb(void *arg, uint64_t newval)
+{
+ ((zfsvfs_t *)arg)->z_show_ctldir = newval;
+}
+
+static void
+vscan_changed_cb(void *arg, uint64_t newval)
+{
+ ((zfsvfs_t *)arg)->z_vscan = newval;
+}
+
+static void
+acl_inherit_changed_cb(void *arg, uint64_t newval)
+{
+ ((zfsvfs_t *)arg)->z_acl_inherit = newval;
+}
+
+static int
+zfs_register_callbacks(vfs_t *vfsp)
+{
+ struct dsl_dataset *ds = NULL;
+ objset_t *os = NULL;
+ zfsvfs_t *zfsvfs = NULL;
+ int error = 0;
+
+ ASSERT(vfsp);
+ zfsvfs = vfsp->vfs_data;
+ ASSERT(zfsvfs);
+ os = zfsvfs->z_os;
+
+ /*
+ * The act of registering our callbacks will destroy any mount
+ * options we may have. In order to enable temporary overrides
+ * of mount options, we stash away the current values and
+ * restore them after we register the callbacks.
+ */
+ if (zfs_is_readonly(zfsvfs) || !spa_writeable(dmu_objset_spa(os))) {
+ vfsp->vfs_do_readonly = B_TRUE;
+ vfsp->vfs_readonly = B_TRUE;
+ }
+
+ /*
+ * Register property callbacks.
+ *
+ * It would probably be fine to just check for i/o error from
+ * the first prop_register(), but I guess I like to go
+ * overboard...
+ */
+ ds = dmu_objset_ds(os);
+ dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
+ error = dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_ATIME), atime_changed_cb, zfsvfs);
+ error = error ? error : dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_RELATIME), relatime_changed_cb, zfsvfs);
+ error = error ? error : dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_XATTR), xattr_changed_cb, zfsvfs);
+ error = error ? error : dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_RECORDSIZE), blksz_changed_cb, zfsvfs);
+ error = error ? error : dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_READONLY), readonly_changed_cb, zfsvfs);
+ error = error ? error : dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_DEVICES), devices_changed_cb, zfsvfs);
+ error = error ? error : dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_SETUID), setuid_changed_cb, zfsvfs);
+ error = error ? error : dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_EXEC), exec_changed_cb, zfsvfs);
+ error = error ? error : dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_SNAPDIR), snapdir_changed_cb, zfsvfs);
+ error = error ? error : dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_ACLTYPE), acltype_changed_cb, zfsvfs);
+ error = error ? error : dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_ACLINHERIT), acl_inherit_changed_cb,
+ zfsvfs);
+ error = error ? error : dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_VSCAN), vscan_changed_cb, zfsvfs);
+ error = error ? error : dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_NBMAND), nbmand_changed_cb, zfsvfs);
+ dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
+ if (error)
+ goto unregister;
+
+ /*
+ * Invoke our callbacks to restore temporary mount options.
+ */
+ if (vfsp->vfs_do_readonly)
+ readonly_changed_cb(zfsvfs, vfsp->vfs_readonly);
+ if (vfsp->vfs_do_setuid)
+ setuid_changed_cb(zfsvfs, vfsp->vfs_setuid);
+ if (vfsp->vfs_do_exec)
+ exec_changed_cb(zfsvfs, vfsp->vfs_exec);
+ if (vfsp->vfs_do_devices)
+ devices_changed_cb(zfsvfs, vfsp->vfs_devices);
+ if (vfsp->vfs_do_xattr)
+ xattr_changed_cb(zfsvfs, vfsp->vfs_xattr);
+ if (vfsp->vfs_do_atime)
+ atime_changed_cb(zfsvfs, vfsp->vfs_atime);
+ if (vfsp->vfs_do_relatime)
+ relatime_changed_cb(zfsvfs, vfsp->vfs_relatime);
+ if (vfsp->vfs_do_nbmand)
+ nbmand_changed_cb(zfsvfs, vfsp->vfs_nbmand);
+
+ return (0);
+
+unregister:
+ dsl_prop_unregister_all(ds, zfsvfs);
+ return (error);
+}
+
+static int
+zfs_space_delta_cb(dmu_object_type_t bonustype, void *data,
+ uint64_t *userp, uint64_t *groupp, uint64_t *projectp)
+{
+ sa_hdr_phys_t sa;
+ sa_hdr_phys_t *sap = data;
+ uint64_t flags;
+ int hdrsize;
+ boolean_t swap = B_FALSE;
+
+ /*
+ * Is it a valid type of object to track?
+ */
+ if (bonustype != DMU_OT_ZNODE && bonustype != DMU_OT_SA)
+ return (SET_ERROR(ENOENT));
+
+ /*
+ * If we have a NULL data pointer, then assume the IDs aren't
+ * changing and return EEXIST to the DMU to let it know to use
+ * the same IDs.
+ */
+ if (data == NULL)
+ return (SET_ERROR(EEXIST));
+
+ if (bonustype == DMU_OT_ZNODE) {
+ znode_phys_t *znp = data;
+ *userp = znp->zp_uid;
+ *groupp = znp->zp_gid;
+ *projectp = ZFS_DEFAULT_PROJID;
+ return (0);
+ }
+
+ if (sap->sa_magic == 0) {
+ /*
+ * This should only happen for newly created files
+ * that haven't had the znode data filled in yet.
+ */
+ *userp = 0;
+ *groupp = 0;
+ *projectp = ZFS_DEFAULT_PROJID;
+ return (0);
+ }
+
+ sa = *sap;
+ if (sa.sa_magic == BSWAP_32(SA_MAGIC)) {
+ sa.sa_magic = SA_MAGIC;
+ sa.sa_layout_info = BSWAP_16(sa.sa_layout_info);
+ swap = B_TRUE;
+ } else {
+ VERIFY3U(sa.sa_magic, ==, SA_MAGIC);
+ }
+
+ hdrsize = sa_hdrsize(&sa);
+ VERIFY3U(hdrsize, >=, sizeof (sa_hdr_phys_t));
+
+ *userp = *((uint64_t *)((uintptr_t)data + hdrsize + SA_UID_OFFSET));
+ *groupp = *((uint64_t *)((uintptr_t)data + hdrsize + SA_GID_OFFSET));
+ flags = *((uint64_t *)((uintptr_t)data + hdrsize + SA_FLAGS_OFFSET));
+ if (swap)
+ flags = BSWAP_64(flags);
+
+ if (flags & ZFS_PROJID)
+ *projectp = *((uint64_t *)((uintptr_t)data + hdrsize +
+ SA_PROJID_OFFSET));
+ else
+ *projectp = ZFS_DEFAULT_PROJID;
+
+ if (swap) {
+ *userp = BSWAP_64(*userp);
+ *groupp = BSWAP_64(*groupp);
+ *projectp = BSWAP_64(*projectp);
+ }
+ return (0);
+}
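+
+/*
+ * Layout sketch (illustrative, not part of the original change) of a
+ * DMU_OT_SA bonus buffer as consumed above: a variable-size sa_hdr_phys_t
+ * (length from sa_hdrsize()) is followed by the system attributes, with
+ *
+ *	uid    at hdrsize + SA_UID_OFFSET
+ *	gid    at hdrsize + SA_GID_OFFSET
+ *	flags  at hdrsize + SA_FLAGS_OFFSET
+ *	projid at hdrsize + SA_PROJID_OFFSET (only if ZFS_PROJID is set)
+ *
+ * and each field byte-swapped when sa_magic shows the buffer was written
+ * on an opposite-endian system.
+ */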
+
+static void
+fuidstr_to_sid(zfsvfs_t *zfsvfs, const char *fuidstr,
+ char *domainbuf, int buflen, uid_t *ridp)
+{
+ uint64_t fuid;
+ const char *domain;
+
+ fuid = zfs_strtonum(fuidstr, NULL);
+
+ domain = zfs_fuid_find_by_idx(zfsvfs, FUID_INDEX(fuid));
+ if (domain)
+ (void) strlcpy(domainbuf, domain, buflen);
+ else
+ domainbuf[0] = '\0';
+ *ridp = FUID_RID(fuid);
+}
+
+static uint64_t
+zfs_userquota_prop_to_obj(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type)
+{
+ switch (type) {
+ case ZFS_PROP_USERUSED:
+ case ZFS_PROP_USEROBJUSED:
+ return (DMU_USERUSED_OBJECT);
+ case ZFS_PROP_GROUPUSED:
+ case ZFS_PROP_GROUPOBJUSED:
+ return (DMU_GROUPUSED_OBJECT);
+ case ZFS_PROP_PROJECTUSED:
+ case ZFS_PROP_PROJECTOBJUSED:
+ return (DMU_PROJECTUSED_OBJECT);
+ case ZFS_PROP_USERQUOTA:
+ return (zfsvfs->z_userquota_obj);
+ case ZFS_PROP_GROUPQUOTA:
+ return (zfsvfs->z_groupquota_obj);
+ case ZFS_PROP_USEROBJQUOTA:
+ return (zfsvfs->z_userobjquota_obj);
+ case ZFS_PROP_GROUPOBJQUOTA:
+ return (zfsvfs->z_groupobjquota_obj);
+ case ZFS_PROP_PROJECTQUOTA:
+ return (zfsvfs->z_projectquota_obj);
+ case ZFS_PROP_PROJECTOBJQUOTA:
+ return (zfsvfs->z_projectobjquota_obj);
+ default:
+ return (ZFS_NO_OBJECT);
+ }
+}
+
+int
+zfs_userspace_many(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
+ uint64_t *cookiep, void *vbuf, uint64_t *bufsizep)
+{
+ int error;
+ zap_cursor_t zc;
+ zap_attribute_t za;
+ zfs_useracct_t *buf = vbuf;
+ uint64_t obj;
+ int offset = 0;
+
+ if (!dmu_objset_userspace_present(zfsvfs->z_os))
+ return (SET_ERROR(ENOTSUP));
+
+ if ((type == ZFS_PROP_PROJECTQUOTA || type == ZFS_PROP_PROJECTUSED ||
+ type == ZFS_PROP_PROJECTOBJQUOTA ||
+ type == ZFS_PROP_PROJECTOBJUSED) &&
+ !dmu_objset_projectquota_present(zfsvfs->z_os))
+ return (SET_ERROR(ENOTSUP));
+
+ if ((type == ZFS_PROP_USEROBJUSED || type == ZFS_PROP_GROUPOBJUSED ||
+ type == ZFS_PROP_USEROBJQUOTA || type == ZFS_PROP_GROUPOBJQUOTA ||
+ type == ZFS_PROP_PROJECTOBJUSED ||
+ type == ZFS_PROP_PROJECTOBJQUOTA) &&
+ !dmu_objset_userobjspace_present(zfsvfs->z_os))
+ return (SET_ERROR(ENOTSUP));
+
+ obj = zfs_userquota_prop_to_obj(zfsvfs, type);
+ if (obj == ZFS_NO_OBJECT) {
+ *bufsizep = 0;
+ return (0);
+ }
+
+ if (type == ZFS_PROP_USEROBJUSED || type == ZFS_PROP_GROUPOBJUSED ||
+ type == ZFS_PROP_PROJECTOBJUSED)
+ offset = DMU_OBJACCT_PREFIX_LEN;
+
+ for (zap_cursor_init_serialized(&zc, zfsvfs->z_os, obj, *cookiep);
+ (error = zap_cursor_retrieve(&zc, &za)) == 0;
+ zap_cursor_advance(&zc)) {
+ if ((uintptr_t)buf - (uintptr_t)vbuf + sizeof (zfs_useracct_t) >
+ *bufsizep)
+ break;
+
+ /*
+ * Skip object quota entries (ZAP names with the DMU_OBJACCT_PREFIX
+ * prefix) when dealing with block quota, and vice versa.
+ */
+ if ((offset > 0) != (strncmp(za.za_name, DMU_OBJACCT_PREFIX,
+ DMU_OBJACCT_PREFIX_LEN) == 0))
+ continue;
+
+ fuidstr_to_sid(zfsvfs, za.za_name + offset,
+ buf->zu_domain, sizeof (buf->zu_domain), &buf->zu_rid);
+
+ buf->zu_space = za.za_first_integer;
+ buf++;
+ }
+ if (error == ENOENT)
+ error = 0;
+
+ ASSERT3U((uintptr_t)buf - (uintptr_t)vbuf, <=, *bufsizep);
+ *bufsizep = (uintptr_t)buf - (uintptr_t)vbuf;
+ *cookiep = zap_cursor_serialize(&zc);
+ zap_cursor_fini(&zc);
+ return (error);
+}
+
+/*
+ * buf must be big enough (e.g., 32 bytes)
+ */
+static int
+id_to_fuidstr(zfsvfs_t *zfsvfs, const char *domain, uid_t rid,
+ char *buf, boolean_t addok)
+{
+ uint64_t fuid;
+ int domainid = 0;
+
+ if (domain && domain[0]) {
+ domainid = zfs_fuid_find_by_domain(zfsvfs, domain, NULL, addok);
+ if (domainid == -1)
+ return (SET_ERROR(ENOENT));
+ }
+ fuid = FUID_ENCODE(domainid, rid);
+ (void) sprintf(buf, "%llx", (longlong_t)fuid);
+ return (0);
+}
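+
+/*
+ * Example (illustrative, not part of the original change): for a POSIX id
+ * with no SMB domain (domainid 0) and rid 1000, the encoded fuid is simply
+ * 1000, so the key written into buf is the hex string "3e8".
+ */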
+
+int
+zfs_userspace_one(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
+ const char *domain, uint64_t rid, uint64_t *valp)
+{
+ char buf[20 + DMU_OBJACCT_PREFIX_LEN];
+ int offset = 0;
+ int err;
+ uint64_t obj;
+
+ *valp = 0;
+
+ if (!dmu_objset_userspace_present(zfsvfs->z_os))
+ return (SET_ERROR(ENOTSUP));
+
+ if ((type == ZFS_PROP_USEROBJUSED || type == ZFS_PROP_GROUPOBJUSED ||
+ type == ZFS_PROP_USEROBJQUOTA || type == ZFS_PROP_GROUPOBJQUOTA ||
+ type == ZFS_PROP_PROJECTOBJUSED ||
+ type == ZFS_PROP_PROJECTOBJQUOTA) &&
+ !dmu_objset_userobjspace_present(zfsvfs->z_os))
+ return (SET_ERROR(ENOTSUP));
+
+ if (type == ZFS_PROP_PROJECTQUOTA || type == ZFS_PROP_PROJECTUSED ||
+ type == ZFS_PROP_PROJECTOBJQUOTA ||
+ type == ZFS_PROP_PROJECTOBJUSED) {
+ if (!dmu_objset_projectquota_present(zfsvfs->z_os))
+ return (SET_ERROR(ENOTSUP));
+ if (!zpl_is_valid_projid(rid))
+ return (SET_ERROR(EINVAL));
+ }
+
+ obj = zfs_userquota_prop_to_obj(zfsvfs, type);
+ if (obj == ZFS_NO_OBJECT)
+ return (0);
+
+ if (type == ZFS_PROP_USEROBJUSED || type == ZFS_PROP_GROUPOBJUSED ||
+ type == ZFS_PROP_PROJECTOBJUSED) {
+ strlcpy(buf, DMU_OBJACCT_PREFIX, DMU_OBJACCT_PREFIX_LEN + 1);
+ offset = DMU_OBJACCT_PREFIX_LEN;
+ }
+
+ err = id_to_fuidstr(zfsvfs, domain, rid, buf + offset, B_FALSE);
+ if (err)
+ return (err);
+
+ err = zap_lookup(zfsvfs->z_os, obj, buf, 8, 1, valp);
+ if (err == ENOENT)
+ err = 0;
+ return (err);
+}
+
+int
+zfs_set_userquota(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
+ const char *domain, uint64_t rid, uint64_t quota)
+{
+ char buf[32];
+ int err;
+ dmu_tx_t *tx;
+ uint64_t *objp;
+ boolean_t fuid_dirtied;
+
+ if (zfsvfs->z_version < ZPL_VERSION_USERSPACE)
+ return (SET_ERROR(ENOTSUP));
+
+ switch (type) {
+ case ZFS_PROP_USERQUOTA:
+ objp = &zfsvfs->z_userquota_obj;
+ break;
+ case ZFS_PROP_GROUPQUOTA:
+ objp = &zfsvfs->z_groupquota_obj;
+ break;
+ case ZFS_PROP_USEROBJQUOTA:
+ objp = &zfsvfs->z_userobjquota_obj;
+ break;
+ case ZFS_PROP_GROUPOBJQUOTA:
+ objp = &zfsvfs->z_groupobjquota_obj;
+ break;
+ case ZFS_PROP_PROJECTQUOTA:
+ if (!dmu_objset_projectquota_enabled(zfsvfs->z_os))
+ return (SET_ERROR(ENOTSUP));
+ if (!zpl_is_valid_projid(rid))
+ return (SET_ERROR(EINVAL));
+
+ objp = &zfsvfs->z_projectquota_obj;
+ break;
+ case ZFS_PROP_PROJECTOBJQUOTA:
+ if (!dmu_objset_projectquota_enabled(zfsvfs->z_os))
+ return (SET_ERROR(ENOTSUP));
+ if (!zpl_is_valid_projid(rid))
+ return (SET_ERROR(EINVAL));
+
+ objp = &zfsvfs->z_projectobjquota_obj;
+ break;
+ default:
+ return (SET_ERROR(EINVAL));
+ }
+
+ err = id_to_fuidstr(zfsvfs, domain, rid, buf, B_TRUE);
+ if (err)
+ return (err);
+ fuid_dirtied = zfsvfs->z_fuid_dirty;
+
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_zap(tx, *objp ? *objp : DMU_NEW_OBJECT, B_TRUE, NULL);
+ if (*objp == 0) {
+ dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE,
+ zfs_userquota_prop_prefixes[type]);
+ }
+ if (fuid_dirtied)
+ zfs_fuid_txhold(zfsvfs, tx);
+ err = dmu_tx_assign(tx, TXG_WAIT);
+ if (err) {
+ dmu_tx_abort(tx);
+ return (err);
+ }
+
+ mutex_enter(&zfsvfs->z_lock);
+ if (*objp == 0) {
+ *objp = zap_create(zfsvfs->z_os, DMU_OT_USERGROUP_QUOTA,
+ DMU_OT_NONE, 0, tx);
+ VERIFY(0 == zap_add(zfsvfs->z_os, MASTER_NODE_OBJ,
+ zfs_userquota_prop_prefixes[type], 8, 1, objp, tx));
+ }
+ mutex_exit(&zfsvfs->z_lock);
+
+ if (quota == 0) {
+ err = zap_remove(zfsvfs->z_os, *objp, buf, tx);
+ if (err == ENOENT)
+ err = 0;
+ } else {
+ err = zap_update(zfsvfs->z_os, *objp, buf, 8, 1, &quota, tx);
+ }
+ ASSERT(err == 0);
+ if (fuid_dirtied)
+ zfs_fuid_sync(zfsvfs, tx);
+ dmu_tx_commit(tx);
+ return (err);
+}
+
+boolean_t
+zfs_id_overobjquota(zfsvfs_t *zfsvfs, uint64_t usedobj, uint64_t id)
+{
+ char buf[20 + DMU_OBJACCT_PREFIX_LEN];
+ uint64_t used, quota, quotaobj;
+ int err;
+
+ if (!dmu_objset_userobjspace_present(zfsvfs->z_os)) {
+ if (dmu_objset_userobjspace_upgradable(zfsvfs->z_os)) {
+ dsl_pool_config_enter(
+ dmu_objset_pool(zfsvfs->z_os), FTAG);
+ dmu_objset_id_quota_upgrade(zfsvfs->z_os);
+ dsl_pool_config_exit(
+ dmu_objset_pool(zfsvfs->z_os), FTAG);
+ }
+ return (B_FALSE);
+ }
+
+ if (usedobj == DMU_PROJECTUSED_OBJECT) {
+ if (!dmu_objset_projectquota_present(zfsvfs->z_os)) {
+ if (dmu_objset_projectquota_upgradable(zfsvfs->z_os)) {
+ dsl_pool_config_enter(
+ dmu_objset_pool(zfsvfs->z_os), FTAG);
+ dmu_objset_id_quota_upgrade(zfsvfs->z_os);
+ dsl_pool_config_exit(
+ dmu_objset_pool(zfsvfs->z_os), FTAG);
+ }
+ return (B_FALSE);
+ }
+ quotaobj = zfsvfs->z_projectobjquota_obj;
+ } else if (usedobj == DMU_USERUSED_OBJECT) {
+ quotaobj = zfsvfs->z_userobjquota_obj;
+ } else if (usedobj == DMU_GROUPUSED_OBJECT) {
+ quotaobj = zfsvfs->z_groupobjquota_obj;
+ } else {
+ return (B_FALSE);
+ }
+ if (quotaobj == 0 || zfsvfs->z_replay)
+ return (B_FALSE);
+
+ (void) sprintf(buf, "%llx", (longlong_t)id);
+ err = zap_lookup(zfsvfs->z_os, quotaobj, buf, 8, 1, &quota);
+ if (err != 0)
+ return (B_FALSE);
+
+ (void) sprintf(buf, DMU_OBJACCT_PREFIX "%llx", (longlong_t)id);
+ err = zap_lookup(zfsvfs->z_os, usedobj, buf, 8, 1, &used);
+ if (err != 0)
+ return (B_FALSE);
+ return (used >= quota);
+}
+
+boolean_t
+zfs_id_overblockquota(zfsvfs_t *zfsvfs, uint64_t usedobj, uint64_t id)
+{
+ char buf[20];
+ uint64_t used, quota, quotaobj;
+ int err;
+
+ if (usedobj == DMU_PROJECTUSED_OBJECT) {
+ if (!dmu_objset_projectquota_present(zfsvfs->z_os)) {
+ if (dmu_objset_projectquota_upgradable(zfsvfs->z_os)) {
+ dsl_pool_config_enter(
+ dmu_objset_pool(zfsvfs->z_os), FTAG);
+ dmu_objset_id_quota_upgrade(zfsvfs->z_os);
+ dsl_pool_config_exit(
+ dmu_objset_pool(zfsvfs->z_os), FTAG);
+ }
+ return (B_FALSE);
+ }
+ quotaobj = zfsvfs->z_projectquota_obj;
+ } else if (usedobj == DMU_USERUSED_OBJECT) {
+ quotaobj = zfsvfs->z_userquota_obj;
+ } else if (usedobj == DMU_GROUPUSED_OBJECT) {
+ quotaobj = zfsvfs->z_groupquota_obj;
+ } else {
+ return (B_FALSE);
+ }
+ if (quotaobj == 0 || zfsvfs->z_replay)
+ return (B_FALSE);
+
+ (void) sprintf(buf, "%llx", (longlong_t)id);
+ err = zap_lookup(zfsvfs->z_os, quotaobj, buf, 8, 1, &quota);
+ if (err != 0)
+ return (B_FALSE);
+
+ err = zap_lookup(zfsvfs->z_os, usedobj, buf, 8, 1, &used);
+ if (err != 0)
+ return (B_FALSE);
+ return (used >= quota);
+}
+
+boolean_t
+zfs_id_overquota(zfsvfs_t *zfsvfs, uint64_t usedobj, uint64_t id)
+{
+ return (zfs_id_overblockquota(zfsvfs, usedobj, id) ||
+ zfs_id_overobjquota(zfsvfs, usedobj, id));
+}
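+
+/*
+ * Example of the ZAP key naming used above (illustrative, not part of the
+ * original change): for id 1000, block quota and usage are looked up under
+ * the hex key "3e8", while object-count usage lives under the
+ * DMU_OBJACCT_PREFIX'd name "obj-3e8".  A missing key in either ZAP means
+ * no quota or no usage recorded, so the check reports not-over-quota.
+ */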
+
+/*
+ * Associate this zfsvfs with the given objset, which must be owned.
+ * This will cache a bunch of on-disk state from the objset in the
+ * zfsvfs.
+ */
+static int
+zfsvfs_init(zfsvfs_t *zfsvfs, objset_t *os)
+{
+ int error;
+ uint64_t val;
+
+ zfsvfs->z_max_blksz = SPA_OLD_MAXBLOCKSIZE;
+ zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE;
+ zfsvfs->z_os = os;
+
+ error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version);
+ if (error != 0)
+ return (error);
+ if (zfsvfs->z_version >
+ zfs_zpl_version_map(spa_version(dmu_objset_spa(os)))) {
+ (void) printk("Can't mount a version %lld file system "
+ "on a version %lld pool\n. Pool must be upgraded to mount "
+ "this file system.\n", (u_longlong_t)zfsvfs->z_version,
+ (u_longlong_t)spa_version(dmu_objset_spa(os)));
+ return (SET_ERROR(ENOTSUP));
+ }
+ error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &val);
+ if (error != 0)
+ return (error);
+ zfsvfs->z_norm = (int)val;
+
+ error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &val);
+ if (error != 0)
+ return (error);
+ zfsvfs->z_utf8 = (val != 0);
+
+ error = zfs_get_zplprop(os, ZFS_PROP_CASE, &val);
+ if (error != 0)
+ return (error);
+ zfsvfs->z_case = (uint_t)val;
+
+ if ((error = zfs_get_zplprop(os, ZFS_PROP_ACLTYPE, &val)) != 0)
+ return (error);
+ zfsvfs->z_acl_type = (uint_t)val;
+
+ /*
+ * Fold case on file systems that are always or sometimes case
+ * insensitive.
+ */
+ if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
+ zfsvfs->z_case == ZFS_CASE_MIXED)
+ zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER;
+
+ zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
+ zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);
+
+ uint64_t sa_obj = 0;
+ if (zfsvfs->z_use_sa) {
+ /* should either have both of these objects or none */
+ error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1,
+ &sa_obj);
+ if (error != 0)
+ return (error);
+
+ error = zfs_get_zplprop(os, ZFS_PROP_XATTR, &val);
+ if ((error == 0) && (val == ZFS_XATTR_SA))
+ zfsvfs->z_xattr_sa = B_TRUE;
+ }
+
+ error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1,
+ &zfsvfs->z_root);
+ if (error != 0)
+ return (error);
+ ASSERT(zfsvfs->z_root != 0);
+
+ error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1,
+ &zfsvfs->z_unlinkedobj);
+ if (error != 0)
+ return (error);
+
+ error = zap_lookup(os, MASTER_NODE_OBJ,
+ zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA],
+ 8, 1, &zfsvfs->z_userquota_obj);
+ if (error == ENOENT)
+ zfsvfs->z_userquota_obj = 0;
+ else if (error != 0)
+ return (error);
+
+ error = zap_lookup(os, MASTER_NODE_OBJ,
+ zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA],
+ 8, 1, &zfsvfs->z_groupquota_obj);
+ if (error == ENOENT)
+ zfsvfs->z_groupquota_obj = 0;
+ else if (error != 0)
+ return (error);
+
+ error = zap_lookup(os, MASTER_NODE_OBJ,
+ zfs_userquota_prop_prefixes[ZFS_PROP_PROJECTQUOTA],
+ 8, 1, &zfsvfs->z_projectquota_obj);
+ if (error == ENOENT)
+ zfsvfs->z_projectquota_obj = 0;
+ else if (error != 0)
+ return (error);
+
+ error = zap_lookup(os, MASTER_NODE_OBJ,
+ zfs_userquota_prop_prefixes[ZFS_PROP_USEROBJQUOTA],
+ 8, 1, &zfsvfs->z_userobjquota_obj);
+ if (error == ENOENT)
+ zfsvfs->z_userobjquota_obj = 0;
+ else if (error != 0)
+ return (error);
+
+ error = zap_lookup(os, MASTER_NODE_OBJ,
+ zfs_userquota_prop_prefixes[ZFS_PROP_GROUPOBJQUOTA],
+ 8, 1, &zfsvfs->z_groupobjquota_obj);
+ if (error == ENOENT)
+ zfsvfs->z_groupobjquota_obj = 0;
+ else if (error != 0)
+ return (error);
+
+ error = zap_lookup(os, MASTER_NODE_OBJ,
+ zfs_userquota_prop_prefixes[ZFS_PROP_PROJECTOBJQUOTA],
+ 8, 1, &zfsvfs->z_projectobjquota_obj);
+ if (error == ENOENT)
+ zfsvfs->z_projectobjquota_obj = 0;
+ else if (error != 0)
+ return (error);
+
+ error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1,
+ &zfsvfs->z_fuid_obj);
+ if (error == ENOENT)
+ zfsvfs->z_fuid_obj = 0;
+ else if (error != 0)
+ return (error);
+
+ error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SHARES_DIR, 8, 1,
+ &zfsvfs->z_shares_dir);
+ if (error == ENOENT)
+ zfsvfs->z_shares_dir = 0;
+ else if (error != 0)
+ return (error);
+
+ error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END,
+ &zfsvfs->z_attr_table);
+ if (error != 0)
+ return (error);
+
+ if (zfsvfs->z_version >= ZPL_VERSION_SA)
+ sa_register_update_callback(os, zfs_sa_upgrade);
+
+ return (0);
+}
+
+int
+zfsvfs_create(const char *osname, boolean_t readonly, zfsvfs_t **zfvp)
+{
+ objset_t *os;
+ zfsvfs_t *zfsvfs;
+ int error;
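+	/* Snapshots (names containing '@') are always mounted read-only. */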
+ boolean_t ro = (readonly || (strchr(osname, '@') != NULL));
+
+ zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
+
+ error = dmu_objset_own(osname, DMU_OST_ZFS, ro, B_TRUE, zfsvfs, &os);
+ if (error != 0) {
+ kmem_free(zfsvfs, sizeof (zfsvfs_t));
+ return (error);
+ }
+
+ error = zfsvfs_create_impl(zfvp, zfsvfs, os);
+ if (error != 0) {
+ dmu_objset_disown(os, B_TRUE, zfsvfs);
+ }
+ return (error);
+}
+
+
+/*
+ * Note: zfsvfs is assumed to be malloc'd, and will be freed by this function
+ * on a failure. Do not pass in a statically allocated zfsvfs.
+ */
+int
+zfsvfs_create_impl(zfsvfs_t **zfvp, zfsvfs_t *zfsvfs, objset_t *os)
+{
+ int error;
+
+ zfsvfs->z_vfs = NULL;
+ zfsvfs->z_sb = NULL;
+ zfsvfs->z_parent = zfsvfs;
+
+ mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL);
+ list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
+ offsetof(znode_t, z_link_node));
+ rrm_init(&zfsvfs->z_teardown_lock, B_FALSE);
+ rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL);
+ rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL);
+
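+	/*
+	 * Size the znode hold hash as the largest power of two that does
+	 * not exceed zfs_object_mutex_size, capped at ZFS_OBJ_MTX_MAX.
+	 */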
+ int size = MIN(1 << (highbit64(zfs_object_mutex_size) - 1),
+ ZFS_OBJ_MTX_MAX);
+ zfsvfs->z_hold_size = size;
+ zfsvfs->z_hold_trees = vmem_zalloc(sizeof (avl_tree_t) * size,
+ KM_SLEEP);
+ zfsvfs->z_hold_locks = vmem_zalloc(sizeof (kmutex_t) * size, KM_SLEEP);
+ for (int i = 0; i != size; i++) {
+ avl_create(&zfsvfs->z_hold_trees[i], zfs_znode_hold_compare,
+ sizeof (znode_hold_t), offsetof(znode_hold_t, zh_node));
+ mutex_init(&zfsvfs->z_hold_locks[i], NULL, MUTEX_DEFAULT, NULL);
+ }
+
+ error = zfsvfs_init(zfsvfs, os);
+ if (error != 0) {
+ *zfvp = NULL;
+ zfsvfs_free(zfsvfs);
+ return (error);
+ }
+
+ zfsvfs->z_drain_task = TASKQID_INVALID;
+ zfsvfs->z_draining = B_FALSE;
+ zfsvfs->z_drain_cancel = B_TRUE;
+
+ *zfvp = zfsvfs;
+ return (0);
+}
+
+static int
+zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting)
+{
+ int error;
+ boolean_t readonly = zfs_is_readonly(zfsvfs);
+
+ error = zfs_register_callbacks(zfsvfs->z_vfs);
+ if (error)
+ return (error);
+
+ zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data);
+
+ /*
+	 * If we are not mounting (i.e. online recv), then we don't
+ * have to worry about replaying the log as we blocked all
+ * operations out since we closed the ZIL.
+ */
+ if (mounting) {
+ ASSERT3P(zfsvfs->z_kstat.dk_kstats, ==, NULL);
+ dataset_kstats_create(&zfsvfs->z_kstat, zfsvfs->z_os);
+
+ /*
+ * During replay we remove the read only flag to
+ * allow replays to succeed.
+ */
+ if (readonly != 0) {
+ readonly_changed_cb(zfsvfs, B_FALSE);
+ } else {
+				zap_stats_t zs;
+				if (zap_get_stats(zfsvfs->z_os,
+				    zfsvfs->z_unlinkedobj, &zs) == 0) {
+					dataset_kstats_update_nunlinks_kstat(
+					    &zfsvfs->z_kstat,
+					    zs.zs_num_entries);
+					dprintf_ds(
+					    zfsvfs->z_os->os_dsl_dataset,
+					    "num_entries in unlinked set: "
+					    "%llu", zs.zs_num_entries);
+				}
+ zfs_unlinked_drain(zfsvfs);
+ }
+
+ /*
+ * Parse and replay the intent log.
+ *
+ * Because of ziltest, this must be done after
+ * zfs_unlinked_drain(). (Further note: ziltest
+ * doesn't use readonly mounts, where
+ * zfs_unlinked_drain() isn't called.) This is because
+ * ziltest causes spa_sync() to think it's committed,
+ * but actually it is not, so the intent log contains
+ * many txg's worth of changes.
+ *
+ * In particular, if object N is in the unlinked set in
+ * the last txg to actually sync, then it could be
+ * actually freed in a later txg and then reallocated
+ * in a yet later txg. This would write a "create
+ * object N" record to the intent log. Normally, this
+ * would be fine because the spa_sync() would have
+ * written out the fact that object N is free, before
+ * we could write the "create object N" intent log
+ * record.
+ *
+ * But when we are in ziltest mode, we advance the "open
+ * txg" without actually spa_sync()-ing the changes to
+ * disk. So we would see that object N is still
+ * allocated and in the unlinked set, and there is an
+ * intent log record saying to allocate it.
+ */
+ if (spa_writeable(dmu_objset_spa(zfsvfs->z_os))) {
+ if (zil_replay_disable) {
+ zil_destroy(zfsvfs->z_log, B_FALSE);
+ } else {
+ zfsvfs->z_replay = B_TRUE;
+ zil_replay(zfsvfs->z_os, zfsvfs,
+ zfs_replay_vector);
+ zfsvfs->z_replay = B_FALSE;
+ }
+ }
+
+ /* restore readonly bit */
+ if (readonly != 0)
+ readonly_changed_cb(zfsvfs, B_TRUE);
+ }
+
+ /*
+ * Set the objset user_ptr to track its zfsvfs.
+ */
+ mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
+ dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
+ mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);
+
+ return (0);
+}
+
+void
+zfsvfs_free(zfsvfs_t *zfsvfs)
+{
+ int i, size = zfsvfs->z_hold_size;
+
+ zfs_fuid_destroy(zfsvfs);
+
+ mutex_destroy(&zfsvfs->z_znodes_lock);
+ mutex_destroy(&zfsvfs->z_lock);
+ list_destroy(&zfsvfs->z_all_znodes);
+ rrm_destroy(&zfsvfs->z_teardown_lock);
+ rw_destroy(&zfsvfs->z_teardown_inactive_lock);
+ rw_destroy(&zfsvfs->z_fuid_lock);
+ for (i = 0; i != size; i++) {
+ avl_destroy(&zfsvfs->z_hold_trees[i]);
+ mutex_destroy(&zfsvfs->z_hold_locks[i]);
+ }
+ vmem_free(zfsvfs->z_hold_trees, sizeof (avl_tree_t) * size);
+ vmem_free(zfsvfs->z_hold_locks, sizeof (kmutex_t) * size);
+ zfsvfs_vfs_free(zfsvfs->z_vfs);
+ dataset_kstats_destroy(&zfsvfs->z_kstat);
+ kmem_free(zfsvfs, sizeof (zfsvfs_t));
+}
+
+static void
+zfs_set_fuid_feature(zfsvfs_t *zfsvfs)
+{
+ zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
+ zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);
+}
+
+void
+zfs_unregister_callbacks(zfsvfs_t *zfsvfs)
+{
+ objset_t *os = zfsvfs->z_os;
+
+ if (!dmu_objset_is_snapshot(os))
+ dsl_prop_unregister_all(dmu_objset_ds(os), zfsvfs);
+}
+
+#ifdef HAVE_MLSLABEL
+/*
+ * Check that the hex label string is appropriate for the dataset being
+ * mounted into the global_zone proper.
+ *
+ * Return an error if the hex label string is not default or
+ * admin_low/admin_high. For admin_low labels, the corresponding
+ * dataset must be readonly.
+ */
+int
+zfs_check_global_label(const char *dsname, const char *hexsl)
+{
+ if (strcasecmp(hexsl, ZFS_MLSLABEL_DEFAULT) == 0)
+ return (0);
+ if (strcasecmp(hexsl, ADMIN_HIGH) == 0)
+ return (0);
+ if (strcasecmp(hexsl, ADMIN_LOW) == 0) {
+ /* must be readonly */
+ uint64_t rdonly;
+
+ if (dsl_prop_get_integer(dsname,
+ zfs_prop_to_name(ZFS_PROP_READONLY), &rdonly, NULL))
+ return (SET_ERROR(EACCES));
+		return (rdonly ? 0 : SET_ERROR(EACCES));
+ }
+ return (SET_ERROR(EACCES));
+}
+#endif /* HAVE_MLSLABEL */
+
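+/*
+ * Fill in the statfs fields that are constrained by project quotas:
+ * block counts from the project block quota and file counts from the
+ * project object quota, when either is set for zp's project ID.
+ */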
+static int
+zfs_statfs_project(zfsvfs_t *zfsvfs, znode_t *zp, struct kstatfs *statp,
+ uint32_t bshift)
+{
+ char buf[20 + DMU_OBJACCT_PREFIX_LEN];
+ uint64_t offset = DMU_OBJACCT_PREFIX_LEN;
+ uint64_t quota;
+ uint64_t used;
+ int err;
+
+ strlcpy(buf, DMU_OBJACCT_PREFIX, DMU_OBJACCT_PREFIX_LEN + 1);
+ err = id_to_fuidstr(zfsvfs, NULL, zp->z_projid, buf + offset, B_FALSE);
+ if (err)
+ return (err);
+
+ if (zfsvfs->z_projectquota_obj == 0)
+ goto objs;
+
+ err = zap_lookup(zfsvfs->z_os, zfsvfs->z_projectquota_obj,
+ buf + offset, 8, 1, &quota);
+ if (err == ENOENT)
+ goto objs;
+ else if (err)
+ return (err);
+
+ err = zap_lookup(zfsvfs->z_os, DMU_PROJECTUSED_OBJECT,
+ buf + offset, 8, 1, &used);
+ if (unlikely(err == ENOENT)) {
+ uint32_t blksize;
+ u_longlong_t nblocks;
+
+ /*
+			 * Quota accounting is asynchronous, so we may race
+			 * with it here; there is at least one object with the
+			 * given project ID.
+ */
+ sa_object_size(zp->z_sa_hdl, &blksize, &nblocks);
+ if (unlikely(zp->z_blksz == 0))
+ blksize = zfsvfs->z_max_blksz;
+
+ used = blksize * nblocks;
+ } else if (err) {
+ return (err);
+ }
+
+ statp->f_blocks = quota >> bshift;
+ statp->f_bfree = (quota > used) ? ((quota - used) >> bshift) : 0;
+ statp->f_bavail = statp->f_bfree;
+
+objs:
+ if (zfsvfs->z_projectobjquota_obj == 0)
+ return (0);
+
+ err = zap_lookup(zfsvfs->z_os, zfsvfs->z_projectobjquota_obj,
+ buf + offset, 8, 1, &quota);
+ if (err == ENOENT)
+ return (0);
+ else if (err)
+ return (err);
+
+ err = zap_lookup(zfsvfs->z_os, DMU_PROJECTUSED_OBJECT,
+ buf, 8, 1, &used);
+ if (unlikely(err == ENOENT)) {
+ /*
+		 * Quota accounting is asynchronous, so we may race with it
+		 * here; there is at least one object with the given project ID.
+ */
+ used = 1;
+ } else if (err) {
+ return (err);
+ }
+
+ statp->f_files = quota;
+ statp->f_ffree = (quota > used) ? (quota - used) : 0;
+
+ return (0);
+}
+
+int
+zfs_statvfs(struct dentry *dentry, struct kstatfs *statp)
+{
+ zfsvfs_t *zfsvfs = dentry->d_sb->s_fs_info;
+ uint64_t refdbytes, availbytes, usedobjs, availobjs;
+ int err = 0;
+
+ ZFS_ENTER(zfsvfs);
+
+ dmu_objset_space(zfsvfs->z_os,
+ &refdbytes, &availbytes, &usedobjs, &availobjs);
+
+ uint64_t fsid = dmu_objset_fsid_guid(zfsvfs->z_os);
+ /*
+	 * The underlying storage pool actually uses multiple block
+	 * sizes. Under Solaris, frsize (fragment size) is reported as
+ * the smallest block size we support, and bsize (block size)
+ * as the filesystem's maximum block size. Unfortunately,
+ * under Linux the fragment size and block size are often used
+ * interchangeably. Thus we are forced to report both of them
+ * as the filesystem's maximum block size.
+ */
+ statp->f_frsize = zfsvfs->z_max_blksz;
+ statp->f_bsize = zfsvfs->z_max_blksz;
+ uint32_t bshift = fls(statp->f_bsize) - 1;
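+	/*
+	 * fls() returns the index of the highest set bit, so for a
+	 * power-of-two f_bsize (e.g. 131072) bshift is its log2 (17)
+	 * and byte counts convert to block counts via ">> bshift".
+	 */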
+
+ /*
+ * The following report "total" blocks of various kinds in
+ * the file system, but reported in terms of f_bsize - the
+ * "preferred" size.
+ */
+
+ /* Round up so we never have a filesystem using 0 blocks. */
+ refdbytes = P2ROUNDUP(refdbytes, statp->f_bsize);
+ statp->f_blocks = (refdbytes + availbytes) >> bshift;
+ statp->f_bfree = availbytes >> bshift;
+ statp->f_bavail = statp->f_bfree; /* no root reservation */
+
+ /*
+ * statvfs() should really be called statufs(), because it assumes
+ * static metadata. ZFS doesn't preallocate files, so the best
+ * we can do is report the max that could possibly fit in f_files,
+ * and that minus the number actually used in f_ffree.
+ * For f_ffree, report the smaller of the number of objects available
+ * and the number of blocks (each object will take at least a block).
+ */
+ statp->f_ffree = MIN(availobjs, availbytes >> DNODE_SHIFT);
+ statp->f_files = statp->f_ffree + usedobjs;
+ statp->f_fsid.val[0] = (uint32_t)fsid;
+ statp->f_fsid.val[1] = (uint32_t)(fsid >> 32);
+ statp->f_type = ZFS_SUPER_MAGIC;
+ statp->f_namelen = MAXNAMELEN - 1;
+
+ /*
+ * We have all of 40 characters to stuff a string here.
+ * Is there anything useful we could/should provide?
+ */
+ bzero(statp->f_spare, sizeof (statp->f_spare));
+
+ if (dmu_objset_projectquota_enabled(zfsvfs->z_os) &&
+ dmu_objset_projectquota_present(zfsvfs->z_os)) {
+ znode_t *zp = ITOZ(dentry->d_inode);
+
+ if (zp->z_pflags & ZFS_PROJINHERIT && zp->z_projid &&
+ zpl_is_valid_projid(zp->z_projid))
+ err = zfs_statfs_project(zfsvfs, zp, statp, bshift);
+ }
+
+ ZFS_EXIT(zfsvfs);
+ return (err);
+}
+
+int
+zfs_root(zfsvfs_t *zfsvfs, struct inode **ipp)
+{
+ znode_t *rootzp;
+ int error;
+
+ ZFS_ENTER(zfsvfs);
+
+ error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp);
+ if (error == 0)
+ *ipp = ZTOI(rootzp);
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+#ifdef HAVE_D_PRUNE_ALIASES
+/*
+ * Linux kernels older than 3.1 do not support a per-filesystem shrinker.
+ * To accommodate this we must improvise and manually walk the list of znodes
+ * attempting to prune dentries in order to be able to drop the inodes.
+ *
+ * To avoid scanning the same znodes multiple times they are always rotated
+ * to the end of the z_all_znodes list. New znodes are inserted at the
+ * end of the list so we're always scanning the oldest znodes first.
+ */
+static int
+zfs_prune_aliases(zfsvfs_t *zfsvfs, unsigned long nr_to_scan)
+{
+ znode_t **zp_array, *zp;
+ int max_array = MIN(nr_to_scan, PAGE_SIZE * 8 / sizeof (znode_t *));
+ int objects = 0;
+ int i = 0, j = 0;
+
+ zp_array = kmem_zalloc(max_array * sizeof (znode_t *), KM_SLEEP);
+
+ mutex_enter(&zfsvfs->z_znodes_lock);
+ while ((zp = list_head(&zfsvfs->z_all_znodes)) != NULL) {
+
+ if ((i++ > nr_to_scan) || (j >= max_array))
+ break;
+
+ ASSERT(list_link_active(&zp->z_link_node));
+ list_remove(&zfsvfs->z_all_znodes, zp);
+ list_insert_tail(&zfsvfs->z_all_znodes, zp);
+
+ /* Skip active znodes and .zfs entries */
+ if (MUTEX_HELD(&zp->z_lock) || zp->z_is_ctldir)
+ continue;
+
+ if (igrab(ZTOI(zp)) == NULL)
+ continue;
+
+ zp_array[j] = zp;
+ j++;
+ }
+ mutex_exit(&zfsvfs->z_znodes_lock);
+
+ for (i = 0; i < j; i++) {
+ zp = zp_array[i];
+
+ ASSERT3P(zp, !=, NULL);
+ d_prune_aliases(ZTOI(zp));
+
+ if (atomic_read(&ZTOI(zp)->i_count) == 1)
+ objects++;
+
+ iput(ZTOI(zp));
+ }
+
+ kmem_free(zp_array, max_array * sizeof (znode_t *));
+
+ return (objects);
+}
+#endif /* HAVE_D_PRUNE_ALIASES */
+
+/*
+ * The ARC has requested that the filesystem drop entries from the dentry
+ * and inode caches. This can occur when the ARC needs to free metadata
+ * blocks but can't because they are all pinned by entries in these caches.
+ */
+int
+zfs_prune(struct super_block *sb, unsigned long nr_to_scan, int *objects)
+{
+ zfsvfs_t *zfsvfs = sb->s_fs_info;
+ int error = 0;
+#if defined(HAVE_SHRINK) || defined(HAVE_SPLIT_SHRINKER_CALLBACK)
+ struct shrinker *shrinker = &sb->s_shrink;
+ struct shrink_control sc = {
+ .nr_to_scan = nr_to_scan,
+ .gfp_mask = GFP_KERNEL,
+ };
+#endif
+
+ ZFS_ENTER(zfsvfs);
+
+#if defined(HAVE_SPLIT_SHRINKER_CALLBACK) && \
+ defined(SHRINK_CONTROL_HAS_NID) && \
+ defined(SHRINKER_NUMA_AWARE)
+ if (sb->s_shrink.flags & SHRINKER_NUMA_AWARE) {
+ *objects = 0;
+ for_each_online_node(sc.nid) {
+ *objects += (*shrinker->scan_objects)(shrinker, &sc);
+ }
+ } else {
+ *objects = (*shrinker->scan_objects)(shrinker, &sc);
+ }
+
+#elif defined(HAVE_SPLIT_SHRINKER_CALLBACK)
+ *objects = (*shrinker->scan_objects)(shrinker, &sc);
+#elif defined(HAVE_SHRINK)
+ *objects = (*shrinker->shrink)(shrinker, &sc);
+#elif defined(HAVE_D_PRUNE_ALIASES)
+#define D_PRUNE_ALIASES_IS_DEFAULT
+ *objects = zfs_prune_aliases(zfsvfs, nr_to_scan);
+#else
+#error "No available dentry and inode cache pruning mechanism."
+#endif
+
+#if defined(HAVE_D_PRUNE_ALIASES) && !defined(D_PRUNE_ALIASES_IS_DEFAULT)
+#undef D_PRUNE_ALIASES_IS_DEFAULT
+ /*
+ * Fall back to zfs_prune_aliases if the kernel's per-superblock
+ * shrinker couldn't free anything, possibly due to the inodes being
+ * allocated in a different memcg.
+ */
+ if (*objects == 0)
+ *objects = zfs_prune_aliases(zfsvfs, nr_to_scan);
+#endif
+
+ ZFS_EXIT(zfsvfs);
+
+ dprintf_ds(zfsvfs->z_os->os_dsl_dataset,
+ "pruning, nr_to_scan=%lu objects=%d error=%d\n",
+ nr_to_scan, *objects, error);
+
+ return (error);
+}
+
+/*
+ * Teardown the zfsvfs_t.
+ *
+ * Note, if 'unmounting' is FALSE, we return with the 'z_teardown_lock'
+ * and 'z_teardown_inactive_lock' held.
+ */
+static int
+zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting)
+{
+ znode_t *zp;
+
+ zfs_unlinked_drain_stop_wait(zfsvfs);
+
+ /*
+ * If someone has not already unmounted this file system,
+ * drain the iput_taskq to ensure all active references to the
+	 * zfsvfs_t have been handled; only then can it be safely destroyed.
+ */
+ if (zfsvfs->z_os) {
+ /*
+ * If we're unmounting we have to wait for the list to
+ * drain completely.
+ *
+ * If we're not unmounting there's no guarantee the list
+ * will drain completely, but iputs run from the taskq
+ * may add the parents of dir-based xattrs to the taskq
+ * so we want to wait for these.
+ *
+ * We can safely read z_nr_znodes without locking because the
+ * VFS has already blocked operations which add to the
+ * z_all_znodes list and thus increment z_nr_znodes.
+ */
+ int round = 0;
+ while (zfsvfs->z_nr_znodes > 0) {
+ taskq_wait_outstanding(dsl_pool_iput_taskq(
+ dmu_objset_pool(zfsvfs->z_os)), 0);
+ if (++round > 1 && !unmounting)
+ break;
+ }
+ }
+
+ rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG);
+
+ if (!unmounting) {
+ /*
+ * We purge the parent filesystem's super block as the
+ * parent filesystem and all of its snapshots have their
+ * inode's super block set to the parent's filesystem's
+ * super block. Note, 'z_parent' is self referential
+ * for non-snapshots.
+ */
+ shrink_dcache_sb(zfsvfs->z_parent->z_sb);
+ }
+
+ /*
+ * Close the zil. NB: Can't close the zil while zfs_inactive
+ * threads are blocked as zil_close can call zfs_inactive.
+ */
+ if (zfsvfs->z_log) {
+ zil_close(zfsvfs->z_log);
+ zfsvfs->z_log = NULL;
+ }
+
+ rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_WRITER);
+
+ /*
+	 * If we are not unmounting (i.e. online recv) and someone already
+ * unmounted this file system while we were doing the switcheroo,
+ * or a reopen of z_os failed then just bail out now.
+ */
+ if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) {
+ rw_exit(&zfsvfs->z_teardown_inactive_lock);
+ rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
+ return (SET_ERROR(EIO));
+ }
+
+ /*
+ * At this point there are no VFS ops active, and any new VFS ops
+ * will fail with EIO since we have z_teardown_lock for writer (only
+ * relevant for forced unmount).
+ *
+ * Release all holds on dbufs. We also grab an extra reference to all
+ * the remaining inodes so that the kernel does not attempt to free
+ * any inodes of a suspended fs. This can cause deadlocks since the
+ * zfs_resume_fs() process may involve starting threads, which might
+ * attempt to free unreferenced inodes to free up memory for the new
+ * thread.
+ */
+ if (!unmounting) {
+ mutex_enter(&zfsvfs->z_znodes_lock);
+ for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL;
+ zp = list_next(&zfsvfs->z_all_znodes, zp)) {
+ if (zp->z_sa_hdl)
+ zfs_znode_dmu_fini(zp);
+ if (igrab(ZTOI(zp)) != NULL)
+ zp->z_suspended = B_TRUE;
+
+ }
+ mutex_exit(&zfsvfs->z_znodes_lock);
+ }
+
+ /*
+ * If we are unmounting, set the unmounted flag and let new VFS ops
+ * unblock. zfs_inactive will have the unmounted behavior, and all
+ * other VFS ops will fail with EIO.
+ */
+ if (unmounting) {
+ zfsvfs->z_unmounted = B_TRUE;
+ rw_exit(&zfsvfs->z_teardown_inactive_lock);
+ rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
+ }
+
+	/*
+	 * z_os will be NULL if there was an error in attempting to reopen
+	 * zfsvfs, so just return, as the properties had already been
+	 * unregistered and cached data had been evicted before.
+	 */
+ if (zfsvfs->z_os == NULL)
+ return (0);
+
+ /*
+ * Unregister properties.
+ */
+ zfs_unregister_callbacks(zfsvfs);
+
+ /*
+ * Evict cached data. We must write out any dirty data before
+ * disowning the dataset.
+ */
+ objset_t *os = zfsvfs->z_os;
+ boolean_t os_dirty = B_FALSE;
+ for (int t = 0; t < TXG_SIZE; t++) {
+ if (dmu_objset_is_dirty(os, t)) {
+ os_dirty = B_TRUE;
+ break;
+ }
+ }
+ if (!zfs_is_readonly(zfsvfs) && os_dirty) {
+ txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
+ }
+ dmu_objset_evict_dbufs(zfsvfs->z_os);
+
+ return (0);
+}
+
+#if !defined(HAVE_2ARGS_BDI_SETUP_AND_REGISTER) && \
+ !defined(HAVE_3ARGS_BDI_SETUP_AND_REGISTER)
+atomic_long_t zfs_bdi_seq = ATOMIC_LONG_INIT(0);
+#endif
+
+int
+zfs_domount(struct super_block *sb, zfs_mnt_t *zm, int silent)
+{
+ const char *osname = zm->mnt_osname;
+ struct inode *root_inode;
+ uint64_t recordsize;
+ int error = 0;
+ zfsvfs_t *zfsvfs = NULL;
+ vfs_t *vfs = NULL;
+
+ ASSERT(zm);
+ ASSERT(osname);
+
+ error = zfsvfs_parse_options(zm->mnt_data, &vfs);
+ if (error)
+ return (error);
+
+ error = zfsvfs_create(osname, vfs->vfs_readonly, &zfsvfs);
+ if (error) {
+ zfsvfs_vfs_free(vfs);
+ goto out;
+ }
+
+ if ((error = dsl_prop_get_integer(osname, "recordsize",
+ &recordsize, NULL))) {
+ zfsvfs_vfs_free(vfs);
+ goto out;
+ }
+
+ vfs->vfs_data = zfsvfs;
+ zfsvfs->z_vfs = vfs;
+ zfsvfs->z_sb = sb;
+ sb->s_fs_info = zfsvfs;
+ sb->s_magic = ZFS_SUPER_MAGIC;
+ sb->s_maxbytes = MAX_LFS_FILESIZE;
+ sb->s_time_gran = 1;
+ sb->s_blocksize = recordsize;
+ sb->s_blocksize_bits = ilog2(recordsize);
+
+ error = -zpl_bdi_setup(sb, "zfs");
+ if (error)
+ goto out;
+
+ sb->s_bdi->ra_pages = 0;
+
+ /* Set callback operations for the file system. */
+ sb->s_op = &zpl_super_operations;
+ sb->s_xattr = zpl_xattr_handlers;
+ sb->s_export_op = &zpl_export_operations;
+#ifdef HAVE_S_D_OP
+ sb->s_d_op = &zpl_dentry_operations;
+#endif /* HAVE_S_D_OP */
+
+ /* Set features for file system. */
+ zfs_set_fuid_feature(zfsvfs);
+
+ if (dmu_objset_is_snapshot(zfsvfs->z_os)) {
+ uint64_t pval;
+
+ atime_changed_cb(zfsvfs, B_FALSE);
+ readonly_changed_cb(zfsvfs, B_TRUE);
+ if ((error = dsl_prop_get_integer(osname,
+ "xattr", &pval, NULL)))
+ goto out;
+ xattr_changed_cb(zfsvfs, pval);
+ if ((error = dsl_prop_get_integer(osname,
+ "acltype", &pval, NULL)))
+ goto out;
+ acltype_changed_cb(zfsvfs, pval);
+ zfsvfs->z_issnap = B_TRUE;
+ zfsvfs->z_os->os_sync = ZFS_SYNC_DISABLED;
+ zfsvfs->z_snap_defer_time = jiffies;
+
+ mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
+ dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
+ mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);
+ } else {
+ if ((error = zfsvfs_setup(zfsvfs, B_TRUE)))
+ goto out;
+ }
+
+ /* Allocate a root inode for the filesystem. */
+ error = zfs_root(zfsvfs, &root_inode);
+ if (error) {
+ (void) zfs_umount(sb);
+ goto out;
+ }
+
+ /* Allocate a root dentry for the filesystem */
+ sb->s_root = d_make_root(root_inode);
+ if (sb->s_root == NULL) {
+ (void) zfs_umount(sb);
+ error = SET_ERROR(ENOMEM);
+ goto out;
+ }
+
+ if (!zfsvfs->z_issnap)
+ zfsctl_create(zfsvfs);
+
+ zfsvfs->z_arc_prune = arc_add_prune_callback(zpl_prune_sb, sb);
+out:
+ if (error) {
+ if (zfsvfs != NULL) {
+ dmu_objset_disown(zfsvfs->z_os, B_TRUE, zfsvfs);
+ zfsvfs_free(zfsvfs);
+ }
+ /*
+		 * Make sure we don't leave a dangling sb->s_fs_info, which
+		 * zfs_preumount would otherwise use.
+ */
+ sb->s_fs_info = NULL;
+ }
+
+ return (error);
+}
+
+/*
+ * Called when an unmount is requested and certain sanity checks have
+ * already passed. At this point no dentries or inodes have been reclaimed
+ * from their respective caches. We drop the extra reference on the .zfs
+ * control directory to allow everything to be reclaimed. All snapshots
+ * must already have been unmounted to reach this point.
+ */
+void
+zfs_preumount(struct super_block *sb)
+{
+ zfsvfs_t *zfsvfs = sb->s_fs_info;
+
+ /* zfsvfs is NULL when zfs_domount fails during mount */
+ if (zfsvfs) {
+ zfs_unlinked_drain_stop_wait(zfsvfs);
+ zfsctl_destroy(sb->s_fs_info);
+ /*
+ * Wait for iput_async before entering evict_inodes in
+		 * generic_shutdown_super. We must finish before evict_inodes
+		 * because, when lazytime is on or when zfs_purgedir calls
+		 * zfs_zget, iput would bump i_count from 0 to 1; this races
+		 * with the i_count check in evict_inodes and could destroy
+		 * the inode while we are still using it.
+ *
+ * We wait for two passes. xattr directories in the first pass
+ * may add xattr entries in zfs_purgedir, so in the second pass
+ * we wait for them. We don't use taskq_wait here because it is
+ * a pool wide taskq. Other mounted filesystems can constantly
+ * do iput_async and there's no guarantee when taskq will be
+ * empty.
+ */
+ taskq_wait_outstanding(dsl_pool_iput_taskq(
+ dmu_objset_pool(zfsvfs->z_os)), 0);
+ taskq_wait_outstanding(dsl_pool_iput_taskq(
+ dmu_objset_pool(zfsvfs->z_os)), 0);
+ }
+}
+
+/*
+ * Called once all other unmount-related teardown has occurred.
+ * It is our responsibility to release any remaining infrastructure.
+ */
+/*ARGSUSED*/
+int
+zfs_umount(struct super_block *sb)
+{
+ zfsvfs_t *zfsvfs = sb->s_fs_info;
+ objset_t *os;
+
+ if (zfsvfs->z_arc_prune != NULL)
+ arc_remove_prune_callback(zfsvfs->z_arc_prune);
+ VERIFY(zfsvfs_teardown(zfsvfs, B_TRUE) == 0);
+ os = zfsvfs->z_os;
+ zpl_bdi_destroy(sb);
+
+ /*
+ * z_os will be NULL if there was an error in
+ * attempting to reopen zfsvfs.
+ */
+ if (os != NULL) {
+ /*
+ * Unset the objset user_ptr.
+ */
+ mutex_enter(&os->os_user_ptr_lock);
+ dmu_objset_set_user(os, NULL);
+ mutex_exit(&os->os_user_ptr_lock);
+
+ /*
+ * Finally release the objset
+ */
+ dmu_objset_disown(os, B_TRUE, zfsvfs);
+ }
+
+ zfsvfs_free(zfsvfs);
+ return (0);
+}
+
+int
+zfs_remount(struct super_block *sb, int *flags, zfs_mnt_t *zm)
+{
+ zfsvfs_t *zfsvfs = sb->s_fs_info;
+ vfs_t *vfsp;
+ boolean_t issnap = dmu_objset_is_snapshot(zfsvfs->z_os);
+ int error;
+
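+	/* Snapshots and non-writable pools may only be remounted read-only. */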
+ if ((issnap || !spa_writeable(dmu_objset_spa(zfsvfs->z_os))) &&
+ !(*flags & SB_RDONLY)) {
+ *flags |= SB_RDONLY;
+		return (SET_ERROR(EROFS));
+ }
+
+ error = zfsvfs_parse_options(zm->mnt_data, &vfsp);
+ if (error)
+ return (error);
+
+ if (!zfs_is_readonly(zfsvfs) && (*flags & SB_RDONLY))
+ txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
+
+ zfs_unregister_callbacks(zfsvfs);
+ zfsvfs_vfs_free(zfsvfs->z_vfs);
+
+ vfsp->vfs_data = zfsvfs;
+ zfsvfs->z_vfs = vfsp;
+ if (!issnap)
+ (void) zfs_register_callbacks(vfsp);
+
+ return (error);
+}
+
+int
+zfs_vget(struct super_block *sb, struct inode **ipp, fid_t *fidp)
+{
+ zfsvfs_t *zfsvfs = sb->s_fs_info;
+ znode_t *zp;
+ uint64_t object = 0;
+ uint64_t fid_gen = 0;
+ uint64_t gen_mask;
+ uint64_t zp_gen;
+ int i, err;
+
+ *ipp = NULL;
+
+ if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) {
+ zfid_short_t *zfid = (zfid_short_t *)fidp;
+
+ for (i = 0; i < sizeof (zfid->zf_object); i++)
+ object |= ((uint64_t)zfid->zf_object[i]) << (8 * i);
+
+ for (i = 0; i < sizeof (zfid->zf_gen); i++)
+ fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i);
+ } else {
+ return (SET_ERROR(EINVAL));
+ }
+
+ /* LONG_FID_LEN means snapdirs */
+ if (fidp->fid_len == LONG_FID_LEN) {
+ zfid_long_t *zlfid = (zfid_long_t *)fidp;
+ uint64_t objsetid = 0;
+ uint64_t setgen = 0;
+
+ for (i = 0; i < sizeof (zlfid->zf_setid); i++)
+ objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i);
+
+ for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
+ setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i);
+
+ if (objsetid != ZFSCTL_INO_SNAPDIRS - object) {
+ dprintf("snapdir fid: objsetid (%llu) != "
+ "ZFSCTL_INO_SNAPDIRS (%llu) - object (%llu)\n",
+ objsetid, ZFSCTL_INO_SNAPDIRS, object);
+
+ return (SET_ERROR(EINVAL));
+ }
+
+ if (fid_gen > 1 || setgen != 0) {
+ dprintf("snapdir fid: fid_gen (%llu) and setgen "
+ "(%llu)\n", fid_gen, setgen);
+ return (SET_ERROR(EINVAL));
+ }
+
+ return (zfsctl_snapdir_vget(sb, objsetid, fid_gen, ipp));
+ }
+
+ ZFS_ENTER(zfsvfs);
+ /* A zero fid_gen means we are in the .zfs control directories */
+ if (fid_gen == 0 &&
+ (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) {
+ *ipp = zfsvfs->z_ctldir;
+ ASSERT(*ipp != NULL);
+ if (object == ZFSCTL_INO_SNAPDIR) {
+ VERIFY(zfsctl_root_lookup(*ipp, "snapshot", ipp,
+ 0, kcred, NULL, NULL) == 0);
+ } else {
+ igrab(*ipp);
+ }
+ ZFS_EXIT(zfsvfs);
+ return (0);
+ }
+
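+	/*
+	 * 'i' still holds sizeof (zfid->zf_gen) from the loop above, so
+	 * this mask covers exactly the generation bits that fit in a
+	 * short fid.
+	 */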
+ gen_mask = -1ULL >> (64 - 8 * i);
+
+ dprintf("getting %llu [%llu mask %llx]\n", object, fid_gen, gen_mask);
+ if ((err = zfs_zget(zfsvfs, object, &zp))) {
+ ZFS_EXIT(zfsvfs);
+ return (err);
+ }
+
+ /* Don't export xattr stuff */
+ if (zp->z_pflags & ZFS_XATTR) {
+ iput(ZTOI(zp));
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(ENOENT));
+ }
+
+ (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen,
+ sizeof (uint64_t));
+ zp_gen = zp_gen & gen_mask;
+ if (zp_gen == 0)
+ zp_gen = 1;
+ if ((fid_gen == 0) && (zfsvfs->z_root == object))
+ fid_gen = zp_gen;
+ if (zp->z_unlinked || zp_gen != fid_gen) {
+ dprintf("znode gen (%llu) != fid gen (%llu)\n", zp_gen,
+ fid_gen);
+ iput(ZTOI(zp));
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(ENOENT));
+ }
+
+ *ipp = ZTOI(zp);
+ if (*ipp)
+ zfs_inode_update(ITOZ(*ipp));
+
+ ZFS_EXIT(zfsvfs);
+ return (0);
+}
+
+/*
+ * Block out VFS ops and close zfsvfs_t
+ *
+ * Note, if successful, then we return with the 'z_teardown_lock' and
+ * 'z_teardown_inactive_lock' write held. We leave ownership of the underlying
+ * dataset and objset intact so that they can be atomically handed off during
+ * a subsequent rollback or recv operation and the resume thereafter.
+ */
+int
+zfs_suspend_fs(zfsvfs_t *zfsvfs)
+{
+ int error;
+
+ if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0)
+ return (error);
+
+ return (0);
+}
+
+/*
+ * Rebuild SA and release VOPs. Note that ownership of the underlying dataset
+ * is an invariant across any of the operations that can be performed while the
+ * filesystem was suspended. Whether it succeeded or failed, the preconditions
+ * are the same: the relevant objset and associated dataset are owned by
+ * zfsvfs, held, and long held on entry.
+ */
+int
+zfs_resume_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds)
+{
+ int err, err2;
+ znode_t *zp;
+
+ ASSERT(RRM_WRITE_HELD(&zfsvfs->z_teardown_lock));
+ ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock));
+
+ /*
+ * We already own this, so just update the objset_t, as the one we
+ * had before may have been evicted.
+ */
+ objset_t *os;
+ VERIFY3P(ds->ds_owner, ==, zfsvfs);
+ VERIFY(dsl_dataset_long_held(ds));
+ VERIFY0(dmu_objset_from_ds(ds, &os));
+
+ err = zfsvfs_init(zfsvfs, os);
+ if (err != 0)
+ goto bail;
+
+ VERIFY(zfsvfs_setup(zfsvfs, B_FALSE) == 0);
+
+ zfs_set_fuid_feature(zfsvfs);
+ zfsvfs->z_rollback_time = jiffies;
+
+ /*
+ * Attempt to re-establish all the active inodes with their
+ * dbufs. If a zfs_rezget() fails, then we unhash the inode
+ * and mark it stale. This prevents a collision if a new
+ * inode/object is created which must use the same inode
+ * number. The stale inode will be released when the
+ * VFS prunes the dentry holding the remaining references
+ * on the stale inode.
+ */
+ mutex_enter(&zfsvfs->z_znodes_lock);
+ for (zp = list_head(&zfsvfs->z_all_znodes); zp;
+ zp = list_next(&zfsvfs->z_all_znodes, zp)) {
+ err2 = zfs_rezget(zp);
+ if (err2) {
+ remove_inode_hash(ZTOI(zp));
+ zp->z_is_stale = B_TRUE;
+ }
+
+ /* see comment in zfs_suspend_fs() */
+ if (zp->z_suspended) {
+ zfs_iput_async(ZTOI(zp));
+ zp->z_suspended = B_FALSE;
+ }
+ }
+ mutex_exit(&zfsvfs->z_znodes_lock);
+
+ if (!zfs_is_readonly(zfsvfs) && !zfsvfs->z_unmounted) {
+ /*
+ * zfs_suspend_fs() could have interrupted freeing
+ * of dnodes. We need to restart this freeing so
+ * that we don't "leak" the space.
+ */
+ zfs_unlinked_drain(zfsvfs);
+ }
+
+bail:
+ if (err != 0)
+ zfsvfs->z_unmounted = B_TRUE;
+
+ /* release the VFS ops */
+ rw_exit(&zfsvfs->z_teardown_inactive_lock);
+ rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
+
+ if (err != 0) {
+ /*
+ * Since we couldn't setup the sa framework, try to force
+ * unmount this file system.
+ */
+ if (zfsvfs->z_os)
+ (void) zfs_umount(zfsvfs->z_sb);
+ }
+ return (err);
+}
+
+/*
+ * Release VOPs and unmount a suspended filesystem.
+ */
+int
+zfs_end_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds)
+{
+ ASSERT(RRM_WRITE_HELD(&zfsvfs->z_teardown_lock));
+ ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock));
+
+ /*
+ * We already own this, so just hold and rele it to update the
+ * objset_t, as the one we had before may have been evicted.
+ */
+ objset_t *os;
+ VERIFY3P(ds->ds_owner, ==, zfsvfs);
+ VERIFY(dsl_dataset_long_held(ds));
+ VERIFY0(dmu_objset_from_ds(ds, &os));
+ zfsvfs->z_os = os;
+
+ /* release the VOPs */
+ rw_exit(&zfsvfs->z_teardown_inactive_lock);
+ rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
+
+ /*
+ * Try to force unmount this file system.
+ */
+ (void) zfs_umount(zfsvfs->z_sb);
+ zfsvfs->z_unmounted = B_TRUE;
+ return (0);
+}
+
+int
+zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers)
+{
+ int error;
+ objset_t *os = zfsvfs->z_os;
+ dmu_tx_t *tx;
+
+ if (newvers < ZPL_VERSION_INITIAL || newvers > ZPL_VERSION)
+ return (SET_ERROR(EINVAL));
+
+ if (newvers < zfsvfs->z_version)
+ return (SET_ERROR(EINVAL));
+
+ if (zfs_spa_version_map(newvers) >
+ spa_version(dmu_objset_spa(zfsvfs->z_os)))
+ return (SET_ERROR(ENOTSUP));
+
+ tx = dmu_tx_create(os);
+ dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_FALSE, ZPL_VERSION_STR);
+ if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) {
+ dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE,
+ ZFS_SA_ATTRS);
+		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_FALSE, NULL);
+ }
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ dmu_tx_abort(tx);
+ return (error);
+ }
+
+ error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR,
+ 8, 1, &newvers, tx);
+
+ if (error) {
+ dmu_tx_commit(tx);
+ return (error);
+ }
+
+ if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) {
+ uint64_t sa_obj;
+
+ ASSERT3U(spa_version(dmu_objset_spa(zfsvfs->z_os)), >=,
+ SPA_VERSION_SA);
+ sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE,
+ DMU_OT_NONE, 0, tx);
+
+ error = zap_add(os, MASTER_NODE_OBJ,
+ ZFS_SA_ATTRS, 8, 1, &sa_obj, tx);
+ ASSERT0(error);
+
+		VERIFY0(sa_set_sa_object(os, sa_obj));
+ sa_register_update_callback(os, zfs_sa_upgrade);
+ }
+
+ spa_history_log_internal_ds(dmu_objset_ds(os), "upgrade", tx,
+ "from %llu to %llu", zfsvfs->z_version, newvers);
+
+ dmu_tx_commit(tx);
+
+ zfsvfs->z_version = newvers;
+ os->os_version = newvers;
+
+ zfs_set_fuid_feature(zfsvfs);
+
+ return (0);
+}
+
+/*
+ * Read a property stored within the master node.
+ */
+int
+zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value)
+{
+ uint64_t *cached_copy = NULL;
+
+ /*
+ * Figure out where in the objset_t the cached copy would live, if it
+ * is available for the requested property.
+ */
+ if (os != NULL) {
+ switch (prop) {
+ case ZFS_PROP_VERSION:
+ cached_copy = &os->os_version;
+ break;
+ case ZFS_PROP_NORMALIZE:
+ cached_copy = &os->os_normalization;
+ break;
+ case ZFS_PROP_UTF8ONLY:
+ cached_copy = &os->os_utf8only;
+ break;
+ case ZFS_PROP_CASE:
+ cached_copy = &os->os_casesensitivity;
+ break;
+ default:
+ break;
+ }
+ }
+ if (cached_copy != NULL && *cached_copy != OBJSET_PROP_UNINITIALIZED) {
+ *value = *cached_copy;
+ return (0);
+ }
+
+ /*
+ * If the property wasn't cached, look up the file system's value for
+ * the property. For the version property, we look up a slightly
+ * different string.
+ */
+ const char *pname;
+ int error = ENOENT;
+ if (prop == ZFS_PROP_VERSION)
+ pname = ZPL_VERSION_STR;
+ else
+ pname = zfs_prop_to_name(prop);
+
+ if (os != NULL) {
+ ASSERT3U(os->os_phys->os_type, ==, DMU_OST_ZFS);
+ error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value);
+ }
+
+ if (error == ENOENT) {
+ /* No value set, use the default value */
+ switch (prop) {
+ case ZFS_PROP_VERSION:
+ *value = ZPL_VERSION;
+ break;
+ case ZFS_PROP_NORMALIZE:
+ case ZFS_PROP_UTF8ONLY:
+ *value = 0;
+ break;
+ case ZFS_PROP_CASE:
+ *value = ZFS_CASE_SENSITIVE;
+ break;
+ case ZFS_PROP_ACLTYPE:
+ *value = ZFS_ACLTYPE_OFF;
+ break;
+ default:
+ return (error);
+ }
+ error = 0;
+ }
+
+ /*
+ * If one of the methods for getting the property value above worked,
+ * copy it into the objset_t's cache.
+ */
+ if (error == 0 && cached_copy != NULL) {
+ *cached_copy = *value;
+ }
+
+ return (error);
+}
+
+/*
+ * Return true if the corresponding vfs's unmounted flag is set.
+ * Otherwise return false.
+ * If this function returns true we know VFS unmount has been initiated.
+ */
+boolean_t
+zfs_get_vfs_flag_unmounted(objset_t *os)
+{
+ zfsvfs_t *zfvp;
+ boolean_t unmounted = B_FALSE;
+
+ ASSERT(dmu_objset_type(os) == DMU_OST_ZFS);
+
+ mutex_enter(&os->os_user_ptr_lock);
+ zfvp = dmu_objset_get_user(os);
+ if (zfvp != NULL && zfvp->z_unmounted)
+ unmounted = B_TRUE;
+ mutex_exit(&os->os_user_ptr_lock);
+
+ return (unmounted);
+}
+
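+/*
+ * Helper node used to sort unlinked-set object numbers in an AVL tree
+ * so they can be inserted into the objlist in ascending order.
+ */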
+struct objnode {
+ avl_node_t node;
+ uint64_t obj;
+};
+
+static int
+objnode_compare(const void *o1, const void *o2)
+{
+ const struct objnode *obj1 = o1;
+ const struct objnode *obj2 = o2;
+ if (obj1->obj < obj2->obj)
+ return (-1);
+ if (obj1->obj > obj2->obj)
+ return (1);
+ return (0);
+}
+
+objlist_t *
+zfs_get_deleteq(objset_t *os)
+{
+ objlist_t *deleteq_objlist = objlist_create();
+ uint64_t deleteq_obj;
+ zap_cursor_t zc;
+ zap_attribute_t za;
+ dmu_object_info_t doi;
+
+ ASSERT3U(os->os_phys->os_type, ==, DMU_OST_ZFS);
+ VERIFY0(dmu_object_info(os, MASTER_NODE_OBJ, &doi));
+ ASSERT3U(doi.doi_type, ==, DMU_OT_MASTER_NODE);
+
+ VERIFY0(zap_lookup(os, MASTER_NODE_OBJ,
+ ZFS_UNLINKED_SET, sizeof (uint64_t), 1, &deleteq_obj));
+
+ /*
+ * In order to insert objects into the objlist, they must be in sorted
+ * order. We don't know what order we'll get them out of the ZAP in, so
+ * we insert them into and remove them from an avl_tree_t to sort them.
+ */
+ avl_tree_t at;
+ avl_create(&at, objnode_compare, sizeof (struct objnode),
+ offsetof(struct objnode, node));
+
+ for (zap_cursor_init(&zc, os, deleteq_obj);
+ zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) {
+ struct objnode *obj = kmem_zalloc(sizeof (*obj), KM_SLEEP);
+ obj->obj = za.za_first_integer;
+ avl_add(&at, obj);
+ }
+ zap_cursor_fini(&zc);
+
+ struct objnode *next, *found = avl_first(&at);
+ while (found != NULL) {
+ next = AVL_NEXT(&at, found);
+ objlist_insert(deleteq_objlist, found->obj);
+ found = next;
+ }
+
+ void *cookie = NULL;
+ while ((found = avl_destroy_nodes(&at, &cookie)) != NULL)
+ kmem_free(found, sizeof (*found));
+ avl_destroy(&at);
+ return (deleteq_objlist);
+}
+
+
+void
+zfs_init(void)
+{
+ zfsctl_init();
+ zfs_znode_init();
+ dmu_objset_register_type(DMU_OST_ZFS, zfs_space_delta_cb);
+ register_filesystem(&zpl_fs_type);
+}
+
+void
+zfs_fini(void)
+{
+	/*
+	 * We don't use taskq_wait_outstanding() here because
+	 * zpl_posix_acl_free() might queue additional work.
+	 */
+ taskq_wait(system_delay_taskq);
+ taskq_wait(system_taskq);
+ unregister_filesystem(&zpl_fs_type);
+ zfs_znode_fini();
+ zfsctl_fini();
+}
+
+#if defined(_KERNEL)
+EXPORT_SYMBOL(zfs_suspend_fs);
+EXPORT_SYMBOL(zfs_resume_fs);
+EXPORT_SYMBOL(zfs_userspace_one);
+EXPORT_SYMBOL(zfs_userspace_many);
+EXPORT_SYMBOL(zfs_set_userquota);
+EXPORT_SYMBOL(zfs_id_overblockquota);
+EXPORT_SYMBOL(zfs_id_overobjquota);
+EXPORT_SYMBOL(zfs_id_overquota);
+EXPORT_SYMBOL(zfs_set_version);
+EXPORT_SYMBOL(zfsvfs_create);
+EXPORT_SYMBOL(zfsvfs_free);
+EXPORT_SYMBOL(zfs_is_readonly);
+EXPORT_SYMBOL(zfs_domount);
+EXPORT_SYMBOL(zfs_preumount);
+EXPORT_SYMBOL(zfs_umount);
+EXPORT_SYMBOL(zfs_remount);
+EXPORT_SYMBOL(zfs_statvfs);
+EXPORT_SYMBOL(zfs_vget);
+EXPORT_SYMBOL(zfs_prune);
+#endif
diff --git a/module/os/linux/zfs/zfs_vnops.c b/module/os/linux/zfs/zfs_vnops.c
new file mode 100644
index 000000000..de7b59935
--- /dev/null
+++ b/module/os/linux/zfs/zfs_vnops.c
@@ -0,0 +1,5275 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2015 by Chunwei Chen. All rights reserved.
+ * Copyright 2017 Nexenta Systems, Inc.
+ */
+
+/* Portions Copyright 2007 Jeremy Teo */
+/* Portions Copyright 2010 Robert Milkowski */
+
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/time.h>
+#include <sys/sysmacros.h>
+#include <sys/vfs.h>
+#include <sys/file.h>
+#include <sys/stat.h>
+#include <sys/kmem.h>
+#include <sys/taskq.h>
+#include <sys/uio.h>
+#include <sys/vmsystm.h>
+#include <sys/atomic.h>
+#include <sys/pathname.h>
+#include <sys/cmn_err.h>
+#include <sys/errno.h>
+#include <sys/zfs_dir.h>
+#include <sys/zfs_acl.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/fs/zfs.h>
+#include <sys/dmu.h>
+#include <sys/dmu_objset.h>
+#include <sys/spa.h>
+#include <sys/txg.h>
+#include <sys/dbuf.h>
+#include <sys/zap.h>
+#include <sys/sa.h>
+#include <sys/policy.h>
+#include <sys/sunddi.h>
+#include <sys/sid.h>
+#include <sys/mode.h>
+#include <sys/zfs_ctldir.h>
+#include <sys/zfs_fuid.h>
+#include <sys/zfs_sa.h>
+#include <sys/zfs_vnops.h>
+#include <sys/zfs_rlock.h>
+#include <sys/cred.h>
+#include <sys/zpl.h>
+#include <sys/zil.h>
+#include <sys/sa_impl.h>
+
+/*
+ * Programming rules.
+ *
+ * Each vnode op performs some logical unit of work. To do this, the ZPL must
+ * properly lock its in-core state, create a DMU transaction, do the work,
+ * record this work in the intent log (ZIL), commit the DMU transaction,
+ * and wait for the intent log to commit if it is a synchronous operation.
+ * Moreover, the vnode ops must work in both normal and log replay context.
+ * The ordering of events is important to avoid deadlocks and references
+ * to freed memory. The example below illustrates the following Big Rules:
+ *
+ * (1) A check must be made in each zfs thread for a mounted file system.
+ *      This is done, while avoiding races, via ZFS_ENTER(zfsvfs).
+ * A ZFS_EXIT(zfsvfs) is needed before all returns. Any znodes
+ * must be checked with ZFS_VERIFY_ZP(zp). Both of these macros
+ * can return EIO from the calling function.
+ *
+ * (2) iput() should always be the last thing except for zil_commit()
+ * (if necessary) and ZFS_EXIT(). This is for 3 reasons:
+ * First, if it's the last reference, the vnode/znode
+ * can be freed, so the zp may point to freed memory. Second, the last
+ * reference will call zfs_zinactive(), which may induce a lot of work --
+ * pushing cached pages (which acquires range locks) and syncing out
+ * cached atime changes. Third, zfs_zinactive() may require a new tx,
+ * which could deadlock the system if you were already holding one.
+ * If you must call iput() within a tx then use zfs_iput_async().
+ *
+ * (3) All range locks must be grabbed before calling dmu_tx_assign(),
+ * as they can span dmu_tx_assign() calls.
+ *
+ * (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to
+ * dmu_tx_assign(). This is critical because we don't want to block
+ * while holding locks.
+ *
+ * If no ZPL locks are held (aside from ZFS_ENTER()), use TXG_WAIT. This
+ * reduces lock contention and CPU usage when we must wait (note that if
+ * throughput is constrained by the storage, nearly every transaction
+ * must wait).
+ *
+ * Note, in particular, that if a lock is sometimes acquired before
+ * the tx assigns, and sometimes after (e.g. z_lock), then failing
+ * to use a non-blocking assign can deadlock the system. The scenario:
+ *
+ * Thread A has grabbed a lock before calling dmu_tx_assign().
+ * Thread B is in an already-assigned tx, and blocks for this lock.
+ * Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open()
+ * forever, because the previous txg can't quiesce until B's tx commits.
+ *
+ * If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
+ * then drop all locks, call dmu_tx_wait(), and try again. On subsequent
+ * calls to dmu_tx_assign(), pass TXG_NOTHROTTLE in addition to TXG_NOWAIT,
+ * to indicate that this operation has already called dmu_tx_wait().
+ * This will ensure that we don't retry forever, waiting a short bit
+ * each time.
+ *
+ * (5) If the operation succeeded, generate the intent log entry for it
+ * before dropping locks. This ensures that the ordering of events
+ * in the intent log matches the order in which they actually occurred.
+ * During ZIL replay the zfs_log_* functions will update the sequence
+ * number to indicate the zil transaction has replayed.
+ *
+ * (6) At the end of each vnode op, the DMU tx must always commit,
+ * regardless of whether there were any errors.
+ *
+ * (7) After dropping all locks, invoke zil_commit(zilog, foid)
+ * to ensure that synchronous semantics are provided when necessary.
+ *
+ * In general, this is how things should be ordered in each vnode op:
+ *
+ * ZFS_ENTER(zfsvfs); // exit if unmounted
+ * top:
+ * zfs_dirent_lock(&dl, ...) // lock directory entry (may igrab())
+ * rw_enter(...); // grab any other locks you need
+ * tx = dmu_tx_create(...); // get DMU tx
+ * dmu_tx_hold_*(); // hold each object you might modify
+ * error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
+ * if (error) {
+ * rw_exit(...); // drop locks
+ * zfs_dirent_unlock(dl); // unlock directory entry
+ * iput(...); // release held vnodes
+ * if (error == ERESTART) {
+ * waited = B_TRUE;
+ * dmu_tx_wait(tx);
+ * dmu_tx_abort(tx);
+ * goto top;
+ * }
+ * dmu_tx_abort(tx); // abort DMU tx
+ * ZFS_EXIT(zfsvfs); // finished in zfs
+ * return (error); // really out of space
+ * }
+ * error = do_real_work(); // do whatever this VOP does
+ * if (error == 0)
+ * zfs_log_*(...); // on success, make ZIL entry
+ * dmu_tx_commit(tx); // commit DMU tx -- error or not
+ * rw_exit(...); // drop locks
+ * zfs_dirent_unlock(dl); // unlock directory entry
+ * iput(...); // release held vnodes
+ * zil_commit(zilog, foid); // synchronous when necessary
+ * ZFS_EXIT(zfsvfs); // finished in zfs
+ * return (error); // done, report error
+ */
+
+/*
+ * Virus scanning is unsupported. It would be possible to add a hook
+ * here to perform the required virus scan. This could be done
+ * entirely in the kernel or potentially as an update to invoke a
+ * scanning utility.
+ */
+static int
+zfs_vscan(struct inode *ip, cred_t *cr, int async)
+{
+ return (0);
+}
+
+/* ARGSUSED */
+int
+zfs_open(struct inode *ip, int mode, int flag, cred_t *cr)
+{
+ znode_t *zp = ITOZ(ip);
+ zfsvfs_t *zfsvfs = ITOZSB(ip);
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ /* Honor ZFS_APPENDONLY file attribute */
+ if ((mode & FMODE_WRITE) && (zp->z_pflags & ZFS_APPENDONLY) &&
+ ((flag & O_APPEND) == 0)) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EPERM));
+ }
+
+ /* Virus scan eligible files on open */
+ if (!zfs_has_ctldir(zp) && zfsvfs->z_vscan && S_ISREG(ip->i_mode) &&
+ !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) {
+ if (zfs_vscan(ip, cr, 0) != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EACCES));
+ }
+ }
+
+ /* Keep a count of the synchronous opens in the znode */
+ if (flag & O_SYNC)
+ atomic_inc_32(&zp->z_sync_cnt);
+
+ ZFS_EXIT(zfsvfs);
+ return (0);
+}
+
+/* ARGSUSED */
+int
+zfs_close(struct inode *ip, int flag, cred_t *cr)
+{
+ znode_t *zp = ITOZ(ip);
+ zfsvfs_t *zfsvfs = ITOZSB(ip);
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ /* Decrement the synchronous opens in the znode */
+ if (flag & O_SYNC)
+ atomic_dec_32(&zp->z_sync_cnt);
+
+ if (!zfs_has_ctldir(zp) && zfsvfs->z_vscan && S_ISREG(ip->i_mode) &&
+ !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0)
+ VERIFY(zfs_vscan(ip, cr, 1) == 0);
+
+ ZFS_EXIT(zfsvfs);
+ return (0);
+}
+
+#if defined(SEEK_HOLE) && defined(SEEK_DATA)
+/*
+ * Lseek support for finding holes (cmd == SEEK_HOLE) and
+ * data (cmd == SEEK_DATA). "off" is an in/out parameter.
+ */
+static int
+zfs_holey_common(struct inode *ip, int cmd, loff_t *off)
+{
+ znode_t *zp = ITOZ(ip);
+ uint64_t noff = (uint64_t)*off; /* new offset */
+ uint64_t file_sz;
+ int error;
+ boolean_t hole;
+
+ file_sz = zp->z_size;
+ if (noff >= file_sz) {
+ return (SET_ERROR(ENXIO));
+ }
+
+ if (cmd == SEEK_HOLE)
+ hole = B_TRUE;
+ else
+ hole = B_FALSE;
+
+ error = dmu_offset_next(ZTOZSB(zp)->z_os, zp->z_id, hole, &noff);
+
+ if (error == ESRCH)
+ return (SET_ERROR(ENXIO));
+
+ /* file was dirty, so fall back to using generic logic */
+ if (error == EBUSY) {
+ if (hole)
+ *off = file_sz;
+
+ return (0);
+ }
+
+ /*
+ * We could find a hole that begins after the logical end-of-file,
+ * because dmu_offset_next() only works on whole blocks. If the
+ * EOF falls mid-block, then indicate that the "virtual hole"
+ * at the end of the file begins at the logical EOF, rather than
+ * at the end of the last block.
+ */
+ if (noff > file_sz) {
+ ASSERT(hole);
+ noff = file_sz;
+ }
+
+ if (noff < *off)
+ return (error);
+ *off = noff;
+ return (error);
+}
+
+int
+zfs_holey(struct inode *ip, int cmd, loff_t *off)
+{
+ znode_t *zp = ITOZ(ip);
+ zfsvfs_t *zfsvfs = ITOZSB(ip);
+ int error;
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ error = zfs_holey_common(ip, cmd, off);
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+#endif /* SEEK_HOLE && SEEK_DATA */
+
+#if defined(_KERNEL)
+/*
+ * When a file is memory mapped, we must keep the IO data synchronized
+ * between the DMU cache and the memory mapped pages. What this means:
+ *
+ * On Write: If we find a memory mapped page, we write to *both*
+ * the page and the dmu buffer.
+ */
+static void
+update_pages(struct inode *ip, int64_t start, int len,
+ objset_t *os, uint64_t oid)
+{
+ struct address_space *mp = ip->i_mapping;
+ struct page *pp;
+ uint64_t nbytes;
+ int64_t off;
+ void *pb;
+
+ off = start & (PAGE_SIZE-1);
+ for (start &= PAGE_MASK; len > 0; start += PAGE_SIZE) {
+ nbytes = MIN(PAGE_SIZE - off, len);
+
+ pp = find_lock_page(mp, start >> PAGE_SHIFT);
+ if (pp) {
+ if (mapping_writably_mapped(mp))
+ flush_dcache_page(pp);
+
+ pb = kmap(pp);
+ (void) dmu_read(os, oid, start+off, nbytes, pb+off,
+ DMU_READ_PREFETCH);
+ kunmap(pp);
+
+ if (mapping_writably_mapped(mp))
+ flush_dcache_page(pp);
+
+ mark_page_accessed(pp);
+ SetPageUptodate(pp);
+ ClearPageError(pp);
+ unlock_page(pp);
+ put_page(pp);
+ }
+
+ len -= nbytes;
+ off = 0;
+ }
+}
+
+/*
+ * When a file is memory mapped, we must keep the IO data synchronized
+ * between the DMU cache and the memory mapped pages. What this means:
+ *
+ * On Read: We "read" preferentially from memory mapped pages,
+ *		otherwise we fall back to the dmu buffer.
+ *
+ * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
+ * the file is memory mapped.
+ */
+static int
+mappedread(struct inode *ip, int nbytes, uio_t *uio)
+{
+ struct address_space *mp = ip->i_mapping;
+ struct page *pp;
+ znode_t *zp = ITOZ(ip);
+ int64_t start, off;
+ uint64_t bytes;
+ int len = nbytes;
+ int error = 0;
+ void *pb;
+
+ start = uio->uio_loffset;
+ off = start & (PAGE_SIZE-1);
+ for (start &= PAGE_MASK; len > 0; start += PAGE_SIZE) {
+ bytes = MIN(PAGE_SIZE - off, len);
+
+ pp = find_lock_page(mp, start >> PAGE_SHIFT);
+ if (pp) {
+ ASSERT(PageUptodate(pp));
+ unlock_page(pp);
+
+ pb = kmap(pp);
+ error = uiomove(pb + off, bytes, UIO_READ, uio);
+ kunmap(pp);
+
+ if (mapping_writably_mapped(mp))
+ flush_dcache_page(pp);
+
+ mark_page_accessed(pp);
+ put_page(pp);
+ } else {
+ error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
+ uio, bytes);
+ }
+
+ len -= bytes;
+ off = 0;
+ if (error)
+ break;
+ }
+ return (error);
+}
+#endif /* _KERNEL */
+
+unsigned long zfs_read_chunk_size = 1024 * 1024; /* Tunable */
+unsigned long zfs_delete_blocks = DMU_MAX_DELETEBLKCNT;
+
+/*
+ * Read bytes from specified file into supplied buffer.
+ *
+ * IN: ip - inode of file to be read from.
+ * uio - structure supplying read location, range info,
+ * and return buffer.
+ * ioflag - FSYNC flags; used to provide FRSYNC semantics.
+ * O_DIRECT flag; used to bypass page cache.
+ * cr - credentials of caller.
+ *
+ * OUT: uio - updated offset and range, buffer filled.
+ *
+ * RETURN: 0 on success, error code on failure.
+ *
+ * Side Effects:
+ * inode - atime updated if byte count > 0
+ */
+/* ARGSUSED */
+int
+zfs_read(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
+{
+ int error = 0;
+ boolean_t frsync = B_FALSE;
+
+ znode_t *zp = ITOZ(ip);
+ zfsvfs_t *zfsvfs = ITOZSB(ip);
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ if (zp->z_pflags & ZFS_AV_QUARANTINED) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EACCES));
+ }
+
+ /*
+ * Validate file offset
+ */
+ if (uio->uio_loffset < (offset_t)0) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EINVAL));
+ }
+
+ /*
+ * Fasttrack empty reads
+ */
+ if (uio->uio_resid == 0) {
+ ZFS_EXIT(zfsvfs);
+ return (0);
+ }
+
+#ifdef FRSYNC
+ /*
+ * If we're in FRSYNC mode, sync out this znode before reading it.
+ * Only do this for non-snapshots.
+ *
+ * Some platforms do not support FRSYNC and instead map it
+ * to FSYNC, which results in unnecessary calls to zil_commit. We
+ * only honor FRSYNC requests on platforms which support it.
+ */
+ frsync = !!(ioflag & FRSYNC);
+#endif
+ if (zfsvfs->z_log &&
+ (frsync || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS))
+ zil_commit(zfsvfs->z_log, zp->z_id);
+
+ /*
+ * Lock the range against changes.
+ */
+ locked_range_t *lr = rangelock_enter(&zp->z_rangelock,
+ uio->uio_loffset, uio->uio_resid, RL_READER);
+
+ /*
+ * If we are reading past end-of-file we can skip
+ * to the end; but we might still need to set atime.
+ */
+ if (uio->uio_loffset >= zp->z_size) {
+ error = 0;
+ goto out;
+ }
+
+ ASSERT(uio->uio_loffset < zp->z_size);
+ ssize_t n = MIN(uio->uio_resid, zp->z_size - uio->uio_loffset);
+ ssize_t start_resid = n;
+
+#ifdef HAVE_UIO_ZEROCOPY
+ xuio_t *xuio = NULL;
+ if ((uio->uio_extflg == UIO_XUIO) &&
+ (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) {
+ int nblk;
+ int blksz = zp->z_blksz;
+ uint64_t offset = uio->uio_loffset;
+
+ xuio = (xuio_t *)uio;
+ if ((ISP2(blksz))) {
+ nblk = (P2ROUNDUP(offset + n, blksz) - P2ALIGN(offset,
+ blksz)) / blksz;
+ } else {
+ ASSERT(offset + n <= blksz);
+ nblk = 1;
+ }
+ (void) dmu_xuio_init(xuio, nblk);
+
+ if (vn_has_cached_data(ip)) {
+ /*
+ * For simplicity, we always allocate a full buffer
+ * even if we only expect to read a portion of a block.
+ */
+ while (--nblk >= 0) {
+ (void) dmu_xuio_add(xuio,
+ dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
+ blksz), 0, blksz);
+ }
+ }
+ }
+#endif /* HAVE_UIO_ZEROCOPY */
+
+ while (n > 0) {
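+		/*
+		 * Clamp each pass so the read never crosses a
+		 * zfs_read_chunk_size boundary; P2PHASE() gives the
+		 * offset within the current chunk.
+		 */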
+ ssize_t nbytes = MIN(n, zfs_read_chunk_size -
+ P2PHASE(uio->uio_loffset, zfs_read_chunk_size));
+
+ if (zp->z_is_mapped && !(ioflag & O_DIRECT)) {
+ error = mappedread(ip, nbytes, uio);
+ } else {
+ error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
+ uio, nbytes);
+ }
+
+ if (error) {
+ /* convert checksum errors into IO errors */
+ if (error == ECKSUM)
+ error = SET_ERROR(EIO);
+ break;
+ }
+
+ n -= nbytes;
+ }
+
+ int64_t nread = start_resid - n;
+ dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, nread);
+ task_io_account_read(nread);
+out:
+ rangelock_exit(lr);
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+/*
+ * Write the bytes to a file.
+ *
+ * IN: ip - inode of file to be written to.
+ * uio - structure supplying write location, range info,
+ * and data buffer.
+ * ioflag - FAPPEND flag set if in append mode.
+ * O_DIRECT flag; used to bypass page cache.
+ * cr - credentials of caller.
+ *
+ * OUT: uio - updated offset and range.
+ *
+ * RETURN: 0 if success
+ * error code if failure
+ *
+ * Timestamps:
+ * ip - ctime|mtime updated if byte count > 0
+ */
+
+/* ARGSUSED */
+int
+zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
+{
+ int error = 0;
+ ssize_t start_resid = uio->uio_resid;
+
+ /*
+ * Fasttrack empty write
+ */
+ ssize_t n = start_resid;
+ if (n == 0)
+ return (0);
+
+ rlim64_t limit = uio->uio_limit;
+ if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
+ limit = MAXOFFSET_T;
+
+ znode_t *zp = ITOZ(ip);
+ zfsvfs_t *zfsvfs = ZTOZSB(zp);
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ sa_bulk_attr_t bulk[4];
+ int count = 0;
+ uint64_t mtime[2], ctime[2];
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
+ &zp->z_size, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+ &zp->z_pflags, 8);
+
+ /*
+ * Callers might not be able to detect properly that we are read-only,
+ * so check it explicitly here.
+ */
+ if (zfs_is_readonly(zfsvfs)) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EROFS));
+ }
+
+	/*
+	 * If the file is immutable or read-only, or append-only and we
+	 * are not appending, return EPERM.
+	 */
+ if ((zp->z_pflags & (ZFS_IMMUTABLE | ZFS_READONLY)) ||
+ ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) &&
+ (uio->uio_loffset < zp->z_size))) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EPERM));
+ }
+
+ /*
+ * Validate file offset
+ */
+ offset_t woff = ioflag & FAPPEND ? zp->z_size : uio->uio_loffset;
+ if (woff < 0) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EINVAL));
+ }
+
+ int max_blksz = zfsvfs->z_max_blksz;
+ xuio_t *xuio = NULL;
+
+	/*
+	 * Pre-fault the pages to ensure slow (e.g. NFS) pages
+	 * don't hold up the txg.
+	 * Skip this if the uio contains a loaned arc_buf.
+	 */
+#ifdef HAVE_UIO_ZEROCOPY
+ if ((uio->uio_extflg == UIO_XUIO) &&
+ (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY))
+ xuio = (xuio_t *)uio;
+ else
+#endif
+ if (uio_prefaultpages(MIN(n, max_blksz), uio)) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EFAULT));
+ }
+
+	/*
+	 * If in append mode, set the I/O offset pointer to EOF.
+	 */
+ locked_range_t *lr;
+ if (ioflag & FAPPEND) {
+ /*
+ * Obtain an appending range lock to guarantee file append
+ * semantics. We reset the write offset once we have the lock.
+ */
+ lr = rangelock_enter(&zp->z_rangelock, 0, n, RL_APPEND);
+ woff = lr->lr_offset;
+ if (lr->lr_length == UINT64_MAX) {
+ /*
+ * We overlocked the file because this write will cause
+ * the file block size to increase.
+ * Note that zp_size cannot change with this lock held.
+ */
+ woff = zp->z_size;
+ }
+ uio->uio_loffset = woff;
+ } else {
+ /*
+ * Note that if the file block size will change as a result of
+ * this write, then this range lock will lock the entire file
+ * so that we can re-write the block safely.
+ */
+ lr = rangelock_enter(&zp->z_rangelock, woff, n, RL_WRITER);
+ }
+
+ if (woff >= limit) {
+ rangelock_exit(lr);
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EFBIG));
+ }
+
+ if ((woff + n) > limit || woff > (limit - n))
+ n = limit - woff;
+
+ /* Will this write extend the file length? */
+ int write_eof = (woff + n > zp->z_size);
+
+ uint64_t end_size = MAX(zp->z_size, woff + n);
+ zilog_t *zilog = zfsvfs->z_log;
+#ifdef HAVE_UIO_ZEROCOPY
+ int i_iov = 0;
+ const iovec_t *iovp = uio->uio_iov;
+ ASSERTV(int iovcnt = uio->uio_iovcnt);
+#endif
+
+ /*
+ * Write the file in reasonable size chunks. Each chunk is written
+ * in a separate transaction; this keeps the intent log records small
+ * and allows us to do more fine-grained space accounting.
+ */
+ while (n > 0) {
+ woff = uio->uio_loffset;
+
+ if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT,
+ KUID_TO_SUID(ip->i_uid)) ||
+ zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT,
+ KGID_TO_SGID(ip->i_gid)) ||
+ (zp->z_projid != ZFS_DEFAULT_PROJID &&
+ zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT,
+ zp->z_projid))) {
+ error = SET_ERROR(EDQUOT);
+ break;
+ }
+
+ arc_buf_t *abuf = NULL;
+ const iovec_t *aiov = NULL;
+ if (xuio) {
+#ifdef HAVE_UIO_ZEROCOPY
+ ASSERT(i_iov < iovcnt);
+ ASSERT3U(uio->uio_segflg, !=, UIO_BVEC);
+ aiov = &iovp[i_iov];
+ abuf = dmu_xuio_arcbuf(xuio, i_iov);
+ dmu_xuio_clear(xuio, i_iov);
+ ASSERT((aiov->iov_base == abuf->b_data) ||
+ ((char *)aiov->iov_base - (char *)abuf->b_data +
+ aiov->iov_len == arc_buf_size(abuf)));
+ i_iov++;
+#endif
+ } else if (n >= max_blksz && woff >= zp->z_size &&
+ P2PHASE(woff, max_blksz) == 0 &&
+ zp->z_blksz == max_blksz) {
+ /*
+ * This write covers a full block. "Borrow" a buffer
+ * from the dmu so that we can fill it before we enter
+ * a transaction. This avoids the possibility of
+ * holding up the transaction if the data copy hangs
+ * up on a pagefault (e.g., from an NFS server mapping).
+ */
+ size_t cbytes;
+
+ abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
+ max_blksz);
+ ASSERT(abuf != NULL);
+ ASSERT(arc_buf_size(abuf) == max_blksz);
+ if ((error = uiocopy(abuf->b_data, max_blksz,
+ UIO_WRITE, uio, &cbytes))) {
+ dmu_return_arcbuf(abuf);
+ break;
+ }
+ ASSERT(cbytes == max_blksz);
+ }
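+		/*
+		 * Note that uiocopy() above copies the data without
+		 * advancing the uio; uioskip() below advances it only
+		 * once the borrowed buffer has been written successfully.
+		 */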
+
+ /*
+ * Start a transaction.
+ */
+ dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)sa_get_db(zp->z_sa_hdl);
+ DB_DNODE_ENTER(db);
+ dmu_tx_hold_write_by_dnode(tx, DB_DNODE(db), woff,
+ MIN(n, max_blksz));
+ DB_DNODE_EXIT(db);
+ zfs_sa_upgrade_txholds(tx, zp);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ dmu_tx_abort(tx);
+ if (abuf != NULL)
+ dmu_return_arcbuf(abuf);
+ break;
+ }
+
+ /*
+ * If rangelock_enter() over-locked we grow the blocksize
+ * and then reduce the lock range. This will only happen
+ * on the first iteration since rangelock_reduce() will
+ * shrink down lr_length to the appropriate size.
+ */
+ if (lr->lr_length == UINT64_MAX) {
+ uint64_t new_blksz;
+
+ if (zp->z_blksz > max_blksz) {
+ /*
+ * File's blocksize is already larger than the
+ * "recordsize" property. Only let it grow to
+ * the next power of 2.
+ */
+ ASSERT(!ISP2(zp->z_blksz));
+ new_blksz = MIN(end_size,
+ 1 << highbit64(zp->z_blksz));
+ } else {
+ new_blksz = MIN(end_size, max_blksz);
+ }
+ zfs_grow_blocksize(zp, new_blksz, tx);
+ rangelock_reduce(lr, woff, n);
+ }
+
+ /*
+ * XXX - should we really limit each write to z_max_blksz?
+ * Perhaps we should use SPA_MAXBLOCKSIZE chunks?
+ */
+ ssize_t nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));
+
+ ssize_t tx_bytes;
+ if (abuf == NULL) {
+ tx_bytes = uio->uio_resid;
+ uio->uio_fault_disable = B_TRUE;
+ error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl),
+ uio, nbytes, tx);
+ uio->uio_fault_disable = B_FALSE;
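+			/*
+			 * With fault_disable set above, dmu_write_uio_dbuf()
+			 * returns EFAULT rather than faulting in user pages
+			 * while the transaction is open; in that case commit
+			 * the tx, prefault the pages, and retry this chunk.
+			 */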
+ if (error == EFAULT) {
+ dmu_tx_commit(tx);
+ if (uio_prefaultpages(MIN(n, max_blksz), uio)) {
+ break;
+ }
+ continue;
+ } else if (error != 0) {
+ dmu_tx_commit(tx);
+ break;
+ }
+ tx_bytes -= uio->uio_resid;
+ } else {
+ tx_bytes = nbytes;
+ ASSERT(xuio == NULL || tx_bytes == aiov->iov_len);
+ /*
+ * If this is not a full block write, but we are
+ * extending the file past EOF and this data starts
+ * block-aligned, use assign_arcbuf(). Otherwise,
+ * write via dmu_write().
+ */
+ if (tx_bytes < max_blksz && (!write_eof ||
+ aiov->iov_base != abuf->b_data)) {
+ ASSERT(xuio);
+ dmu_write(zfsvfs->z_os, zp->z_id, woff,
+ /* cppcheck-suppress nullPointer */
+ aiov->iov_len, aiov->iov_base, tx);
+ dmu_return_arcbuf(abuf);
+ xuio_stat_wbuf_copied();
+ } else {
+ ASSERT(xuio || tx_bytes == max_blksz);
+ error = dmu_assign_arcbuf_by_dbuf(
+ sa_get_db(zp->z_sa_hdl), woff, abuf, tx);
+ if (error != 0) {
+ dmu_return_arcbuf(abuf);
+ dmu_tx_commit(tx);
+ break;
+ }
+ }
+ ASSERT(tx_bytes <= uio->uio_resid);
+ uioskip(uio, tx_bytes);
+ }
+ if (tx_bytes && zp->z_is_mapped && !(ioflag & O_DIRECT)) {
+ update_pages(ip, woff,
+ tx_bytes, zfsvfs->z_os, zp->z_id);
+ }
+
+ /*
+ * If we made no progress, we're done. If we made even
+ * partial progress, update the znode and ZIL accordingly.
+ */
+ if (tx_bytes == 0) {
+ (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
+ (void *)&zp->z_size, sizeof (uint64_t), tx);
+ dmu_tx_commit(tx);
+ ASSERT(error != 0);
+ break;
+ }
+
+ /*
+ * Clear Set-UID/Set-GID bits on successful write if not
+ * privileged and at least one of the execute bits is set.
+ *
+ * It would be nice to do this after all writes have
+ * been done, but that would still expose the ISUID/ISGID
+ * to another app after the partial write is committed.
+ *
+ * Note: we don't call zfs_fuid_map_id() here because
+ * user 0 is not an ephemeral uid.
+ */
+ mutex_enter(&zp->z_acl_lock);
+ uint32_t uid = KUID_TO_SUID(ip->i_uid);
+ if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) |
+ (S_IXUSR >> 6))) != 0 &&
+ (zp->z_mode & (S_ISUID | S_ISGID)) != 0 &&
+ secpolicy_vnode_setid_retain(cr,
+ ((zp->z_mode & S_ISUID) != 0 && uid == 0)) != 0) {
+ uint64_t newmode;
+ zp->z_mode &= ~(S_ISUID | S_ISGID);
+ ip->i_mode = newmode = zp->z_mode;
+ (void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs),
+ (void *)&newmode, sizeof (uint64_t), tx);
+ }
+ mutex_exit(&zp->z_acl_lock);
+
+ zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime);
+
+ /*
+ * Update the file size (zp_size) if it has changed;
+ * account for possible concurrent updates.
+ */
+ while ((end_size = zp->z_size) < uio->uio_loffset) {
+ (void) atomic_cas_64(&zp->z_size, end_size,
+ uio->uio_loffset);
+ ASSERT(error == 0);
+ }
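+		/*
+		 * The compare-and-swap loop above lets concurrent writers
+		 * to different ranges race on z_size safely; the size only
+		 * ever moves forward.
+		 */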
+		/*
+		 * If we are replaying and eof is non-zero then force
+		 * the file size to the specified eof. Note that there's
+		 * no concurrency during replay.
+		 */
+ if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0)
+ zp->z_size = zfsvfs->z_replay_eof;
+
+ error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
+
+ zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag,
+ NULL, NULL);
+ dmu_tx_commit(tx);
+
+ if (error != 0)
+ break;
+ ASSERT(tx_bytes == nbytes);
+ n -= nbytes;
+
+ if (!xuio && n > 0) {
+ if (uio_prefaultpages(MIN(n, max_blksz), uio)) {
+ error = EFAULT;
+ break;
+ }
+ }
+ }
+
+ zfs_inode_update(zp);
+ rangelock_exit(lr);
+
+ /*
+ * If we're in replay mode, or we made no progress, return error.
+ * Otherwise, it's at least a partial write, so it's successful.
+ */
+ if (zfsvfs->z_replay || uio->uio_resid == start_resid) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ if (ioflag & (FSYNC | FDSYNC) ||
+ zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ zil_commit(zilog, zp->z_id);
+
+ int64_t nwritten = start_resid - uio->uio_resid;
+ dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, nwritten);
+ task_io_account_write(nwritten);
+
+ ZFS_EXIT(zfsvfs);
+ return (0);
+}
+
+/*
+ * Drop a reference on the passed inode asynchronously. This ensures
+ * that the caller will never drop the last reference on an inode in
+ * the current context. Doing so while holding open a tx could result
+ * in a deadlock if iput_final() re-enters the filesystem code.
+ */
+void
+zfs_iput_async(struct inode *ip)
+{
+ objset_t *os = ITOZSB(ip)->z_os;
+
+ ASSERT(atomic_read(&ip->i_count) > 0);
+ ASSERT(os != NULL);
+
+ if (atomic_read(&ip->i_count) == 1)
+ VERIFY(taskq_dispatch(dsl_pool_iput_taskq(dmu_objset_pool(os)),
+ (task_func_t *)iput, ip, TQ_SLEEP) != TASKQID_INVALID);
+ else
+ iput(ip);
+}
+
+/* ARGSUSED */
+void
+zfs_get_done(zgd_t *zgd, int error)
+{
+ znode_t *zp = zgd->zgd_private;
+
+ if (zgd->zgd_db)
+ dmu_buf_rele(zgd->zgd_db, zgd);
+
+ rangelock_exit(zgd->zgd_lr);
+
+	/*
+	 * Release the inode asynchronously as we currently have the
+	 * txg stopped from syncing.
+	 */
+ zfs_iput_async(ZTOI(zp));
+
+ kmem_free(zgd, sizeof (zgd_t));
+}
+
+#ifdef DEBUG
+static int zil_fault_io = 0;
+#endif
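+/*
+ * zil_fault_io is a debug-only fault injection hook: when set (e.g. from
+ * a debugger), the next indirect write in zfs_get_data() fails with EIO
+ * and the flag self-clears, exercising the ZIL fallback path.
+ */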
+
+/*
+ * Get data to generate a TX_WRITE intent log record.
+ */
+int
+zfs_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio)
+{
+ zfsvfs_t *zfsvfs = arg;
+ objset_t *os = zfsvfs->z_os;
+ znode_t *zp;
+ uint64_t object = lr->lr_foid;
+ uint64_t offset = lr->lr_offset;
+ uint64_t size = lr->lr_length;
+ dmu_buf_t *db;
+ zgd_t *zgd;
+ int error = 0;
+
+ ASSERT3P(lwb, !=, NULL);
+ ASSERT3P(zio, !=, NULL);
+ ASSERT3U(size, !=, 0);
+
+ /*
+ * Nothing to do if the file has been removed
+ */
+ if (zfs_zget(zfsvfs, object, &zp) != 0)
+ return (SET_ERROR(ENOENT));
+ if (zp->z_unlinked) {
+		/*
+		 * Release the inode asynchronously as we currently have the
+		 * txg stopped from syncing.
+		 */
+ zfs_iput_async(ZTOI(zp));
+ return (SET_ERROR(ENOENT));
+ }
+
+ zgd = kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
+ zgd->zgd_lwb = lwb;
+ zgd->zgd_private = zp;
+
+ /*
+ * Write records come in two flavors: immediate and indirect.
+ * For small writes it's cheaper to store the data with the
+ * log record (immediate); for large writes it's cheaper to
+ * sync the data and get a pointer to it (indirect) so that
+ * we don't have to write the data twice.
+ */
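+	/*
+	 * (Which flavor a given write record uses is decided when the
+	 * TX_WRITE record is generated; see zfs_log_write(), the
+	 * zfs_immediate_write_sz tunable, and the logbias property.)
+	 */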
+ if (buf != NULL) { /* immediate write */
+ zgd->zgd_lr = rangelock_enter(&zp->z_rangelock,
+ offset, size, RL_READER);
+ /* test for truncation needs to be done while range locked */
+ if (offset >= zp->z_size) {
+ error = SET_ERROR(ENOENT);
+ } else {
+ error = dmu_read(os, object, offset, size, buf,
+ DMU_READ_NO_PREFETCH);
+ }
+ ASSERT(error == 0 || error == ENOENT);
+ } else { /* indirect write */
+ /*
+		/*
+		 * We have to lock the whole block to ensure that no one
+		 * can change the data while it's being written out and
+		 * its checksum is being calculated. We need to re-check
+		 * the blocksize after we get the lock in case it has
+		 * changed!
+		 */
+ for (;;) {
+ uint64_t blkoff;
+ size = zp->z_blksz;
+ blkoff = ISP2(size) ? P2PHASE(offset, size) : offset;
+ offset -= blkoff;
+ zgd->zgd_lr = rangelock_enter(&zp->z_rangelock,
+ offset, size, RL_READER);
+ if (zp->z_blksz == size)
+ break;
+ offset += blkoff;
+ rangelock_exit(zgd->zgd_lr);
+ }
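+		/*
+		 * The loop above retries because another thread may grow
+		 * the block between reading z_blksz and taking the range
+		 * lock; we only proceed once the two agree.
+		 */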
+ /* test for truncation needs to be done while range locked */
+ if (lr->lr_offset >= zp->z_size)
+ error = SET_ERROR(ENOENT);
+#ifdef DEBUG
+ if (zil_fault_io) {
+ error = SET_ERROR(EIO);
+ zil_fault_io = 0;
+ }
+#endif
+ if (error == 0)
+ error = dmu_buf_hold(os, object, offset, zgd, &db,
+ DMU_READ_NO_PREFETCH);
+
+ if (error == 0) {
+ blkptr_t *bp = &lr->lr_blkptr;
+
+ zgd->zgd_db = db;
+ zgd->zgd_bp = bp;
+
+ ASSERT(db->db_offset == offset);
+ ASSERT(db->db_size == size);
+
+ error = dmu_sync(zio, lr->lr_common.lrc_txg,
+ zfs_get_done, zgd);
+ ASSERT(error || lr->lr_length <= size);
+
+ /*
+ * On success, we need to wait for the write I/O
+ * initiated by dmu_sync() to complete before we can
+ * release this dbuf. We will finish everything up
+ * in the zfs_get_done() callback.
+ */
+ if (error == 0)
+ return (0);
+
+ if (error == EALREADY) {
+ lr->lr_common.lrc_txtype = TX_WRITE2;
+ /*
+ * TX_WRITE2 relies on the data previously
+ * written by the TX_WRITE that caused
+ * EALREADY. We zero out the BP because
+ * it is the old, currently-on-disk BP.
+ */
+ zgd->zgd_bp = NULL;
+ BP_ZERO(bp);
+ error = 0;
+ }
+ }
+ }
+
+ zfs_get_done(zgd, error);
+
+ return (error);
+}
+
+/*ARGSUSED*/
+int
+zfs_access(struct inode *ip, int mode, int flag, cred_t *cr)
+{
+ znode_t *zp = ITOZ(ip);
+ zfsvfs_t *zfsvfs = ITOZSB(ip);
+ int error;
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ if (flag & V_ACE_MASK)
+ error = zfs_zaccess(zp, mode, flag, B_FALSE, cr);
+ else
+ error = zfs_zaccess_rwx(zp, mode, flag, cr);
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+/*
+ * Lookup an entry in a directory, or an extended attribute directory.
+ * If it exists, return a held inode reference for it.
+ *
+ * IN: dip - inode of directory to search.
+ * nm - name of entry to lookup.
+ * flags - LOOKUP_XATTR set if looking for an attribute.
+ * cr - credentials of caller.
+ * direntflags - directory lookup flags
+ * realpnp - returned pathname.
+ *
+ * OUT: ipp - inode of located entry, NULL if not found.
+ *
+ * RETURN: 0 on success, error code on failure.
+ *
+ * Timestamps:
+ * NA
+ */
+/* ARGSUSED */
+int
+zfs_lookup(struct inode *dip, char *nm, struct inode **ipp, int flags,
+ cred_t *cr, int *direntflags, pathname_t *realpnp)
+{
+ znode_t *zdp = ITOZ(dip);
+ zfsvfs_t *zfsvfs = ITOZSB(dip);
+ int error = 0;
+
+	/*
+	 * Fast path lookup. However, we must skip the DNLC lookup
+	 * for case-folding or normalizing lookups because the
+	 * DNLC code only stores the passed-in name. This means
+	 * creating 'a' and removing 'A' on a case-insensitive
+	 * file system would work, but the DNLC would still think
+	 * 'a' exists and won't let you create it again on the next
+	 * pass through the fast path.
+	 */
+ if (!(flags & (LOOKUP_XATTR | FIGNORECASE))) {
+
+ if (!S_ISDIR(dip->i_mode)) {
+ return (SET_ERROR(ENOTDIR));
+ } else if (zdp->z_sa_hdl == NULL) {
+ return (SET_ERROR(EIO));
+ }
+
+ if (nm[0] == 0 || (nm[0] == '.' && nm[1] == '\0')) {
+ error = zfs_fastaccesschk_execute(zdp, cr);
+ if (!error) {
+ *ipp = dip;
+ igrab(*ipp);
+ return (0);
+ }
+ return (error);
+ }
+ }
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zdp);
+
+ *ipp = NULL;
+
+ if (flags & LOOKUP_XATTR) {
+		/*
+		 * We don't allow recursive attributes.
+		 * Maybe someday we will.
+		 */
+ if (zdp->z_pflags & ZFS_XATTR) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EINVAL));
+ }
+
+ if ((error = zfs_get_xattrdir(zdp, ipp, cr, flags))) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ /*
+ * Do we have permission to get into attribute directory?
+ */
+
+ if ((error = zfs_zaccess(ITOZ(*ipp), ACE_EXECUTE, 0,
+ B_FALSE, cr))) {
+ iput(*ipp);
+ *ipp = NULL;
+ }
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ if (!S_ISDIR(dip->i_mode)) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(ENOTDIR));
+ }
+
+ /*
+ * Check accessibility of directory.
+ */
+
+ if ((error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr))) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm),
+ NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EILSEQ));
+ }
+
+ error = zfs_dirlook(zdp, nm, ipp, flags, direntflags, realpnp);
+ if ((error == 0) && (*ipp))
+ zfs_inode_update(ITOZ(*ipp));
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+/*
+ * Attempt to create a new entry in a directory. If the entry
+ * already exists, truncate the file if permissible, else return
+ * an error. Return the ip of the created or trunc'd file.
+ *
+ * IN: dip - inode of directory to put new file entry in.
+ * name - name of new file entry.
+ * vap - attributes of new file.
+ * excl - flag indicating exclusive or non-exclusive mode.
+ * mode - mode to open file with.
+ * cr - credentials of caller.
+ * flag - file flag.
+ * vsecp - ACL to be set
+ *
+ * OUT: ipp - inode of created or trunc'd entry.
+ *
+ * RETURN: 0 on success, error code on failure.
+ *
+ * Timestamps:
+ * dip - ctime|mtime updated if new entry created
+ * ip - ctime|mtime always, atime if new
+ */
+
+/* ARGSUSED */
+int
+zfs_create(struct inode *dip, char *name, vattr_t *vap, int excl,
+ int mode, struct inode **ipp, cred_t *cr, int flag, vsecattr_t *vsecp)
+{
+ znode_t *zp, *dzp = ITOZ(dip);
+ zfsvfs_t *zfsvfs = ITOZSB(dip);
+ zilog_t *zilog;
+ objset_t *os;
+ zfs_dirlock_t *dl;
+ dmu_tx_t *tx;
+ int error;
+ uid_t uid;
+ gid_t gid;
+ zfs_acl_ids_t acl_ids;
+ boolean_t fuid_dirtied;
+ boolean_t have_acl = B_FALSE;
+ boolean_t waited = B_FALSE;
+
+	/*
+	 * If we have an ephemeral id, ACL, or XVATTR then
+	 * make sure the file system is at the proper version.
+	 */
+
+ gid = crgetgid(cr);
+ uid = crgetuid(cr);
+
+ if (zfsvfs->z_use_fuids == B_FALSE &&
+ (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
+ return (SET_ERROR(EINVAL));
+
+ if (name == NULL)
+ return (SET_ERROR(EINVAL));
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(dzp);
+ os = zfsvfs->z_os;
+ zilog = zfsvfs->z_log;
+
+ if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
+ NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EILSEQ));
+ }
+
+ if (vap->va_mask & ATTR_XVATTR) {
+ if ((error = secpolicy_xvattr((xvattr_t *)vap,
+ crgetuid(cr), cr, vap->va_mode)) != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+ }
+
+top:
+ *ipp = NULL;
+ if (*name == '\0') {
+ /*
+ * Null component name refers to the directory itself.
+ */
+ igrab(dip);
+ zp = dzp;
+ dl = NULL;
+ error = 0;
+ } else {
+ /* possible igrab(zp) */
+ int zflg = 0;
+
+ if (flag & FIGNORECASE)
+ zflg |= ZCILOOK;
+
+ error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
+ NULL, NULL);
+ if (error) {
+ if (have_acl)
+ zfs_acl_ids_free(&acl_ids);
+ if (strcmp(name, "..") == 0)
+ error = SET_ERROR(EISDIR);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+ }
+
+ if (zp == NULL) {
+ uint64_t txtype;
+ uint64_t projid = ZFS_DEFAULT_PROJID;
+
+ /*
+ * Create a new file object and update the directory
+ * to reference it.
+ */
+ if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) {
+ if (have_acl)
+ zfs_acl_ids_free(&acl_ids);
+ goto out;
+ }
+
+ /*
+ * We only support the creation of regular files in
+ * extended attribute directories.
+ */
+
+ if ((dzp->z_pflags & ZFS_XATTR) && !S_ISREG(vap->va_mode)) {
+ if (have_acl)
+ zfs_acl_ids_free(&acl_ids);
+ error = SET_ERROR(EINVAL);
+ goto out;
+ }
+
+ if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap,
+ cr, vsecp, &acl_ids)) != 0)
+ goto out;
+ have_acl = B_TRUE;
+
+ if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode))
+ projid = zfs_inherit_projid(dzp);
+ if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, projid)) {
+ zfs_acl_ids_free(&acl_ids);
+ error = SET_ERROR(EDQUOT);
+ goto out;
+ }
+
+ tx = dmu_tx_create(os);
+
+ dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
+ ZFS_SA_BASE_ATTR_SIZE);
+
+ fuid_dirtied = zfsvfs->z_fuid_dirty;
+ if (fuid_dirtied)
+ zfs_fuid_txhold(zfsvfs, tx);
+ dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
+ dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
+ if (!zfsvfs->z_use_sa &&
+ acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
+ 0, acl_ids.z_aclp->z_acl_bytes);
+ }
+
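+		/*
+		 * Assign the tx without blocking: on ERESTART (txg full)
+		 * we drop the dirent lock, wait for the next txg, and
+		 * retry from 'top'. TXG_NOTHROTTLE on the retry skips the
+		 * dirty-data throttle since we have already waited once.
+		 */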
+ error = dmu_tx_assign(tx,
+ (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
+ if (error) {
+ zfs_dirent_unlock(dl);
+ if (error == ERESTART) {
+ waited = B_TRUE;
+ dmu_tx_wait(tx);
+ dmu_tx_abort(tx);
+ goto top;
+ }
+ zfs_acl_ids_free(&acl_ids);
+ dmu_tx_abort(tx);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+ zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
+
+ error = zfs_link_create(dl, zp, tx, ZNEW);
+ if (error != 0) {
+			/*
+			 * Since we failed to add the directory entry for it,
+			 * delete the newly created dnode.
+			 */
+ zfs_znode_delete(zp, tx);
+ remove_inode_hash(ZTOI(zp));
+ zfs_acl_ids_free(&acl_ids);
+ dmu_tx_commit(tx);
+ goto out;
+ }
+
+ if (fuid_dirtied)
+ zfs_fuid_sync(zfsvfs, tx);
+
+ txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
+ if (flag & FIGNORECASE)
+ txtype |= TX_CI;
+ zfs_log_create(zilog, tx, txtype, dzp, zp, name,
+ vsecp, acl_ids.z_fuidp, vap);
+ zfs_acl_ids_free(&acl_ids);
+ dmu_tx_commit(tx);
+ } else {
+ int aflags = (flag & FAPPEND) ? V_APPEND : 0;
+
+ if (have_acl)
+ zfs_acl_ids_free(&acl_ids);
+ have_acl = B_FALSE;
+
+ /*
+ * A directory entry already exists for this name.
+ */
+ /*
+ * Can't truncate an existing file if in exclusive mode.
+ */
+ if (excl) {
+ error = SET_ERROR(EEXIST);
+ goto out;
+ }
+ /*
+ * Can't open a directory for writing.
+ */
+ if (S_ISDIR(ZTOI(zp)->i_mode)) {
+ error = SET_ERROR(EISDIR);
+ goto out;
+ }
+ /*
+ * Verify requested access to file.
+ */
+ if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr))) {
+ goto out;
+ }
+
+ mutex_enter(&dzp->z_lock);
+ dzp->z_seq++;
+ mutex_exit(&dzp->z_lock);
+
+ /*
+ * Truncate regular files if requested.
+ */
+ if (S_ISREG(ZTOI(zp)->i_mode) &&
+ (vap->va_mask & ATTR_SIZE) && (vap->va_size == 0)) {
+ /* we can't hold any locks when calling zfs_freesp() */
+ if (dl) {
+ zfs_dirent_unlock(dl);
+ dl = NULL;
+ }
+ error = zfs_freesp(zp, 0, 0, mode, TRUE);
+ }
+ }
+out:
+
+ if (dl)
+ zfs_dirent_unlock(dl);
+
+ if (error) {
+ if (zp)
+ iput(ZTOI(zp));
+ } else {
+ zfs_inode_update(dzp);
+ zfs_inode_update(zp);
+ *ipp = ZTOI(zp);
+ }
+
+ if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ zil_commit(zilog, 0);
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+/* ARGSUSED */
+int
+zfs_tmpfile(struct inode *dip, vattr_t *vap, int excl,
+ int mode, struct inode **ipp, cred_t *cr, int flag, vsecattr_t *vsecp)
+{
+ znode_t *zp = NULL, *dzp = ITOZ(dip);
+ zfsvfs_t *zfsvfs = ITOZSB(dip);
+ objset_t *os;
+ dmu_tx_t *tx;
+ int error;
+ uid_t uid;
+ gid_t gid;
+ zfs_acl_ids_t acl_ids;
+ uint64_t projid = ZFS_DEFAULT_PROJID;
+ boolean_t fuid_dirtied;
+ boolean_t have_acl = B_FALSE;
+ boolean_t waited = B_FALSE;
+
+	/*
+	 * If we have an ephemeral id, ACL, or XVATTR then
+	 * make sure the file system is at the proper version.
+	 */
+
+ gid = crgetgid(cr);
+ uid = crgetuid(cr);
+
+ if (zfsvfs->z_use_fuids == B_FALSE &&
+ (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
+ return (SET_ERROR(EINVAL));
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(dzp);
+ os = zfsvfs->z_os;
+
+ if (vap->va_mask & ATTR_XVATTR) {
+ if ((error = secpolicy_xvattr((xvattr_t *)vap,
+ crgetuid(cr), cr, vap->va_mode)) != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+ }
+
+top:
+ *ipp = NULL;
+
+ /*
+ * Create a new file object and update the directory
+ * to reference it.
+ */
+ if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) {
+ if (have_acl)
+ zfs_acl_ids_free(&acl_ids);
+ goto out;
+ }
+
+ if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap,
+ cr, vsecp, &acl_ids)) != 0)
+ goto out;
+ have_acl = B_TRUE;
+
+ if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode))
+ projid = zfs_inherit_projid(dzp);
+ if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, projid)) {
+ zfs_acl_ids_free(&acl_ids);
+ error = SET_ERROR(EDQUOT);
+ goto out;
+ }
+
+ tx = dmu_tx_create(os);
+
+ dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
+ ZFS_SA_BASE_ATTR_SIZE);
+ dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
+
+ fuid_dirtied = zfsvfs->z_fuid_dirty;
+ if (fuid_dirtied)
+ zfs_fuid_txhold(zfsvfs, tx);
+ if (!zfsvfs->z_use_sa &&
+ acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
+ 0, acl_ids.z_aclp->z_acl_bytes);
+ }
+ error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
+ if (error) {
+ if (error == ERESTART) {
+ waited = B_TRUE;
+ dmu_tx_wait(tx);
+ dmu_tx_abort(tx);
+ goto top;
+ }
+ zfs_acl_ids_free(&acl_ids);
+ dmu_tx_abort(tx);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+ zfs_mknode(dzp, vap, tx, cr, IS_TMPFILE, &zp, &acl_ids);
+
+ if (fuid_dirtied)
+ zfs_fuid_sync(zfsvfs, tx);
+
+ /* Add to unlinked set */
+ zp->z_unlinked = B_TRUE;
+ zfs_unlinked_add(zp, tx);
+ zfs_acl_ids_free(&acl_ids);
+ dmu_tx_commit(tx);
+out:
+
+ if (error) {
+ if (zp)
+ iput(ZTOI(zp));
+ } else {
+ zfs_inode_update(dzp);
+ zfs_inode_update(zp);
+ *ipp = ZTOI(zp);
+ }
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+/*
+ * Remove an entry from a directory.
+ *
+ * IN: dip - inode of directory to remove entry from.
+ * name - name of entry to remove.
+ * cr - credentials of caller.
+ * flags - case flags.
+ *
+ * RETURN: 0 if success
+ * error code if failure
+ *
+ * Timestamps:
+ * dip - ctime|mtime
+ * ip - ctime (if nlink > 0)
+ */
+
+uint64_t null_xattr = 0;
+
+/*ARGSUSED*/
+int
+zfs_remove(struct inode *dip, char *name, cred_t *cr, int flags)
+{
+ znode_t *zp, *dzp = ITOZ(dip);
+ znode_t *xzp;
+ struct inode *ip;
+ zfsvfs_t *zfsvfs = ITOZSB(dip);
+ zilog_t *zilog;
+ uint64_t acl_obj, xattr_obj;
+ uint64_t xattr_obj_unlinked = 0;
+ uint64_t obj = 0;
+ uint64_t links;
+ zfs_dirlock_t *dl;
+ dmu_tx_t *tx;
+ boolean_t may_delete_now, delete_now = FALSE;
+ boolean_t unlinked, toobig = FALSE;
+ uint64_t txtype;
+ pathname_t *realnmp = NULL;
+ pathname_t realnm;
+ int error;
+ int zflg = ZEXISTS;
+ boolean_t waited = B_FALSE;
+
+ if (name == NULL)
+ return (SET_ERROR(EINVAL));
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(dzp);
+ zilog = zfsvfs->z_log;
+
+ if (flags & FIGNORECASE) {
+ zflg |= ZCILOOK;
+ pn_alloc(&realnm);
+ realnmp = &realnm;
+ }
+
+top:
+ xattr_obj = 0;
+ xzp = NULL;
+ /*
+ * Attempt to lock directory; fail if entry doesn't exist.
+ */
+ if ((error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
+ NULL, realnmp))) {
+ if (realnmp)
+ pn_free(realnmp);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ ip = ZTOI(zp);
+
+ if ((error = zfs_zaccess_delete(dzp, zp, cr))) {
+ goto out;
+ }
+
+ /*
+ * Need to use rmdir for removing directories.
+ */
+ if (S_ISDIR(ip->i_mode)) {
+ error = SET_ERROR(EPERM);
+ goto out;
+ }
+
+ mutex_enter(&zp->z_lock);
+ may_delete_now = atomic_read(&ip->i_count) == 1 && !(zp->z_is_mapped);
+ mutex_exit(&zp->z_lock);
+
+ /*
+ * We may delete the znode now, or we may put it in the unlinked set;
+ * it depends on whether we're the last link, and on whether there are
+ * other holds on the inode. So we dmu_tx_hold() the right things to
+ * allow for either case.
+ */
+ obj = zp->z_id;
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+ zfs_sa_upgrade_txholds(tx, zp);
+ zfs_sa_upgrade_txholds(tx, dzp);
+ if (may_delete_now) {
+ toobig = zp->z_size > zp->z_blksz * zfs_delete_blocks;
+ /* if the file is too big, only hold_free a token amount */
+ dmu_tx_hold_free(tx, zp->z_id, 0,
+ (toobig ? DMU_MAX_ACCESS : DMU_OBJECT_END));
+ }
+
+ /* are there any extended attributes? */
+ error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
+ &xattr_obj, sizeof (xattr_obj));
+ if (error == 0 && xattr_obj) {
+ error = zfs_zget(zfsvfs, xattr_obj, &xzp);
+ ASSERT0(error);
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
+ dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
+ }
+
+ mutex_enter(&zp->z_lock);
+ if ((acl_obj = zfs_external_acl(zp)) != 0 && may_delete_now)
+ dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
+ mutex_exit(&zp->z_lock);
+
+ /* charge as an update -- would be nice not to charge at all */
+ dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
+
+ /*
+ * Mark this transaction as typically resulting in a net free of space
+ */
+ dmu_tx_mark_netfree(tx);
+
+ error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
+ if (error) {
+ zfs_dirent_unlock(dl);
+ if (error == ERESTART) {
+ waited = B_TRUE;
+ dmu_tx_wait(tx);
+ dmu_tx_abort(tx);
+ iput(ip);
+ if (xzp)
+ iput(ZTOI(xzp));
+ goto top;
+ }
+ if (realnmp)
+ pn_free(realnmp);
+ dmu_tx_abort(tx);
+ iput(ip);
+ if (xzp)
+ iput(ZTOI(xzp));
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ /*
+ * Remove the directory entry.
+ */
+ error = zfs_link_destroy(dl, zp, tx, zflg, &unlinked);
+
+ if (error) {
+ dmu_tx_commit(tx);
+ goto out;
+ }
+
+ if (unlinked) {
+ /*
+ * Hold z_lock so that we can make sure that the ACL obj
+ * hasn't changed. Could have been deleted due to
+ * zfs_sa_upgrade().
+ */
+ mutex_enter(&zp->z_lock);
+ (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
+ &xattr_obj_unlinked, sizeof (xattr_obj_unlinked));
+ delete_now = may_delete_now && !toobig &&
+ atomic_read(&ip->i_count) == 1 && !(zp->z_is_mapped) &&
+ xattr_obj == xattr_obj_unlinked && zfs_external_acl(zp) ==
+ acl_obj;
+ }
+
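+	/*
+	 * delete_now means the znode may be freed in this transaction:
+	 * we hold the only reference, the file is neither mmapped nor
+	 * too large, and neither the xattr directory nor the external
+	 * ACL object changed since the tx holds were taken.
+	 */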
+ if (delete_now) {
+ if (xattr_obj_unlinked) {
+ ASSERT3U(ZTOI(xzp)->i_nlink, ==, 2);
+ mutex_enter(&xzp->z_lock);
+ xzp->z_unlinked = B_TRUE;
+ clear_nlink(ZTOI(xzp));
+ links = 0;
+ error = sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs),
+ &links, sizeof (links), tx);
+ ASSERT3U(error, ==, 0);
+ mutex_exit(&xzp->z_lock);
+ zfs_unlinked_add(xzp, tx);
+
+ if (zp->z_is_sa)
+ error = sa_remove(zp->z_sa_hdl,
+ SA_ZPL_XATTR(zfsvfs), tx);
+ else
+ error = sa_update(zp->z_sa_hdl,
+ SA_ZPL_XATTR(zfsvfs), &null_xattr,
+ sizeof (uint64_t), tx);
+ ASSERT0(error);
+ }
+ /*
+ * Add to the unlinked set because a new reference could be
+ * taken concurrently resulting in a deferred destruction.
+ */
+ zfs_unlinked_add(zp, tx);
+ mutex_exit(&zp->z_lock);
+ } else if (unlinked) {
+ mutex_exit(&zp->z_lock);
+ zfs_unlinked_add(zp, tx);
+ }
+
+ txtype = TX_REMOVE;
+ if (flags & FIGNORECASE)
+ txtype |= TX_CI;
+ zfs_log_remove(zilog, tx, txtype, dzp, name, obj, unlinked);
+
+ dmu_tx_commit(tx);
+out:
+ if (realnmp)
+ pn_free(realnmp);
+
+ zfs_dirent_unlock(dl);
+ zfs_inode_update(dzp);
+ zfs_inode_update(zp);
+
+ if (delete_now)
+ iput(ip);
+ else
+ zfs_iput_async(ip);
+
+ if (xzp) {
+ zfs_inode_update(xzp);
+ zfs_iput_async(ZTOI(xzp));
+ }
+
+ if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ zil_commit(zilog, 0);
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+/*
+ * Create a new directory and insert it into dip using the name
+ * provided. Return a pointer to the inserted directory.
+ *
+ * IN: dip - inode of directory to add subdir to.
+ * dirname - name of new directory.
+ * vap - attributes of new directory.
+ * cr - credentials of caller.
+ * flags - case flags.
+ * vsecp - ACL to be set
+ *
+ * OUT: ipp - inode of created directory.
+ *
+ * RETURN: 0 if success
+ * error code if failure
+ *
+ * Timestamps:
+ * dip - ctime|mtime updated
+ * ipp - ctime|mtime|atime updated
+ */
+/*ARGSUSED*/
+int
+zfs_mkdir(struct inode *dip, char *dirname, vattr_t *vap, struct inode **ipp,
+ cred_t *cr, int flags, vsecattr_t *vsecp)
+{
+ znode_t *zp, *dzp = ITOZ(dip);
+ zfsvfs_t *zfsvfs = ITOZSB(dip);
+ zilog_t *zilog;
+ zfs_dirlock_t *dl;
+ uint64_t txtype;
+ dmu_tx_t *tx;
+ int error;
+ int zf = ZNEW;
+ uid_t uid;
+ gid_t gid = crgetgid(cr);
+ zfs_acl_ids_t acl_ids;
+ boolean_t fuid_dirtied;
+ boolean_t waited = B_FALSE;
+
+ ASSERT(S_ISDIR(vap->va_mode));
+
+	/*
+	 * If we have an ephemeral id, ACL, or XVATTR then
+	 * make sure the file system is at the proper version.
+	 */
+
+ uid = crgetuid(cr);
+ if (zfsvfs->z_use_fuids == B_FALSE &&
+ (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
+ return (SET_ERROR(EINVAL));
+
+ if (dirname == NULL)
+ return (SET_ERROR(EINVAL));
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(dzp);
+ zilog = zfsvfs->z_log;
+
+ if (dzp->z_pflags & ZFS_XATTR) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EINVAL));
+ }
+
+ if (zfsvfs->z_utf8 && u8_validate(dirname,
+ strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EILSEQ));
+ }
+ if (flags & FIGNORECASE)
+ zf |= ZCILOOK;
+
+ if (vap->va_mask & ATTR_XVATTR) {
+ if ((error = secpolicy_xvattr((xvattr_t *)vap,
+ crgetuid(cr), cr, vap->va_mode)) != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+ }
+
+ if ((error = zfs_acl_ids_create(dzp, 0, vap, cr,
+ vsecp, &acl_ids)) != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+ /*
+ * First make sure the new directory doesn't exist.
+ *
+ * Existence is checked first to make sure we don't return
+ * EACCES instead of EEXIST which can cause some applications
+ * to fail.
+ */
+top:
+ *ipp = NULL;
+
+ if ((error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf,
+ NULL, NULL))) {
+ zfs_acl_ids_free(&acl_ids);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ if ((error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr))) {
+ zfs_acl_ids_free(&acl_ids);
+ zfs_dirent_unlock(dl);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, zfs_inherit_projid(dzp))) {
+ zfs_acl_ids_free(&acl_ids);
+ zfs_dirent_unlock(dl);
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EDQUOT));
+ }
+
+ /*
+ * Add a new entry to the directory.
+ */
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
+ dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
+ fuid_dirtied = zfsvfs->z_fuid_dirty;
+ if (fuid_dirtied)
+ zfs_fuid_txhold(zfsvfs, tx);
+ if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
+ acl_ids.z_aclp->z_acl_bytes);
+ }
+
+ dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
+ ZFS_SA_BASE_ATTR_SIZE);
+
+ error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
+ if (error) {
+ zfs_dirent_unlock(dl);
+ if (error == ERESTART) {
+ waited = B_TRUE;
+ dmu_tx_wait(tx);
+ dmu_tx_abort(tx);
+ goto top;
+ }
+ zfs_acl_ids_free(&acl_ids);
+ dmu_tx_abort(tx);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ /*
+ * Create new node.
+ */
+ zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
+
+ /*
+ * Now put new name in parent dir.
+ */
+ error = zfs_link_create(dl, zp, tx, ZNEW);
+ if (error != 0) {
+ zfs_znode_delete(zp, tx);
+ remove_inode_hash(ZTOI(zp));
+ goto out;
+ }
+
+ if (fuid_dirtied)
+ zfs_fuid_sync(zfsvfs, tx);
+
+ *ipp = ZTOI(zp);
+
+ txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap);
+ if (flags & FIGNORECASE)
+ txtype |= TX_CI;
+ zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp,
+ acl_ids.z_fuidp, vap);
+
+out:
+ zfs_acl_ids_free(&acl_ids);
+
+ dmu_tx_commit(tx);
+
+ zfs_dirent_unlock(dl);
+
+ if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ zil_commit(zilog, 0);
+
+ if (error != 0) {
+ iput(ZTOI(zp));
+ } else {
+ zfs_inode_update(dzp);
+ zfs_inode_update(zp);
+ }
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+/*
+ * Remove a subdirectory entry. If the current working
+ * directory is the same as the subdir to be removed, the
+ * remove will fail.
+ *
+ * IN: dip - inode of directory to remove from.
+ * name - name of directory to be removed.
+ * cwd - inode of current working directory.
+ * cr - credentials of caller.
+ * flags - case flags
+ *
+ * RETURN: 0 on success, error code on failure.
+ *
+ * Timestamps:
+ * dip - ctime|mtime updated
+ */
+/*ARGSUSED*/
+int
+zfs_rmdir(struct inode *dip, char *name, struct inode *cwd, cred_t *cr,
+ int flags)
+{
+ znode_t *dzp = ITOZ(dip);
+ znode_t *zp;
+ struct inode *ip;
+ zfsvfs_t *zfsvfs = ITOZSB(dip);
+ zilog_t *zilog;
+ zfs_dirlock_t *dl;
+ dmu_tx_t *tx;
+ int error;
+ int zflg = ZEXISTS;
+ boolean_t waited = B_FALSE;
+
+ if (name == NULL)
+ return (SET_ERROR(EINVAL));
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(dzp);
+ zilog = zfsvfs->z_log;
+
+ if (flags & FIGNORECASE)
+ zflg |= ZCILOOK;
+top:
+ zp = NULL;
+
+ /*
+ * Attempt to lock directory; fail if entry doesn't exist.
+ */
+ if ((error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
+ NULL, NULL))) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ ip = ZTOI(zp);
+
+ if ((error = zfs_zaccess_delete(dzp, zp, cr))) {
+ goto out;
+ }
+
+ if (!S_ISDIR(ip->i_mode)) {
+ error = SET_ERROR(ENOTDIR);
+ goto out;
+ }
+
+ if (ip == cwd) {
+ error = SET_ERROR(EINVAL);
+ goto out;
+ }
+
+ /*
+	 * Grab a lock on the directory to make sure that no one is
+	 * trying to add (or look up) entries while we are removing it.
+ */
+ rw_enter(&zp->z_name_lock, RW_WRITER);
+
+ /*
+ * Grab a lock on the parent pointer to make sure we play well
+ * with the treewalk and directory rename code.
+ */
+ rw_enter(&zp->z_parent_lock, RW_WRITER);
+
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+ dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
+ zfs_sa_upgrade_txholds(tx, zp);
+ zfs_sa_upgrade_txholds(tx, dzp);
+ dmu_tx_mark_netfree(tx);
+ error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
+ if (error) {
+ rw_exit(&zp->z_parent_lock);
+ rw_exit(&zp->z_name_lock);
+ zfs_dirent_unlock(dl);
+ if (error == ERESTART) {
+ waited = B_TRUE;
+ dmu_tx_wait(tx);
+ dmu_tx_abort(tx);
+ iput(ip);
+ goto top;
+ }
+ dmu_tx_abort(tx);
+ iput(ip);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ error = zfs_link_destroy(dl, zp, tx, zflg, NULL);
+
+ if (error == 0) {
+ uint64_t txtype = TX_RMDIR;
+ if (flags & FIGNORECASE)
+ txtype |= TX_CI;
+ zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT,
+ B_FALSE);
+ }
+
+ dmu_tx_commit(tx);
+
+ rw_exit(&zp->z_parent_lock);
+ rw_exit(&zp->z_name_lock);
+out:
+ zfs_dirent_unlock(dl);
+
+ zfs_inode_update(dzp);
+ zfs_inode_update(zp);
+ iput(ip);
+
+ if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ zil_commit(zilog, 0);
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+/*
+ * Read directory entries from the given directory cursor position and emit
+ * name and position for each entry.
+ *
+ * IN: ip - inode of directory to read.
+ * ctx - directory entry context.
+ * cr - credentials of caller.
+ *
+ * RETURN: 0 if success
+ * error code if failure
+ *
+ * Timestamps:
+ * ip - atime updated
+ *
+ * Note that the low 4 bits of the cookie returned by zap are always zero.
+ * This allows us to use the low range for "special" directory entries:
+ * We use 0 for '.', and 1 for '..'. If this is the root of the filesystem,
+ * we use the offset 2 for the '.zfs' directory.
+ */
+/* ARGSUSED */
+int
+zfs_readdir(struct inode *ip, zpl_dir_context_t *ctx, cred_t *cr)
+{
+ znode_t *zp = ITOZ(ip);
+ zfsvfs_t *zfsvfs = ITOZSB(ip);
+ objset_t *os;
+ zap_cursor_t zc;
+ zap_attribute_t zap;
+ int error;
+ uint8_t prefetch;
+ uint8_t type;
+ int done = 0;
+ uint64_t parent;
+ uint64_t offset; /* must be unsigned; checks for < 1 */
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
+ &parent, sizeof (parent))) != 0)
+ goto out;
+
+	/*
+	 * Quit if the directory has been removed (POSIX).
+	 */
+ if (zp->z_unlinked)
+ goto out;
+
+ error = 0;
+ os = zfsvfs->z_os;
+ offset = ctx->pos;
+ prefetch = zp->z_zn_prefetch;
+
+ /*
+ * Initialize the iterator cursor.
+ */
+ if (offset <= 3) {
+ /*
+ * Start iteration from the beginning of the directory.
+ */
+ zap_cursor_init(&zc, os, zp->z_id);
+ } else {
+ /*
+ * The offset is a serialized cursor.
+ */
+ zap_cursor_init_serialized(&zc, os, zp->z_id, offset);
+ }
+
+ /*
+ * Transform to file-system independent format
+ */
+ while (!done) {
+ uint64_t objnum;
+ /*
+ * Special case `.', `..', and `.zfs'.
+ */
+ if (offset == 0) {
+ (void) strcpy(zap.za_name, ".");
+ zap.za_normalization_conflict = 0;
+ objnum = zp->z_id;
+ type = DT_DIR;
+ } else if (offset == 1) {
+ (void) strcpy(zap.za_name, "..");
+ zap.za_normalization_conflict = 0;
+ objnum = parent;
+ type = DT_DIR;
+ } else if (offset == 2 && zfs_show_ctldir(zp)) {
+ (void) strcpy(zap.za_name, ZFS_CTLDIR_NAME);
+ zap.za_normalization_conflict = 0;
+ objnum = ZFSCTL_INO_ROOT;
+ type = DT_DIR;
+ } else {
+ /*
+ * Grab next entry.
+ */
+ if ((error = zap_cursor_retrieve(&zc, &zap))) {
+ if (error == ENOENT)
+ break;
+ else
+ goto update;
+ }
+
+ /*
+ * Allow multiple entries provided the first entry is
+ * the object id. Non-zpl consumers may safely make
+ * use of the additional space.
+ *
+ * XXX: This should be a feature flag for compatibility
+ */
+ if (zap.za_integer_length != 8 ||
+ zap.za_num_integers == 0) {
+ cmn_err(CE_WARN, "zap_readdir: bad directory "
+ "entry, obj = %lld, offset = %lld, "
+ "length = %d, num = %lld\n",
+ (u_longlong_t)zp->z_id,
+ (u_longlong_t)offset,
+ zap.za_integer_length,
+ (u_longlong_t)zap.za_num_integers);
+ error = SET_ERROR(ENXIO);
+ goto update;
+ }
+
+ objnum = ZFS_DIRENT_OBJ(zap.za_first_integer);
+ type = ZFS_DIRENT_TYPE(zap.za_first_integer);
+ }
+
+ done = !zpl_dir_emit(ctx, zap.za_name, strlen(zap.za_name),
+ objnum, type);
+ if (done)
+ break;
+
+ /* Prefetch znode */
+ if (prefetch) {
+ dmu_prefetch(os, objnum, 0, 0, 0,
+ ZIO_PRIORITY_SYNC_READ);
+ }
+
+ /*
+ * Move to the next entry, fill in the previous offset.
+ */
+ if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) {
+ zap_cursor_advance(&zc);
+ offset = zap_cursor_serialize(&zc);
+ } else {
+ offset += 1;
+ }
+ ctx->pos = offset;
+ }
+ zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */
+
+update:
+ zap_cursor_fini(&zc);
+ if (error == ENOENT)
+ error = 0;
+out:
+ ZFS_EXIT(zfsvfs);
+
+ return (error);
+}
+
+ulong_t zfs_fsync_sync_cnt = 4;
+
+int
+zfs_fsync(struct inode *ip, int syncflag, cred_t *cr)
+{
+ znode_t *zp = ITOZ(ip);
+ zfsvfs_t *zfsvfs = ITOZSB(ip);
+
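+	/*
+	 * The thread-specific counter set below is consumed by
+	 * zfs_log_write() to bias the next few writes from an
+	 * fsync-heavy thread toward immediate (copied) log records.
+	 */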
+ (void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt);
+
+ if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) {
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+ zil_commit(zfsvfs->z_log, zp->z_id);
+ ZFS_EXIT(zfsvfs);
+ }
+ tsd_set(zfs_fsyncer_key, NULL);
+
+ return (0);
+}
+
+/*
+ * Get the requested file attributes and place them in the provided
+ * vattr structure.
+ *
+ * IN: ip - inode of file.
+ * vap - va_mask identifies requested attributes.
+ * If ATTR_XVATTR set, then optional attrs are requested
+ * flags - ATTR_NOACLCHECK (CIFS server context)
+ * cr - credentials of caller.
+ *
+ * OUT: vap - attribute values.
+ *
+ * RETURN: 0 (always succeeds)
+ */
+/* ARGSUSED */
+int
+zfs_getattr(struct inode *ip, vattr_t *vap, int flags, cred_t *cr)
+{
+ znode_t *zp = ITOZ(ip);
+ zfsvfs_t *zfsvfs = ITOZSB(ip);
+ int error = 0;
+ uint64_t links;
+ uint64_t atime[2], mtime[2], ctime[2];
+ xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */
+ xoptattr_t *xoap = NULL;
+ boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
+ sa_bulk_attr_t bulk[3];
+ int count = 0;
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid);
+
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
+
+ if ((error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+	/*
+	 * If the ACL is trivial, don't bother looking for
+	 * ACE_READ_ATTRIBUTES. Also, if we are the owner, don't bother,
+	 * since the owner should always be allowed to read the basic
+	 * attributes of a file.
+	 */
+ if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) &&
+ (vap->va_uid != crgetuid(cr))) {
+ if ((error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0,
+ skipaclchk, cr))) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+ }
+
+ /*
+ * Return all attributes. It's cheaper to provide the answer
+ * than to determine whether we were asked the question.
+ */
+
+ mutex_enter(&zp->z_lock);
+ vap->va_type = vn_mode_to_vtype(zp->z_mode);
+ vap->va_mode = zp->z_mode;
+ vap->va_fsid = ZTOI(zp)->i_sb->s_dev;
+ vap->va_nodeid = zp->z_id;
+ if ((zp->z_id == zfsvfs->z_root) && zfs_show_ctldir(zp))
+ links = ZTOI(zp)->i_nlink + 1;
+ else
+ links = ZTOI(zp)->i_nlink;
+ vap->va_nlink = MIN(links, ZFS_LINK_MAX);
+ vap->va_size = i_size_read(ip);
+ vap->va_rdev = ip->i_rdev;
+ vap->va_seq = ip->i_generation;
+
+ /*
+ * Add in any requested optional attributes and the create time.
+ * Also set the corresponding bits in the returned attribute bitmap.
+ */
+ if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) {
+ if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
+ xoap->xoa_archive =
+ ((zp->z_pflags & ZFS_ARCHIVE) != 0);
+ XVA_SET_RTN(xvap, XAT_ARCHIVE);
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
+ xoap->xoa_readonly =
+ ((zp->z_pflags & ZFS_READONLY) != 0);
+ XVA_SET_RTN(xvap, XAT_READONLY);
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
+ xoap->xoa_system =
+ ((zp->z_pflags & ZFS_SYSTEM) != 0);
+ XVA_SET_RTN(xvap, XAT_SYSTEM);
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
+ xoap->xoa_hidden =
+ ((zp->z_pflags & ZFS_HIDDEN) != 0);
+ XVA_SET_RTN(xvap, XAT_HIDDEN);
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
+ xoap->xoa_nounlink =
+ ((zp->z_pflags & ZFS_NOUNLINK) != 0);
+ XVA_SET_RTN(xvap, XAT_NOUNLINK);
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
+ xoap->xoa_immutable =
+ ((zp->z_pflags & ZFS_IMMUTABLE) != 0);
+ XVA_SET_RTN(xvap, XAT_IMMUTABLE);
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
+ xoap->xoa_appendonly =
+ ((zp->z_pflags & ZFS_APPENDONLY) != 0);
+ XVA_SET_RTN(xvap, XAT_APPENDONLY);
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
+ xoap->xoa_nodump =
+ ((zp->z_pflags & ZFS_NODUMP) != 0);
+ XVA_SET_RTN(xvap, XAT_NODUMP);
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
+ xoap->xoa_opaque =
+ ((zp->z_pflags & ZFS_OPAQUE) != 0);
+ XVA_SET_RTN(xvap, XAT_OPAQUE);
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
+ xoap->xoa_av_quarantined =
+ ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0);
+ XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
+ xoap->xoa_av_modified =
+ ((zp->z_pflags & ZFS_AV_MODIFIED) != 0);
+ XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) &&
+ S_ISREG(ip->i_mode)) {
+ zfs_sa_get_scanstamp(zp, xvap);
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) {
+ uint64_t times[2];
+
+ (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_CRTIME(zfsvfs),
+ times, sizeof (times));
+ ZFS_TIME_DECODE(&xoap->xoa_createtime, times);
+ XVA_SET_RTN(xvap, XAT_CREATETIME);
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
+ xoap->xoa_reparse = ((zp->z_pflags & ZFS_REPARSE) != 0);
+ XVA_SET_RTN(xvap, XAT_REPARSE);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_GEN)) {
+ xoap->xoa_generation = ip->i_generation;
+ XVA_SET_RTN(xvap, XAT_GEN);
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) {
+ xoap->xoa_offline =
+ ((zp->z_pflags & ZFS_OFFLINE) != 0);
+ XVA_SET_RTN(xvap, XAT_OFFLINE);
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) {
+ xoap->xoa_sparse =
+ ((zp->z_pflags & ZFS_SPARSE) != 0);
+ XVA_SET_RTN(xvap, XAT_SPARSE);
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) {
+ xoap->xoa_projinherit =
+ ((zp->z_pflags & ZFS_PROJINHERIT) != 0);
+ XVA_SET_RTN(xvap, XAT_PROJINHERIT);
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_PROJID)) {
+ xoap->xoa_projid = zp->z_projid;
+ XVA_SET_RTN(xvap, XAT_PROJID);
+ }
+ }
+
+ ZFS_TIME_DECODE(&vap->va_atime, atime);
+ ZFS_TIME_DECODE(&vap->va_mtime, mtime);
+ ZFS_TIME_DECODE(&vap->va_ctime, ctime);
+
+ mutex_exit(&zp->z_lock);
+
+ sa_object_size(zp->z_sa_hdl, &vap->va_blksize, &vap->va_nblocks);
+
+ if (zp->z_blksz == 0) {
+ /*
+ * Block size hasn't been set; suggest maximal I/O transfers.
+ */
+ vap->va_blksize = zfsvfs->z_max_blksz;
+ }
+
+ ZFS_EXIT(zfsvfs);
+ return (0);
+}
+
+/*
+ * Get the basic file attributes and place them in the provided kstat
+ * structure. The inode is assumed to be the authoritative source
+ * for most of the attributes. However, the znode currently has the
+ * authoritative atime, blksize, and block count.
+ *
+ * IN: ip - inode of file.
+ *
+ * OUT: sp - kstat values.
+ *
+ * RETURN: 0 (always succeeds)
+ */
+/* ARGSUSED */
+int
+zfs_getattr_fast(struct inode *ip, struct kstat *sp)
+{
+ znode_t *zp = ITOZ(ip);
+ zfsvfs_t *zfsvfs = ITOZSB(ip);
+ uint32_t blksize;
+ u_longlong_t nblocks;
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ mutex_enter(&zp->z_lock);
+
+ generic_fillattr(ip, sp);
+ /*
+ * +1 link count for root inode with visible '.zfs' directory.
+ */
+ if ((zp->z_id == zfsvfs->z_root) && zfs_show_ctldir(zp))
+ if (sp->nlink < ZFS_LINK_MAX)
+ sp->nlink++;
+
+ sa_object_size(zp->z_sa_hdl, &blksize, &nblocks);
+ sp->blksize = blksize;
+ sp->blocks = nblocks;
+
+ if (unlikely(zp->z_blksz == 0)) {
+ /*
+ * Block size hasn't been set; suggest maximal I/O transfers.
+ */
+ sp->blksize = zfsvfs->z_max_blksz;
+ }
+
+ mutex_exit(&zp->z_lock);
+
+	/*
+	 * Required to prevent the NFS client from detecting different
+	 * inode numbers for the snapshot root dentry before and after a
+	 * snapshot mount.
+	 */
+ if (zfsvfs->z_issnap) {
+ if (ip->i_sb->s_root->d_inode == ip)
+ sp->ino = ZFSCTL_INO_SNAPDIRS -
+ dmu_objset_id(zfsvfs->z_os);
+ }
+
+ ZFS_EXIT(zfsvfs);
+
+ return (0);
+}
+
+/*
+ * When changing a file's user/group/project, we need to handle not only
+ * the main object that is assigned to the file directly, but also the
+ * objects that the file uses via its hidden xattr directory.
+ *
+ * Because the xattr directory may contain many EA entries, it may be
+ * impossible to change all of them within the single transaction that
+ * changes the main object's user/group/project attributes. Instead we
+ * change them one by one via multiple independent transactions. This may
+ * not be an ideal solution, but we have no better idea yet.
+ */
+static int
+zfs_setattr_dir(znode_t *dzp)
+{
+ struct inode *dxip = ZTOI(dzp);
+ struct inode *xip = NULL;
+ zfsvfs_t *zfsvfs = ITOZSB(dxip);
+ objset_t *os = zfsvfs->z_os;
+ zap_cursor_t zc;
+ zap_attribute_t zap;
+ zfs_dirlock_t *dl;
+ znode_t *zp;
+ dmu_tx_t *tx = NULL;
+ uint64_t uid, gid;
+ sa_bulk_attr_t bulk[4];
+ int count;
+ int err;
+
+ zap_cursor_init(&zc, os, dzp->z_id);
+ while ((err = zap_cursor_retrieve(&zc, &zap)) == 0) {
+ count = 0;
+ if (zap.za_integer_length != 8 || zap.za_num_integers != 1) {
+ err = ENXIO;
+ break;
+ }
+
+ err = zfs_dirent_lock(&dl, dzp, (char *)zap.za_name, &zp,
+ ZEXISTS, NULL, NULL);
+ if (err == ENOENT)
+ goto next;
+ if (err)
+ break;
+
+ xip = ZTOI(zp);
+ if (KUID_TO_SUID(xip->i_uid) == KUID_TO_SUID(dxip->i_uid) &&
+ KGID_TO_SGID(xip->i_gid) == KGID_TO_SGID(dxip->i_gid) &&
+ zp->z_projid == dzp->z_projid)
+ goto next;
+
+ tx = dmu_tx_create(os);
+ if (!(zp->z_pflags & ZFS_PROJID))
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
+ else
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+
+ err = dmu_tx_assign(tx, TXG_WAIT);
+ if (err)
+ break;
+
+ mutex_enter(&dzp->z_lock);
+
+ if (KUID_TO_SUID(xip->i_uid) != KUID_TO_SUID(dxip->i_uid)) {
+ xip->i_uid = dxip->i_uid;
+ uid = zfs_uid_read(dxip);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
+ &uid, sizeof (uid));
+ }
+
+ if (KGID_TO_SGID(xip->i_gid) != KGID_TO_SGID(dxip->i_gid)) {
+ xip->i_gid = dxip->i_gid;
+ gid = zfs_gid_read(dxip);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL,
+ &gid, sizeof (gid));
+ }
+
+ if (zp->z_projid != dzp->z_projid) {
+ if (!(zp->z_pflags & ZFS_PROJID)) {
+ zp->z_pflags |= ZFS_PROJID;
+ SA_ADD_BULK_ATTR(bulk, count,
+ SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags,
+ sizeof (zp->z_pflags));
+ }
+
+ zp->z_projid = dzp->z_projid;
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PROJID(zfsvfs),
+ NULL, &zp->z_projid, sizeof (zp->z_projid));
+ }
+
+ mutex_exit(&dzp->z_lock);
+
+ if (likely(count > 0)) {
+ err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
+ dmu_tx_commit(tx);
+ } else {
+ dmu_tx_abort(tx);
+ }
+ tx = NULL;
+ if (err != 0 && err != ENOENT)
+ break;
+
+next:
+ if (xip) {
+ iput(xip);
+ xip = NULL;
+ zfs_dirent_unlock(dl);
+ }
+ zap_cursor_advance(&zc);
+ }
+
+ if (tx)
+ dmu_tx_abort(tx);
+ if (xip) {
+ iput(xip);
+ zfs_dirent_unlock(dl);
+ }
+ zap_cursor_fini(&zc);
+
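+	/* ENOENT from zap_cursor_retrieve() simply marks end of directory. */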
+ return (err == ENOENT ? 0 : err);
+}
+
+/*
+ * Set the file attributes to the values contained in the
+ * vattr structure.
+ *
+ * IN: ip - inode of file to be modified.
+ * vap - new attribute values.
+ * If ATTR_XVATTR set, then optional attrs are being set
+ * flags - ATTR_UTIME set if non-default time values provided.
+ * - ATTR_NOACLCHECK (CIFS context only).
+ * cr - credentials of caller.
+ *
+ * RETURN: 0 if success
+ * error code if failure
+ *
+ * Timestamps:
+ * ip - ctime updated, mtime updated if size changed.
+ */
+/* ARGSUSED */
+int
+zfs_setattr(struct inode *ip, vattr_t *vap, int flags, cred_t *cr)
+{
+ znode_t *zp = ITOZ(ip);
+ zfsvfs_t *zfsvfs = ITOZSB(ip);
+ objset_t *os = zfsvfs->z_os;
+ zilog_t *zilog;
+ dmu_tx_t *tx;
+ vattr_t oldva;
+ xvattr_t *tmpxvattr;
+ uint_t mask = vap->va_mask;
+ uint_t saved_mask = 0;
+ int trim_mask = 0;
+ uint64_t new_mode;
+ uint64_t new_kuid = 0, new_kgid = 0, new_uid, new_gid;
+ uint64_t xattr_obj;
+ uint64_t mtime[2], ctime[2], atime[2];
+ uint64_t projid = ZFS_INVALID_PROJID;
+ znode_t *attrzp;
+ int need_policy = FALSE;
+ int err, err2 = 0;
+ zfs_fuid_info_t *fuidp = NULL;
+ xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */
+ xoptattr_t *xoap;
+ zfs_acl_t *aclp;
+ boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
+ boolean_t fuid_dirtied = B_FALSE;
+ boolean_t handle_eadir = B_FALSE;
+ sa_bulk_attr_t *bulk, *xattr_bulk;
+ int count = 0, xattr_count = 0, bulks = 8;
+
+ if (mask == 0)
+ return (0);
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ /*
+	 * If this is an xvattr_t, then get a pointer to the structure of
+ * optional attributes. If this is NULL, then we have a vattr_t.
+ */
+ xoap = xva_getxoptattr(xvap);
+ if (xoap != NULL && (mask & ATTR_XVATTR)) {
+ if (XVA_ISSET_REQ(xvap, XAT_PROJID)) {
+ if (!dmu_objset_projectquota_enabled(os) ||
+ (!S_ISREG(ip->i_mode) && !S_ISDIR(ip->i_mode))) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(ENOTSUP));
+ }
+
+ projid = xoap->xoa_projid;
+ if (unlikely(projid == ZFS_INVALID_PROJID)) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EINVAL));
+ }
+
+ if (projid == zp->z_projid && zp->z_pflags & ZFS_PROJID)
+ projid = ZFS_INVALID_PROJID;
+ else
+ need_policy = TRUE;
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT) &&
+ (xoap->xoa_projinherit !=
+ ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) &&
+ (!dmu_objset_projectquota_enabled(os) ||
+ (!S_ISREG(ip->i_mode) && !S_ISDIR(ip->i_mode)))) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(ENOTSUP));
+ }
+ }
+
+ zilog = zfsvfs->z_log;
+
+	/*
+	 * Make sure that if we have an ephemeral uid/gid or an xvattr
+	 * specified, the file system is at the proper version level.
+	 */
+
+ if (zfsvfs->z_use_fuids == B_FALSE &&
+ (((mask & ATTR_UID) && IS_EPHEMERAL(vap->va_uid)) ||
+ ((mask & ATTR_GID) && IS_EPHEMERAL(vap->va_gid)) ||
+ (mask & ATTR_XVATTR))) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EINVAL));
+ }
+
+ if (mask & ATTR_SIZE && S_ISDIR(ip->i_mode)) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EISDIR));
+ }
+
+ if (mask & ATTR_SIZE && !S_ISREG(ip->i_mode) && !S_ISFIFO(ip->i_mode)) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EINVAL));
+ }
+
+ tmpxvattr = kmem_alloc(sizeof (xvattr_t), KM_SLEEP);
+ xva_init(tmpxvattr);
+
+ bulk = kmem_alloc(sizeof (sa_bulk_attr_t) * bulks, KM_SLEEP);
+ xattr_bulk = kmem_alloc(sizeof (sa_bulk_attr_t) * bulks, KM_SLEEP);
+
+	/*
+	 * On immutable files, only the immutable bit and atime may
+	 * be altered.
+	 */
+ if ((zp->z_pflags & ZFS_IMMUTABLE) &&
+ ((mask & (ATTR_SIZE|ATTR_UID|ATTR_GID|ATTR_MTIME|ATTR_MODE)) ||
+ ((mask & ATTR_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) {
+ err = SET_ERROR(EPERM);
+ goto out3;
+ }
+
+ if ((mask & ATTR_SIZE) && (zp->z_pflags & ZFS_READONLY)) {
+ err = SET_ERROR(EPERM);
+ goto out3;
+ }
+
+	/*
+	 * Verify that the timestamps don't overflow 32 bits.
+	 * ZFS can handle large timestamps, but 32-bit syscalls can't
+	 * handle times greater than 2039. This check should be removed
+	 * once large timestamps are fully supported.
+	 */
+ if (mask & (ATTR_ATIME | ATTR_MTIME)) {
+ if (((mask & ATTR_ATIME) &&
+ TIMESPEC_OVERFLOW(&vap->va_atime)) ||
+ ((mask & ATTR_MTIME) &&
+ TIMESPEC_OVERFLOW(&vap->va_mtime))) {
+ err = SET_ERROR(EOVERFLOW);
+ goto out3;
+ }
+ }
+
+top:
+ attrzp = NULL;
+ aclp = NULL;
+
+ /* Can this be moved to before the top label? */
+ if (zfs_is_readonly(zfsvfs)) {
+ err = SET_ERROR(EROFS);
+ goto out3;
+ }
+
+ /*
+ * First validate permissions
+ */
+
+ if (mask & ATTR_SIZE) {
+ err = zfs_zaccess(zp, ACE_WRITE_DATA, 0, skipaclchk, cr);
+ if (err)
+ goto out3;
+
+ /*
+ * XXX - Note, we are not providing any open
+ * mode flags here (like FNDELAY), so we may
+ * block if there are locks present... this
+ * should be addressed in openat().
+ */
+ /* XXX - would it be OK to generate a log record here? */
+ err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE);
+ if (err)
+ goto out3;
+ }
+
+ if (mask & (ATTR_ATIME|ATTR_MTIME) ||
+ ((mask & ATTR_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) ||
+ XVA_ISSET_REQ(xvap, XAT_READONLY) ||
+ XVA_ISSET_REQ(xvap, XAT_ARCHIVE) ||
+ XVA_ISSET_REQ(xvap, XAT_OFFLINE) ||
+ XVA_ISSET_REQ(xvap, XAT_SPARSE) ||
+ XVA_ISSET_REQ(xvap, XAT_CREATETIME) ||
+ XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) {
+ need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0,
+ skipaclchk, cr);
+ }
+
+ if (mask & (ATTR_UID|ATTR_GID)) {
+ int idmask = (mask & (ATTR_UID|ATTR_GID));
+ int take_owner;
+ int take_group;
+
+ /*
+ * NOTE: even if a new mode is being set,
+ * we may clear S_ISUID/S_ISGID bits.
+ */
+
+ if (!(mask & ATTR_MODE))
+ vap->va_mode = zp->z_mode;
+
+ /*
+ * Take ownership or chgrp to group we are a member of
+ */
+
+ take_owner = (mask & ATTR_UID) && (vap->va_uid == crgetuid(cr));
+ take_group = (mask & ATTR_GID) &&
+ zfs_groupmember(zfsvfs, vap->va_gid, cr);
+
+ /*
+ * If both ATTR_UID and ATTR_GID are set then take_owner and
+ * take_group must both be set in order to allow taking
+ * ownership.
+ *
+		 * Otherwise, send the check through secpolicy_vnode_setattr().
+ */
+
+ if (((idmask == (ATTR_UID|ATTR_GID)) &&
+ take_owner && take_group) ||
+ ((idmask == ATTR_UID) && take_owner) ||
+ ((idmask == ATTR_GID) && take_group)) {
+ if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0,
+ skipaclchk, cr) == 0) {
+ /*
+ * Remove setuid/setgid for non-privileged users
+ */
+ (void) secpolicy_setid_clear(vap, cr);
+ trim_mask = (mask & (ATTR_UID|ATTR_GID));
+ } else {
+ need_policy = TRUE;
+ }
+ } else {
+ need_policy = TRUE;
+ }
+ }
+
+ mutex_enter(&zp->z_lock);
+ oldva.va_mode = zp->z_mode;
+ zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid);
+ if (mask & ATTR_XVATTR) {
+ /*
+ * Update xvattr mask to include only those attributes
+ * that are actually changing.
+ *
+		 * The bits will be restored prior to actually setting
+		 * the attributes so the caller thinks they were set.
+ */
+ if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
+ if (xoap->xoa_appendonly !=
+ ((zp->z_pflags & ZFS_APPENDONLY) != 0)) {
+ need_policy = TRUE;
+ } else {
+ XVA_CLR_REQ(xvap, XAT_APPENDONLY);
+ XVA_SET_REQ(tmpxvattr, XAT_APPENDONLY);
+ }
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) {
+ if (xoap->xoa_projinherit !=
+ ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) {
+ need_policy = TRUE;
+ } else {
+ XVA_CLR_REQ(xvap, XAT_PROJINHERIT);
+ XVA_SET_REQ(tmpxvattr, XAT_PROJINHERIT);
+ }
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
+ if (xoap->xoa_nounlink !=
+ ((zp->z_pflags & ZFS_NOUNLINK) != 0)) {
+ need_policy = TRUE;
+ } else {
+ XVA_CLR_REQ(xvap, XAT_NOUNLINK);
+ XVA_SET_REQ(tmpxvattr, XAT_NOUNLINK);
+ }
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
+ if (xoap->xoa_immutable !=
+ ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) {
+ need_policy = TRUE;
+ } else {
+ XVA_CLR_REQ(xvap, XAT_IMMUTABLE);
+ XVA_SET_REQ(tmpxvattr, XAT_IMMUTABLE);
+ }
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
+ if (xoap->xoa_nodump !=
+ ((zp->z_pflags & ZFS_NODUMP) != 0)) {
+ need_policy = TRUE;
+ } else {
+ XVA_CLR_REQ(xvap, XAT_NODUMP);
+ XVA_SET_REQ(tmpxvattr, XAT_NODUMP);
+ }
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
+ if (xoap->xoa_av_modified !=
+ ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) {
+ need_policy = TRUE;
+ } else {
+ XVA_CLR_REQ(xvap, XAT_AV_MODIFIED);
+ XVA_SET_REQ(tmpxvattr, XAT_AV_MODIFIED);
+ }
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
+ if ((!S_ISREG(ip->i_mode) &&
+ xoap->xoa_av_quarantined) ||
+ xoap->xoa_av_quarantined !=
+ ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) {
+ need_policy = TRUE;
+ } else {
+ XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED);
+ XVA_SET_REQ(tmpxvattr, XAT_AV_QUARANTINED);
+ }
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
+ mutex_exit(&zp->z_lock);
+ err = SET_ERROR(EPERM);
+ goto out3;
+ }
+
+ if (need_policy == FALSE &&
+ (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) ||
+ XVA_ISSET_REQ(xvap, XAT_OPAQUE))) {
+ need_policy = TRUE;
+ }
+ }
+
+ mutex_exit(&zp->z_lock);
+
+ if (mask & ATTR_MODE) {
+ if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) {
+ err = secpolicy_setid_setsticky_clear(ip, vap,
+ &oldva, cr);
+ if (err)
+ goto out3;
+
+ trim_mask |= ATTR_MODE;
+ } else {
+ need_policy = TRUE;
+ }
+ }
+
+ if (need_policy) {
+ /*
+		 * If trim_mask is set then take-ownership has been
+		 * granted, or write_acl is present and the user has the
+		 * ability to modify the mode.  In that case remove
+		 * UID|GID and/or MODE from the mask so that
+		 * secpolicy_vnode_setattr() doesn't revoke it.
+ */
+
+ if (trim_mask) {
+ saved_mask = vap->va_mask;
+ vap->va_mask &= ~trim_mask;
+ }
+ err = secpolicy_vnode_setattr(cr, ip, vap, &oldva, flags,
+ (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp);
+ if (err)
+ goto out3;
+
+ if (trim_mask)
+ vap->va_mask |= saved_mask;
+ }
+
+ /*
+	 * secpolicy_vnode_setattr() or the take-ownership path may have
+	 * changed va_mask.
+ */
+ mask = vap->va_mask;
+
+ if ((mask & (ATTR_UID | ATTR_GID)) || projid != ZFS_INVALID_PROJID) {
+ handle_eadir = B_TRUE;
+ err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
+ &xattr_obj, sizeof (xattr_obj));
+
+ if (err == 0 && xattr_obj) {
+ err = zfs_zget(ZTOZSB(zp), xattr_obj, &attrzp);
+ if (err)
+ goto out2;
+ }
+ if (mask & ATTR_UID) {
+ new_kuid = zfs_fuid_create(zfsvfs,
+ (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp);
+ if (new_kuid != KUID_TO_SUID(ZTOI(zp)->i_uid) &&
+ zfs_id_overquota(zfsvfs, DMU_USERUSED_OBJECT,
+ new_kuid)) {
+ if (attrzp)
+ iput(ZTOI(attrzp));
+ err = SET_ERROR(EDQUOT);
+ goto out2;
+ }
+ }
+
+ if (mask & ATTR_GID) {
+ new_kgid = zfs_fuid_create(zfsvfs,
+ (uint64_t)vap->va_gid, cr, ZFS_GROUP, &fuidp);
+ if (new_kgid != KGID_TO_SGID(ZTOI(zp)->i_gid) &&
+ zfs_id_overquota(zfsvfs, DMU_GROUPUSED_OBJECT,
+ new_kgid)) {
+ if (attrzp)
+ iput(ZTOI(attrzp));
+ err = SET_ERROR(EDQUOT);
+ goto out2;
+ }
+ }
+
+ if (projid != ZFS_INVALID_PROJID &&
+ zfs_id_overquota(zfsvfs, DMU_PROJECTUSED_OBJECT, projid)) {
+ if (attrzp)
+ iput(ZTOI(attrzp));
+ err = EDQUOT;
+ goto out2;
+ }
+ }
+ tx = dmu_tx_create(os);
+
+ if (mask & ATTR_MODE) {
+ uint64_t pmode = zp->z_mode;
+ uint64_t acl_obj;
+ new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);
+
+ zfs_acl_chmod_setattr(zp, &aclp, new_mode);
+
+ mutex_enter(&zp->z_lock);
+ if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) {
+ /*
+ * Are we upgrading ACL from old V0 format
+ * to V1 format?
+ */
+ if (zfsvfs->z_version >= ZPL_VERSION_FUID &&
+ zfs_znode_acl_version(zp) ==
+ ZFS_ACL_VERSION_INITIAL) {
+ dmu_tx_hold_free(tx, acl_obj, 0,
+ DMU_OBJECT_END);
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
+ 0, aclp->z_acl_bytes);
+ } else {
+ dmu_tx_hold_write(tx, acl_obj, 0,
+ aclp->z_acl_bytes);
+ }
+ } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) {
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
+ 0, aclp->z_acl_bytes);
+ }
+ mutex_exit(&zp->z_lock);
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
+ } else {
+ if (((mask & ATTR_XVATTR) &&
+ XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) ||
+ (projid != ZFS_INVALID_PROJID &&
+ !(zp->z_pflags & ZFS_PROJID)))
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
+ else
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+ }
+
+ if (attrzp) {
+ dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE);
+ }
+
+ fuid_dirtied = zfsvfs->z_fuid_dirty;
+ if (fuid_dirtied)
+ zfs_fuid_txhold(zfsvfs, tx);
+
+ zfs_sa_upgrade_txholds(tx, zp);
+
+ err = dmu_tx_assign(tx, TXG_WAIT);
+ if (err)
+ goto out;
+
+ count = 0;
+ /*
+ * Set each attribute requested.
+ * We group settings according to the locks they need to acquire.
+ *
+ * Note: you cannot set ctime directly, although it will be
+ * updated as a side-effect of calling this function.
+ */
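+	/*
+	 * The SA bulk pattern used below only registers the address of
+	 * each attribute with SA_ADD_BULK_ATTR(); the values are read
+	 * when sa_bulk_update() runs after all attributes are staged.
+	 * A minimal sketch of the pattern:
+	 *
+	 *	sa_bulk_attr_t bulk[1];
+	 *	int count = 0;
+	 *	uint64_t new_mode;
+	 *	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
+	 *	    &new_mode, sizeof (new_mode));
+	 *	new_mode = mode;	-- value may be assigned afterwards
+	 *	err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
+	 *
+	 * This is why the !(mask & ATTR_MODE) case further down can
+	 * register &new_mode before assigning it.
+	 */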
+
+ if (projid != ZFS_INVALID_PROJID && !(zp->z_pflags & ZFS_PROJID)) {
+ /*
+		 * An existing object upgraded from an old system has no slot
+		 * in its on-disk layout for the project ID attribute.  The
+		 * quota accounting logic needs to access that slot by a fixed
+		 * offset, so adjust the old object's layout to place the
+		 * project ID at a unified, fixed offset.
+ */
+ if (attrzp)
+ err = sa_add_projid(attrzp->z_sa_hdl, tx, projid);
+ if (err == 0)
+ err = sa_add_projid(zp->z_sa_hdl, tx, projid);
+
+ if (unlikely(err == EEXIST))
+ err = 0;
+ else if (err != 0)
+ goto out;
+ else
+ projid = ZFS_INVALID_PROJID;
+ }
+
+ if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE))
+ mutex_enter(&zp->z_acl_lock);
+ mutex_enter(&zp->z_lock);
+
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+ &zp->z_pflags, sizeof (zp->z_pflags));
+
+ if (attrzp) {
+ if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE))
+ mutex_enter(&attrzp->z_acl_lock);
+ mutex_enter(&attrzp->z_lock);
+ SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
+ SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags,
+ sizeof (attrzp->z_pflags));
+ if (projid != ZFS_INVALID_PROJID) {
+ attrzp->z_projid = projid;
+ SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
+ SA_ZPL_PROJID(zfsvfs), NULL, &attrzp->z_projid,
+ sizeof (attrzp->z_projid));
+ }
+ }
+
+ if (mask & (ATTR_UID|ATTR_GID)) {
+
+ if (mask & ATTR_UID) {
+ ZTOI(zp)->i_uid = SUID_TO_KUID(new_kuid);
+ new_uid = zfs_uid_read(ZTOI(zp));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
+ &new_uid, sizeof (new_uid));
+ if (attrzp) {
+ SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
+ SA_ZPL_UID(zfsvfs), NULL, &new_uid,
+ sizeof (new_uid));
+ ZTOI(attrzp)->i_uid = SUID_TO_KUID(new_uid);
+ }
+ }
+
+ if (mask & ATTR_GID) {
+ ZTOI(zp)->i_gid = SGID_TO_KGID(new_kgid);
+ new_gid = zfs_gid_read(ZTOI(zp));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs),
+ NULL, &new_gid, sizeof (new_gid));
+ if (attrzp) {
+ SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
+ SA_ZPL_GID(zfsvfs), NULL, &new_gid,
+ sizeof (new_gid));
+ ZTOI(attrzp)->i_gid = SGID_TO_KGID(new_kgid);
+ }
+ }
+ if (!(mask & ATTR_MODE)) {
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs),
+ NULL, &new_mode, sizeof (new_mode));
+ new_mode = zp->z_mode;
+ }
+ err = zfs_acl_chown_setattr(zp);
+ ASSERT(err == 0);
+ if (attrzp) {
+ err = zfs_acl_chown_setattr(attrzp);
+ ASSERT(err == 0);
+ }
+ }
+
+ if (mask & ATTR_MODE) {
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
+ &new_mode, sizeof (new_mode));
+ zp->z_mode = ZTOI(zp)->i_mode = new_mode;
+ ASSERT3P(aclp, !=, NULL);
+ err = zfs_aclset_common(zp, aclp, cr, tx);
+ ASSERT0(err);
+ if (zp->z_acl_cached)
+ zfs_acl_free(zp->z_acl_cached);
+ zp->z_acl_cached = aclp;
+ aclp = NULL;
+ }
+
+ if ((mask & ATTR_ATIME) || zp->z_atime_dirty) {
+ zp->z_atime_dirty = B_FALSE;
+ ZFS_TIME_ENCODE(&ip->i_atime, atime);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
+ &atime, sizeof (atime));
+ }
+
+ if (mask & (ATTR_MTIME | ATTR_SIZE)) {
+ ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
+ ZTOI(zp)->i_mtime = zpl_inode_timespec_trunc(vap->va_mtime,
+ ZTOI(zp)->i_sb->s_time_gran);
+
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
+ mtime, sizeof (mtime));
+ }
+
+ if (mask & (ATTR_CTIME | ATTR_SIZE)) {
+ ZFS_TIME_ENCODE(&vap->va_ctime, ctime);
+ ZTOI(zp)->i_ctime = zpl_inode_timespec_trunc(vap->va_ctime,
+ ZTOI(zp)->i_sb->s_time_gran);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
+ ctime, sizeof (ctime));
+ }
+
+ if (projid != ZFS_INVALID_PROJID) {
+ zp->z_projid = projid;
+ SA_ADD_BULK_ATTR(bulk, count,
+ SA_ZPL_PROJID(zfsvfs), NULL, &zp->z_projid,
+ sizeof (zp->z_projid));
+ }
+
+ if (attrzp && mask) {
+ SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
+ SA_ZPL_CTIME(zfsvfs), NULL, &ctime,
+ sizeof (ctime));
+ }
+
+ /*
+	 * Do this after setting the timestamps to prevent the timestamp
+	 * update from toggling the flag bits.
+ */
+
+ if (xoap && (mask & ATTR_XVATTR)) {
+
+ /*
+		 * Restore the trimmed-off mask bits so that the return
+		 * masks can be set for the caller.
+ */
+
+ if (XVA_ISSET_REQ(tmpxvattr, XAT_APPENDONLY)) {
+ XVA_SET_REQ(xvap, XAT_APPENDONLY);
+ }
+ if (XVA_ISSET_REQ(tmpxvattr, XAT_NOUNLINK)) {
+ XVA_SET_REQ(xvap, XAT_NOUNLINK);
+ }
+ if (XVA_ISSET_REQ(tmpxvattr, XAT_IMMUTABLE)) {
+ XVA_SET_REQ(xvap, XAT_IMMUTABLE);
+ }
+ if (XVA_ISSET_REQ(tmpxvattr, XAT_NODUMP)) {
+ XVA_SET_REQ(xvap, XAT_NODUMP);
+ }
+ if (XVA_ISSET_REQ(tmpxvattr, XAT_AV_MODIFIED)) {
+ XVA_SET_REQ(xvap, XAT_AV_MODIFIED);
+ }
+ if (XVA_ISSET_REQ(tmpxvattr, XAT_AV_QUARANTINED)) {
+ XVA_SET_REQ(xvap, XAT_AV_QUARANTINED);
+ }
+ if (XVA_ISSET_REQ(tmpxvattr, XAT_PROJINHERIT)) {
+ XVA_SET_REQ(xvap, XAT_PROJINHERIT);
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
+ ASSERT(S_ISREG(ip->i_mode));
+
+ zfs_xvattr_set(zp, xvap, tx);
+ }
+
+ if (fuid_dirtied)
+ zfs_fuid_sync(zfsvfs, tx);
+
+ if (mask != 0)
+ zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp);
+
+ mutex_exit(&zp->z_lock);
+ if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE))
+ mutex_exit(&zp->z_acl_lock);
+
+ if (attrzp) {
+ if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE))
+ mutex_exit(&attrzp->z_acl_lock);
+ mutex_exit(&attrzp->z_lock);
+ }
+out:
+ if (err == 0 && xattr_count > 0) {
+ err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk,
+ xattr_count, tx);
+ ASSERT(err2 == 0);
+ }
+
+ if (aclp)
+ zfs_acl_free(aclp);
+
+ if (fuidp) {
+ zfs_fuid_info_free(fuidp);
+ fuidp = NULL;
+ }
+
+ if (err) {
+ dmu_tx_abort(tx);
+ if (attrzp)
+ iput(ZTOI(attrzp));
+ if (err == ERESTART)
+ goto top;
+ } else {
+ if (count > 0)
+ err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
+ dmu_tx_commit(tx);
+ if (attrzp) {
+ if (err2 == 0 && handle_eadir)
+ err2 = zfs_setattr_dir(attrzp);
+ iput(ZTOI(attrzp));
+ }
+ zfs_inode_update(zp);
+ }
+
+out2:
+ if (os->os_sync == ZFS_SYNC_ALWAYS)
+ zil_commit(zilog, 0);
+
+out3:
+ kmem_free(xattr_bulk, sizeof (sa_bulk_attr_t) * bulks);
+ kmem_free(bulk, sizeof (sa_bulk_attr_t) * bulks);
+ kmem_free(tmpxvattr, sizeof (xvattr_t));
+ ZFS_EXIT(zfsvfs);
+ return (err);
+}
+
+typedef struct zfs_zlock {
+ krwlock_t *zl_rwlock; /* lock we acquired */
+ znode_t *zl_znode; /* znode we held */
+ struct zfs_zlock *zl_next; /* next in list */
+} zfs_zlock_t;
+
+/*
+ * Drop locks and release vnodes that were held by zfs_rename_lock().
+ */
+static void
+zfs_rename_unlock(zfs_zlock_t **zlpp)
+{
+ zfs_zlock_t *zl;
+
+ while ((zl = *zlpp) != NULL) {
+ if (zl->zl_znode != NULL)
+ zfs_iput_async(ZTOI(zl->zl_znode));
+ rw_exit(zl->zl_rwlock);
+ *zlpp = zl->zl_next;
+ kmem_free(zl, sizeof (*zl));
+ }
+}
+
+/*
+ * Search back through the directory tree, using the ".." entries.
+ * Lock each directory in the chain to prevent concurrent renames.
+ * Fail any attempt to move a directory into one of its own descendants.
+ * XXX - z_parent_lock can overlap with map or grow locks
+ */
+static int
+zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp)
+{
+ zfs_zlock_t *zl;
+ znode_t *zp = tdzp;
+ uint64_t rootid = ZTOZSB(zp)->z_root;
+ uint64_t oidp = zp->z_id;
+ krwlock_t *rwlp = &szp->z_parent_lock;
+ krw_t rw = RW_WRITER;
+
+ /*
+ * First pass write-locks szp and compares to zp->z_id.
+ * Later passes read-lock zp and compare to zp->z_parent.
+ */
+ do {
+ if (!rw_tryenter(rwlp, rw)) {
+ /*
+ * Another thread is renaming in this path.
+ * Note that if we are a WRITER, we don't have any
+ * parent_locks held yet.
+ */
+ if (rw == RW_READER && zp->z_id > szp->z_id) {
+ /*
+ * Drop our locks and restart
+ */
+ zfs_rename_unlock(&zl);
+ *zlpp = NULL;
+ zp = tdzp;
+ oidp = zp->z_id;
+ rwlp = &szp->z_parent_lock;
+ rw = RW_WRITER;
+ continue;
+ } else {
+ /*
+ * Wait for other thread to drop its locks
+ */
+ rw_enter(rwlp, rw);
+ }
+ }
+
+ zl = kmem_alloc(sizeof (*zl), KM_SLEEP);
+ zl->zl_rwlock = rwlp;
+ zl->zl_znode = NULL;
+ zl->zl_next = *zlpp;
+ *zlpp = zl;
+
+ if (oidp == szp->z_id) /* We're a descendant of szp */
+ return (SET_ERROR(EINVAL));
+
+ if (oidp == rootid) /* We've hit the top */
+ return (0);
+
+ if (rw == RW_READER) { /* i.e. not the first pass */
+ int error = zfs_zget(ZTOZSB(zp), oidp, &zp);
+ if (error)
+ return (error);
+ zl->zl_znode = zp;
+ }
+ (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(ZTOZSB(zp)),
+ &oidp, sizeof (oidp));
+ rwlp = &zp->z_parent_lock;
+ rw = RW_READER;
+
+ } while (zp->z_id != sdzp->z_id);
+
+ return (0);
+}
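+
+/*
+ * For example, the illegal move of /usr/a/b to /usr/a/b/c/d mentioned
+ * in zfs_rename() walks the ".." chain starting at the target directory
+ * c; the second iteration finds that c's parent is b == szp, so the
+ * descendant check above returns EINVAL before either the root or the
+ * source directory is reached.
+ */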
+
+/*
+ * Move an entry from the provided source directory to the target
+ * directory. Change the entry name as indicated.
+ *
+ * IN: sdip - Source directory containing the "old entry".
+ * snm - Old entry name.
+ * tdip - Target directory to contain the "new entry".
+ * tnm - New entry name.
+ * cr - credentials of caller.
+ * flags - case flags
+ *
+ * RETURN: 0 on success, error code on failure.
+ *
+ * Timestamps:
+ * sdip,tdip - ctime|mtime updated
+ */
+/*ARGSUSED*/
+int
+zfs_rename(struct inode *sdip, char *snm, struct inode *tdip, char *tnm,
+ cred_t *cr, int flags)
+{
+ znode_t *tdzp, *szp, *tzp;
+ znode_t *sdzp = ITOZ(sdip);
+ zfsvfs_t *zfsvfs = ITOZSB(sdip);
+ zilog_t *zilog;
+ zfs_dirlock_t *sdl, *tdl;
+ dmu_tx_t *tx;
+ zfs_zlock_t *zl;
+ int cmp, serr, terr;
+ int error = 0;
+ int zflg = 0;
+ boolean_t waited = B_FALSE;
+
+ if (snm == NULL || tnm == NULL)
+ return (SET_ERROR(EINVAL));
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(sdzp);
+ zilog = zfsvfs->z_log;
+
+ tdzp = ITOZ(tdip);
+ ZFS_VERIFY_ZP(tdzp);
+
+ /*
+ * We check i_sb because snapshots and the ctldir must have different
+ * super blocks.
+ */
+ if (tdip->i_sb != sdip->i_sb || zfsctl_is_node(tdip)) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EXDEV));
+ }
+
+ if (zfsvfs->z_utf8 && u8_validate(tnm,
+ strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EILSEQ));
+ }
+
+ if (flags & FIGNORECASE)
+ zflg |= ZCILOOK;
+
+top:
+ szp = NULL;
+ tzp = NULL;
+ zl = NULL;
+
+ /*
+ * This is to prevent the creation of links into attribute space
+	 * by renaming a linked file into/out of an attribute directory.
+ * See the comment in zfs_link() for why this is considered bad.
+ */
+ if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EINVAL));
+ }
+
+ /*
+ * Lock source and target directory entries. To prevent deadlock,
+ * a lock ordering must be defined. We lock the directory with
+ * the smallest object id first, or if it's a tie, the one with
+ * the lexically first name.
+ */
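+	/*
+	 * For example, if sdzp->z_id == 9 and tdzp->z_id == 5, the target
+	 * entry is locked first (cmp > 0 below); two concurrent renames
+	 * between the same pair of directories therefore always acquire
+	 * the two dirent locks in the same order and cannot deadlock on
+	 * each other.
+	 */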
+ if (sdzp->z_id < tdzp->z_id) {
+ cmp = -1;
+ } else if (sdzp->z_id > tdzp->z_id) {
+ cmp = 1;
+ } else {
+ /*
+ * First compare the two name arguments without
+ * considering any case folding.
+ */
+ int nofold = (zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER);
+
+ cmp = u8_strcmp(snm, tnm, 0, nofold, U8_UNICODE_LATEST, &error);
+ ASSERT(error == 0 || !zfsvfs->z_utf8);
+ if (cmp == 0) {
+ /*
+ * POSIX: "If the old argument and the new argument
+ * both refer to links to the same existing file,
+ * the rename() function shall return successfully
+ * and perform no other action."
+ */
+ ZFS_EXIT(zfsvfs);
+ return (0);
+ }
+ /*
+ * If the file system is case-folding, then we may
+ * have some more checking to do. A case-folding file
+ * system is either supporting mixed case sensitivity
+ * access or is completely case-insensitive. Note
+ * that the file system is always case preserving.
+ *
+ * In mixed sensitivity mode case sensitive behavior
+ * is the default. FIGNORECASE must be used to
+ * explicitly request case insensitive behavior.
+ *
+ * If the source and target names provided differ only
+ * by case (e.g., a request to rename 'tim' to 'Tim'),
+ * we will treat this as a special case in the
+ * case-insensitive mode: as long as the source name
+ * is an exact match, we will allow this to proceed as
+ * a name-change request.
+ */
+ if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
+ (zfsvfs->z_case == ZFS_CASE_MIXED &&
+ flags & FIGNORECASE)) &&
+ u8_strcmp(snm, tnm, 0, zfsvfs->z_norm, U8_UNICODE_LATEST,
+ &error) == 0) {
+ /*
+ * case preserving rename request, require exact
+ * name matches
+ */
+ zflg |= ZCIEXACT;
+ zflg &= ~ZCILOOK;
+ }
+ }
+
+ /*
+ * If the source and destination directories are the same, we should
+ * grab the z_name_lock of that directory only once.
+ */
+ if (sdzp == tdzp) {
+ zflg |= ZHAVELOCK;
+ rw_enter(&sdzp->z_name_lock, RW_READER);
+ }
+
+ if (cmp < 0) {
+ serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp,
+ ZEXISTS | zflg, NULL, NULL);
+ terr = zfs_dirent_lock(&tdl,
+ tdzp, tnm, &tzp, ZRENAMING | zflg, NULL, NULL);
+ } else {
+ terr = zfs_dirent_lock(&tdl,
+ tdzp, tnm, &tzp, zflg, NULL, NULL);
+ serr = zfs_dirent_lock(&sdl,
+ sdzp, snm, &szp, ZEXISTS | ZRENAMING | zflg,
+ NULL, NULL);
+ }
+
+ if (serr) {
+ /*
+ * Source entry invalid or not there.
+ */
+ if (!terr) {
+ zfs_dirent_unlock(tdl);
+ if (tzp)
+ iput(ZTOI(tzp));
+ }
+
+ if (sdzp == tdzp)
+ rw_exit(&sdzp->z_name_lock);
+
+ if (strcmp(snm, "..") == 0)
+ serr = EINVAL;
+ ZFS_EXIT(zfsvfs);
+ return (serr);
+ }
+ if (terr) {
+ zfs_dirent_unlock(sdl);
+ iput(ZTOI(szp));
+
+ if (sdzp == tdzp)
+ rw_exit(&sdzp->z_name_lock);
+
+ if (strcmp(tnm, "..") == 0)
+ terr = EINVAL;
+ ZFS_EXIT(zfsvfs);
+ return (terr);
+ }
+
+ /*
+	 * If we are using project inheritance, meaning the directory has
+	 * ZFS_PROJINHERIT set, then its descendant directories will inherit
+	 * not only the project ID but also the ZFS_PROJINHERIT flag.  In
+	 * that case, we only allow renames into our tree when the project
+	 * IDs are the same.
+ */
+ if (tdzp->z_pflags & ZFS_PROJINHERIT &&
+ tdzp->z_projid != szp->z_projid) {
+ error = SET_ERROR(EXDEV);
+ goto out;
+ }
+
+ /*
+ * Must have write access at the source to remove the old entry
+ * and write access at the target to create the new entry.
+ * Note that if target and source are the same, this can be
+ * done in a single check.
+ */
+
+ if ((error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr)))
+ goto out;
+
+ if (S_ISDIR(ZTOI(szp)->i_mode)) {
+ /*
+ * Check to make sure rename is valid.
+ * Can't do a move like this: /usr/a/b to /usr/a/b/c/d
+ */
+ if ((error = zfs_rename_lock(szp, tdzp, sdzp, &zl)))
+ goto out;
+ }
+
+ /*
+ * Does target exist?
+ */
+ if (tzp) {
+ /*
+ * Source and target must be the same type.
+ */
+ if (S_ISDIR(ZTOI(szp)->i_mode)) {
+ if (!S_ISDIR(ZTOI(tzp)->i_mode)) {
+ error = SET_ERROR(ENOTDIR);
+ goto out;
+ }
+ } else {
+ if (S_ISDIR(ZTOI(tzp)->i_mode)) {
+ error = SET_ERROR(EISDIR);
+ goto out;
+ }
+ }
+ /*
+ * POSIX dictates that when the source and target
+ * entries refer to the same file object, rename
+ * must do nothing and exit without error.
+ */
+ if (szp->z_id == tzp->z_id) {
+ error = 0;
+ goto out;
+ }
+ }
+
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
+ dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE);
+ dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm);
+ dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
+ if (sdzp != tdzp) {
+ dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE);
+ zfs_sa_upgrade_txholds(tx, tdzp);
+ }
+ if (tzp) {
+ dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE);
+ zfs_sa_upgrade_txholds(tx, tzp);
+ }
+
+ zfs_sa_upgrade_txholds(tx, szp);
+ dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
+ error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
+ if (error) {
+ if (zl != NULL)
+ zfs_rename_unlock(&zl);
+ zfs_dirent_unlock(sdl);
+ zfs_dirent_unlock(tdl);
+
+ if (sdzp == tdzp)
+ rw_exit(&sdzp->z_name_lock);
+
+ if (error == ERESTART) {
+ waited = B_TRUE;
+ dmu_tx_wait(tx);
+ dmu_tx_abort(tx);
+ iput(ZTOI(szp));
+ if (tzp)
+ iput(ZTOI(tzp));
+ goto top;
+ }
+ dmu_tx_abort(tx);
+ iput(ZTOI(szp));
+ if (tzp)
+ iput(ZTOI(tzp));
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
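+
+	/*
+	 * The assign/retry pattern above is shared by the other entry
+	 * points in this file: the first dmu_tx_assign() attempt uses
+	 * TXG_NOWAIT, and on ERESTART all locks are dropped, dmu_tx_wait()
+	 * blocks until the transaction can likely be assigned, and the
+	 * operation restarts at `top' with TXG_NOTHROTTLE so the retry is
+	 * not delayed by the write throttle a second time.
+	 */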
+
+ if (tzp) /* Attempt to remove the existing target */
+ error = zfs_link_destroy(tdl, tzp, tx, zflg, NULL);
+
+ if (error == 0) {
+ error = zfs_link_create(tdl, szp, tx, ZRENAMING);
+ if (error == 0) {
+ szp->z_pflags |= ZFS_AV_MODIFIED;
+ if (tdzp->z_pflags & ZFS_PROJINHERIT)
+ szp->z_pflags |= ZFS_PROJINHERIT;
+
+ error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
+ (void *)&szp->z_pflags, sizeof (uint64_t), tx);
+ ASSERT0(error);
+
+ error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL);
+ if (error == 0) {
+ zfs_log_rename(zilog, tx, TX_RENAME |
+ (flags & FIGNORECASE ? TX_CI : 0), sdzp,
+ sdl->dl_name, tdzp, tdl->dl_name, szp);
+ } else {
+ /*
+ * At this point, we have successfully created
+ * the target name, but have failed to remove
+ * the source name. Since the create was done
+ * with the ZRENAMING flag, there are
+ * complications; for one, the link count is
+ * wrong. The easiest way to deal with this
+ * is to remove the newly created target, and
+ * return the original error. This must
+ * succeed; fortunately, it is very unlikely to
+ * fail, since we just created it.
+ */
+ VERIFY3U(zfs_link_destroy(tdl, szp, tx,
+ ZRENAMING, NULL), ==, 0);
+ }
+ } else {
+ /*
+			 * If we had removed the existing target, a subsequent
+			 * call to zfs_link_create() to add back the same entry,
+			 * but with the new dnode (szp), should not fail.
+ */
+ ASSERT(tzp == NULL);
+ }
+ }
+
+ dmu_tx_commit(tx);
+out:
+ if (zl != NULL)
+ zfs_rename_unlock(&zl);
+
+ zfs_dirent_unlock(sdl);
+ zfs_dirent_unlock(tdl);
+
+ zfs_inode_update(sdzp);
+ if (sdzp == tdzp)
+ rw_exit(&sdzp->z_name_lock);
+
+ if (sdzp != tdzp)
+ zfs_inode_update(tdzp);
+
+ zfs_inode_update(szp);
+ iput(ZTOI(szp));
+ if (tzp) {
+ zfs_inode_update(tzp);
+ iput(ZTOI(tzp));
+ }
+
+ if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ zil_commit(zilog, 0);
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+/*
+ * Insert the indicated symbolic reference entry into the directory.
+ *
+ * IN: dip - Directory to contain new symbolic link.
+ * name - Name of directory entry in dip.
+ * vap - Attributes of new entry.
+ * link - Name for new symlink entry.
+ * cr - credentials of caller.
+ * flags - case flags
+ *
+ * OUT: ipp - Inode for new symbolic link.
+ *
+ * RETURN: 0 on success, error code on failure.
+ *
+ * Timestamps:
+ * dip - ctime|mtime updated
+ */
+/*ARGSUSED*/
+int
+zfs_symlink(struct inode *dip, char *name, vattr_t *vap, char *link,
+ struct inode **ipp, cred_t *cr, int flags)
+{
+ znode_t *zp, *dzp = ITOZ(dip);
+ zfs_dirlock_t *dl;
+ dmu_tx_t *tx;
+ zfsvfs_t *zfsvfs = ITOZSB(dip);
+ zilog_t *zilog;
+ uint64_t len = strlen(link);
+ int error;
+ int zflg = ZNEW;
+ zfs_acl_ids_t acl_ids;
+ boolean_t fuid_dirtied;
+ uint64_t txtype = TX_SYMLINK;
+ boolean_t waited = B_FALSE;
+
+ ASSERT(S_ISLNK(vap->va_mode));
+
+ if (name == NULL)
+ return (SET_ERROR(EINVAL));
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(dzp);
+ zilog = zfsvfs->z_log;
+
+ if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
+ NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EILSEQ));
+ }
+ if (flags & FIGNORECASE)
+ zflg |= ZCILOOK;
+
+ if (len > MAXPATHLEN) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(ENAMETOOLONG));
+ }
+
+ if ((error = zfs_acl_ids_create(dzp, 0,
+ vap, cr, NULL, &acl_ids)) != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+top:
+ *ipp = NULL;
+
+ /*
+ * Attempt to lock directory; fail if entry already exists.
+ */
+ error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL);
+ if (error) {
+ zfs_acl_ids_free(&acl_ids);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) {
+ zfs_acl_ids_free(&acl_ids);
+ zfs_dirent_unlock(dl);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, ZFS_DEFAULT_PROJID)) {
+ zfs_acl_ids_free(&acl_ids);
+ zfs_dirent_unlock(dl);
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EDQUOT));
+ }
+ tx = dmu_tx_create(zfsvfs->z_os);
+ fuid_dirtied = zfsvfs->z_fuid_dirty;
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
+ dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
+ dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
+ ZFS_SA_BASE_ATTR_SIZE + len);
+ dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
+ if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
+ acl_ids.z_aclp->z_acl_bytes);
+ }
+ if (fuid_dirtied)
+ zfs_fuid_txhold(zfsvfs, tx);
+ error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
+ if (error) {
+ zfs_dirent_unlock(dl);
+ if (error == ERESTART) {
+ waited = B_TRUE;
+ dmu_tx_wait(tx);
+ dmu_tx_abort(tx);
+ goto top;
+ }
+ zfs_acl_ids_free(&acl_ids);
+ dmu_tx_abort(tx);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ /*
+ * Create a new object for the symlink.
+	 * For version 4 ZPL datasets the symlink will be an SA attribute.
+ */
+ zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
+
+ if (fuid_dirtied)
+ zfs_fuid_sync(zfsvfs, tx);
+
+ mutex_enter(&zp->z_lock);
+ if (zp->z_is_sa)
+ error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs),
+ link, len, tx);
+ else
+ zfs_sa_symlink(zp, link, len, tx);
+ mutex_exit(&zp->z_lock);
+
+ zp->z_size = len;
+ (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
+ &zp->z_size, sizeof (zp->z_size), tx);
+ /*
+ * Insert the new object into the directory.
+ */
+ error = zfs_link_create(dl, zp, tx, ZNEW);
+ if (error != 0) {
+ zfs_znode_delete(zp, tx);
+ remove_inode_hash(ZTOI(zp));
+ } else {
+ if (flags & FIGNORECASE)
+ txtype |= TX_CI;
+ zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
+
+ zfs_inode_update(dzp);
+ zfs_inode_update(zp);
+ }
+
+ zfs_acl_ids_free(&acl_ids);
+
+ dmu_tx_commit(tx);
+
+ zfs_dirent_unlock(dl);
+
+ if (error == 0) {
+ *ipp = ZTOI(zp);
+
+ if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ zil_commit(zilog, 0);
+ } else {
+ iput(ZTOI(zp));
+ }
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+/*
+ * Return, in the buffer contained in the provided uio structure,
+ * the symbolic path referred to by ip.
+ *
+ * IN: ip - inode of symbolic link
+ * uio - structure to contain the link path.
+ * cr - credentials of caller.
+ *
+ * RETURN: 0 if success
+ * error code if failure
+ *
+ * Timestamps:
+ * ip - atime updated
+ */
+/* ARGSUSED */
+int
+zfs_readlink(struct inode *ip, uio_t *uio, cred_t *cr)
+{
+ znode_t *zp = ITOZ(ip);
+ zfsvfs_t *zfsvfs = ITOZSB(ip);
+ int error;
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ mutex_enter(&zp->z_lock);
+ if (zp->z_is_sa)
+ error = sa_lookup_uio(zp->z_sa_hdl,
+ SA_ZPL_SYMLINK(zfsvfs), uio);
+ else
+ error = zfs_sa_readlink(zp, uio);
+ mutex_exit(&zp->z_lock);
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+/*
+ * Insert a new entry into directory tdip referencing sip.
+ *
+ * IN: tdip - Directory to contain new entry.
+ * sip - inode of new entry.
+ * name - name of new entry.
+ * cr - credentials of caller.
+ * flags - case flags.
+ *
+ * RETURN: 0 if success
+ * error code if failure
+ *
+ * Timestamps:
+ * tdip - ctime|mtime updated
+ * sip - ctime updated
+ */
+/* ARGSUSED */
+int
+zfs_link(struct inode *tdip, struct inode *sip, char *name, cred_t *cr,
+ int flags)
+{
+ znode_t *dzp = ITOZ(tdip);
+ znode_t *tzp, *szp;
+ zfsvfs_t *zfsvfs = ITOZSB(tdip);
+ zilog_t *zilog;
+ zfs_dirlock_t *dl;
+ dmu_tx_t *tx;
+ int error;
+ int zf = ZNEW;
+ uint64_t parent;
+ uid_t owner;
+ boolean_t waited = B_FALSE;
+ boolean_t is_tmpfile = 0;
+ uint64_t txg;
+#ifdef HAVE_TMPFILE
+ is_tmpfile = (sip->i_nlink == 0 && (sip->i_state & I_LINKABLE));
+#endif
+ ASSERT(S_ISDIR(tdip->i_mode));
+
+ if (name == NULL)
+ return (SET_ERROR(EINVAL));
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(dzp);
+ zilog = zfsvfs->z_log;
+
+ /*
+ * POSIX dictates that we return EPERM here.
+ * Better choices include ENOTSUP or EISDIR.
+ */
+ if (S_ISDIR(sip->i_mode)) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EPERM));
+ }
+
+ szp = ITOZ(sip);
+ ZFS_VERIFY_ZP(szp);
+
+ /*
+	 * If we are using project inheritance, meaning the directory has
+	 * ZFS_PROJINHERIT set, then its descendant directories will inherit
+	 * not only the project ID but also the ZFS_PROJINHERIT flag.  In
+	 * that case, we only allow hard link creation in our tree when the
+	 * project IDs are the same.
+ */
+ if (dzp->z_pflags & ZFS_PROJINHERIT && dzp->z_projid != szp->z_projid) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EXDEV));
+ }
+
+ /*
+ * We check i_sb because snapshots and the ctldir must have different
+ * super blocks.
+ */
+ if (sip->i_sb != tdip->i_sb || zfsctl_is_node(sip)) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EXDEV));
+ }
+
+ /* Prevent links to .zfs/shares files */
+
+ if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
+ &parent, sizeof (uint64_t))) != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+ if (parent == zfsvfs->z_shares_dir) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EPERM));
+ }
+
+ if (zfsvfs->z_utf8 && u8_validate(name,
+ strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EILSEQ));
+ }
+ if (flags & FIGNORECASE)
+ zf |= ZCILOOK;
+
+ /*
+ * We do not support links between attributes and non-attributes
+ * because of the potential security risk of creating links
+ * into "normal" file space in order to circumvent restrictions
+ * imposed in attribute space.
+ */
+ if ((szp->z_pflags & ZFS_XATTR) != (dzp->z_pflags & ZFS_XATTR)) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EINVAL));
+ }
+
+ owner = zfs_fuid_map_id(zfsvfs, KUID_TO_SUID(sip->i_uid),
+ cr, ZFS_OWNER);
+ if (owner != crgetuid(cr) && secpolicy_basic_link(cr) != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EPERM));
+ }
+
+ if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+top:
+ /*
+ * Attempt to lock directory; fail if entry already exists.
+ */
+ error = zfs_dirent_lock(&dl, dzp, name, &tzp, zf, NULL, NULL);
+ if (error) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
+ dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
+ if (is_tmpfile)
+ dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
+
+ zfs_sa_upgrade_txholds(tx, szp);
+ zfs_sa_upgrade_txholds(tx, dzp);
+ error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
+ if (error) {
+ zfs_dirent_unlock(dl);
+ if (error == ERESTART) {
+ waited = B_TRUE;
+ dmu_tx_wait(tx);
+ dmu_tx_abort(tx);
+ goto top;
+ }
+ dmu_tx_abort(tx);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+ /* unmark z_unlinked so zfs_link_create will not reject */
+ if (is_tmpfile)
+ szp->z_unlinked = B_FALSE;
+ error = zfs_link_create(dl, szp, tx, 0);
+
+ if (error == 0) {
+ uint64_t txtype = TX_LINK;
+ /*
+		 * A tmpfile is created in z_unlinkedobj, so remove it from
+		 * there.  Also, we don't log to the ZIL, because all previous
+		 * file operations on the tmpfile are ignored by the ZIL.
+		 * Instead we always wait for the txg to sync to make sure
+		 * all previous operations are sync safe.
+ */
+ if (is_tmpfile) {
+ VERIFY(zap_remove_int(zfsvfs->z_os,
+ zfsvfs->z_unlinkedobj, szp->z_id, tx) == 0);
+ } else {
+ if (flags & FIGNORECASE)
+ txtype |= TX_CI;
+ zfs_log_link(zilog, tx, txtype, dzp, szp, name);
+ }
+ } else if (is_tmpfile) {
+		/* restore z_unlinked since linking failed */
+ szp->z_unlinked = B_TRUE;
+ }
+ txg = dmu_tx_get_txg(tx);
+ dmu_tx_commit(tx);
+
+ zfs_dirent_unlock(dl);
+
+ if (!is_tmpfile && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ zil_commit(zilog, 0);
+
+ if (is_tmpfile)
+ txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), txg);
+
+ zfs_inode_update(dzp);
+ zfs_inode_update(szp);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+static void
+zfs_putpage_commit_cb(void *arg)
+{
+ struct page *pp = arg;
+
+ ClearPageError(pp);
+ end_page_writeback(pp);
+}
+
+/*
+ * Push a page out to disk; once the page is on stable storage the
+ * registered commit callback will be run as notification of completion.
+ *
+ * IN:	ip	- inode of the file containing the mapped page.
+ * pp - page to push (page is locked)
+ * wbc - writeback control data
+ *
+ * RETURN: 0 if success
+ * error code if failure
+ *
+ * Timestamps:
+ * ip - ctime|mtime updated
+ */
+/* ARGSUSED */
+int
+zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc)
+{
+ znode_t *zp = ITOZ(ip);
+ zfsvfs_t *zfsvfs = ITOZSB(ip);
+ loff_t offset;
+ loff_t pgoff;
+ unsigned int pglen;
+ dmu_tx_t *tx;
+ caddr_t va;
+ int err = 0;
+ uint64_t mtime[2], ctime[2];
+ sa_bulk_attr_t bulk[3];
+ int cnt = 0;
+ struct address_space *mapping;
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ ASSERT(PageLocked(pp));
+
+ pgoff = page_offset(pp); /* Page byte-offset in file */
+ offset = i_size_read(ip); /* File length in bytes */
+ pglen = MIN(PAGE_SIZE, /* Page length in bytes */
+ P2ROUNDUP(offset, PAGE_SIZE)-pgoff);
+
+ /* Page is beyond end of file */
+ if (pgoff >= offset) {
+ unlock_page(pp);
+ ZFS_EXIT(zfsvfs);
+ return (0);
+ }
+
+ /* Truncate page length to end of file */
+ if (pgoff + pglen > offset)
+ pglen = offset - pgoff;
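+
+	/*
+	 * For example, assuming 4 KiB pages, with a 6000-byte file and the
+	 * page at pgoff 4096, pglen starts as MIN(PAGE_SIZE,
+	 * P2ROUNDUP(6000, 4096) - 4096) == 4096 and is then truncated to
+	 * 6000 - 4096 == 1904, so only bytes inside the file are written.
+	 */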
+
+#if 0
+ /*
+ * FIXME: Allow mmap writes past its quota. The correct fix
+ * is to register a page_mkwrite() handler to count the page
+ * against its quota when it is about to be dirtied.
+ */
+ if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT,
+ KUID_TO_SUID(ip->i_uid)) ||
+ zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT,
+ KGID_TO_SGID(ip->i_gid)) ||
+ (zp->z_projid != ZFS_DEFAULT_PROJID &&
+ zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT,
+ zp->z_projid))) {
+ err = EDQUOT;
+ }
+#endif
+
+ /*
+ * The ordering here is critical and must adhere to the following
+ * rules in order to avoid deadlocking in either zfs_read() or
+ * zfs_free_range() due to a lock inversion.
+ *
+ * 1) The page must be unlocked prior to acquiring the range lock.
+ * This is critical because zfs_read() calls find_lock_page()
+ * which may block on the page lock while holding the range lock.
+ *
+ * 2) Before setting or clearing write back on a page the range lock
+ * must be held in order to prevent a lock inversion with the
+ * zfs_free_range() function.
+ *
+ * This presents a problem because upon entering this function the
+ * page lock is already held. To safely acquire the range lock the
+ * page lock must be dropped. This creates a window where another
+ * process could truncate, invalidate, dirty, or write out the page.
+ *
+ * Therefore, after successfully reacquiring the range and page locks
+	 * the current page state is checked.  In the common case everything
+	 * will be as expected and it can be written out.  However, if
+ * the page state has changed it must be handled accordingly.
+ */
+ mapping = pp->mapping;
+ redirty_page_for_writepage(wbc, pp);
+ unlock_page(pp);
+
+ locked_range_t *lr = rangelock_enter(&zp->z_rangelock,
+ pgoff, pglen, RL_WRITER);
+ lock_page(pp);
+
+ /* Page mapping changed or it was no longer dirty, we're done */
+ if (unlikely((mapping != pp->mapping) || !PageDirty(pp))) {
+ unlock_page(pp);
+ rangelock_exit(lr);
+ ZFS_EXIT(zfsvfs);
+ return (0);
+ }
+
+	/* Another process started writeback; block if required */
+ if (PageWriteback(pp)) {
+ unlock_page(pp);
+ rangelock_exit(lr);
+
+ if (wbc->sync_mode != WB_SYNC_NONE) {
+ if (PageWriteback(pp))
+ wait_on_page_bit(pp, PG_writeback);
+ }
+
+ ZFS_EXIT(zfsvfs);
+ return (0);
+ }
+
+	/* Clear the dirty flag now that the required locks are held */
+ if (!clear_page_dirty_for_io(pp)) {
+ unlock_page(pp);
+ rangelock_exit(lr);
+ ZFS_EXIT(zfsvfs);
+ return (0);
+ }
+
+ /*
+ * Counterpart for redirty_page_for_writepage() above. This page
+ * was in fact not skipped and should not be counted as if it were.
+ */
+ wbc->pages_skipped--;
+ set_page_writeback(pp);
+ unlock_page(pp);
+
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_write(tx, zp->z_id, pgoff, pglen);
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+ zfs_sa_upgrade_txholds(tx, zp);
+
+ err = dmu_tx_assign(tx, TXG_NOWAIT);
+ if (err != 0) {
+ if (err == ERESTART)
+ dmu_tx_wait(tx);
+
+ dmu_tx_abort(tx);
+ __set_page_dirty_nobuffers(pp);
+ ClearPageError(pp);
+ end_page_writeback(pp);
+ rangelock_exit(lr);
+ ZFS_EXIT(zfsvfs);
+ return (err);
+ }
+
+ va = kmap(pp);
+ ASSERT3U(pglen, <=, PAGE_SIZE);
+ dmu_write(zfsvfs->z_os, zp->z_id, pgoff, pglen, va, tx);
+ kunmap(pp);
+
+ SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
+ SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
+ SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_FLAGS(zfsvfs), NULL,
+ &zp->z_pflags, 8);
+
+ /* Preserve the mtime and ctime provided by the inode */
+ ZFS_TIME_ENCODE(&ip->i_mtime, mtime);
+ ZFS_TIME_ENCODE(&ip->i_ctime, ctime);
+ zp->z_atime_dirty = B_FALSE;
+ zp->z_seq++;
+
+ err = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx);
+
+ zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, pgoff, pglen, 0,
+ zfs_putpage_commit_cb, pp);
+ dmu_tx_commit(tx);
+
+ rangelock_exit(lr);
+
+ if (wbc->sync_mode != WB_SYNC_NONE) {
+ /*
+ * Note that this is rarely called under writepages(), because
+ * writepages() normally handles the entire commit for
+ * performance reasons.
+ */
+ zil_commit(zfsvfs->z_log, zp->z_id);
+ }
+
+ ZFS_EXIT(zfsvfs);
+ return (err);
+}
+
+/*
+ * Update the system attributes when the inode has been dirtied. For the
+ * moment we only update the mode, atime, mtime, and ctime.
+ */
+int
+zfs_dirty_inode(struct inode *ip, int flags)
+{
+ znode_t *zp = ITOZ(ip);
+ zfsvfs_t *zfsvfs = ITOZSB(ip);
+ dmu_tx_t *tx;
+ uint64_t mode, atime[2], mtime[2], ctime[2];
+ sa_bulk_attr_t bulk[4];
+ int error = 0;
+ int cnt = 0;
+
+ if (zfs_is_readonly(zfsvfs) || dmu_objset_is_snapshot(zfsvfs->z_os))
+ return (0);
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+#ifdef I_DIRTY_TIME
+ /*
+	 * This is the lazytime semantic introduced in Linux 4.0.
+	 * This flag is only passed in from update_time() when lazytime is set.
+	 * (Note, I_DIRTY_SYNC will also be set if lazytime is not in use.)
+ * Fortunately mtime and ctime are managed within ZFS itself, so we
+ * only need to dirty atime.
+ */
+ if (flags == I_DIRTY_TIME) {
+ zp->z_atime_dirty = B_TRUE;
+ goto out;
+ }
+#endif
+
+ tx = dmu_tx_create(zfsvfs->z_os);
+
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+ zfs_sa_upgrade_txholds(tx, zp);
+
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ dmu_tx_abort(tx);
+ goto out;
+ }
+
+ mutex_enter(&zp->z_lock);
+ zp->z_atime_dirty = B_FALSE;
+
+ SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8);
+ SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16);
+ SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
+ SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
+
+ /* Preserve the mode, mtime and ctime provided by the inode */
+ ZFS_TIME_ENCODE(&ip->i_atime, atime);
+ ZFS_TIME_ENCODE(&ip->i_mtime, mtime);
+ ZFS_TIME_ENCODE(&ip->i_ctime, ctime);
+ mode = ip->i_mode;
+
+ zp->z_mode = mode;
+
+ error = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx);
+ mutex_exit(&zp->z_lock);
+
+ dmu_tx_commit(tx);
+out:
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+/*ARGSUSED*/
+void
+zfs_inactive(struct inode *ip)
+{
+ znode_t *zp = ITOZ(ip);
+ zfsvfs_t *zfsvfs = ITOZSB(ip);
+ uint64_t atime[2];
+ int error;
+ int need_unlock = 0;
+
+ /* Only read lock if we haven't already write locked, e.g. rollback */
+ if (!RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock)) {
+ need_unlock = 1;
+ rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
+ }
+ if (zp->z_sa_hdl == NULL) {
+ if (need_unlock)
+ rw_exit(&zfsvfs->z_teardown_inactive_lock);
+ return;
+ }
+
+ if (zp->z_atime_dirty && zp->z_unlinked == B_FALSE) {
+ dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
+
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+ zfs_sa_upgrade_txholds(tx, zp);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ dmu_tx_abort(tx);
+ } else {
+ ZFS_TIME_ENCODE(&ip->i_atime, atime);
+ mutex_enter(&zp->z_lock);
+ (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs),
+ (void *)&atime, sizeof (atime), tx);
+ zp->z_atime_dirty = B_FALSE;
+ mutex_exit(&zp->z_lock);
+ dmu_tx_commit(tx);
+ }
+ }
+
+ zfs_zinactive(zp);
+ if (need_unlock)
+ rw_exit(&zfsvfs->z_teardown_inactive_lock);
+}
+
+/*
+ * Bounds-check the seek operation.
+ *
+ * IN: ip - inode seeking within
+ * ooff - old file offset
+ * noffp - pointer to new file offset
+ *
+ * RETURN: 0 if success
+ * EINVAL if new offset invalid
+ */
+/* ARGSUSED */
+int
+zfs_seek(struct inode *ip, offset_t ooff, offset_t *noffp)
+{
+ if (S_ISDIR(ip->i_mode))
+ return (0);
+ return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0);
+}
+
+/*
+ * Fill pages with data from the disk.
+ */
+static int
+zfs_fillpage(struct inode *ip, struct page *pl[], int nr_pages)
+{
+ znode_t *zp = ITOZ(ip);
+ zfsvfs_t *zfsvfs = ITOZSB(ip);
+ objset_t *os;
+ struct page *cur_pp;
+ u_offset_t io_off, total;
+ size_t io_len;
+ loff_t i_size;
+ unsigned page_idx;
+ int err;
+
+ os = zfsvfs->z_os;
+ io_len = nr_pages << PAGE_SHIFT;
+ i_size = i_size_read(ip);
+ io_off = page_offset(pl[0]);
+
+ if (io_off + io_len > i_size)
+ io_len = i_size - io_off;
+
+ /*
+ * Iterate over list of pages and read each page individually.
+ */
+ page_idx = 0;
+ for (total = io_off + io_len; io_off < total; io_off += PAGESIZE) {
+ caddr_t va;
+
+ cur_pp = pl[page_idx++];
+ va = kmap(cur_pp);
+ err = dmu_read(os, zp->z_id, io_off, PAGESIZE, va,
+ DMU_READ_PREFETCH);
+ kunmap(cur_pp);
+ if (err) {
+ /* convert checksum errors into IO errors */
+ if (err == ECKSUM)
+ err = SET_ERROR(EIO);
+ return (err);
+ }
+ }
+
+ return (0);
+}
+
+/*
+ * Uses zfs_fillpage to read data from the file and fill the pages.
+ *
+ * IN: ip - inode of file to get data from.
+ * pl - list of pages to read
+ * nr_pages - number of pages to read
+ *
+ * RETURN: 0 on success, error code on failure.
+ *
+ * Timestamps:
+ * vp - atime updated
+ */
+/* ARGSUSED */
+int
+zfs_getpage(struct inode *ip, struct page *pl[], int nr_pages)
+{
+ znode_t *zp = ITOZ(ip);
+ zfsvfs_t *zfsvfs = ITOZSB(ip);
+ int err;
+
+ if (pl == NULL)
+ return (0);
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ err = zfs_fillpage(ip, pl, nr_pages);
+
+ ZFS_EXIT(zfsvfs);
+ return (err);
+}
+
+/*
+ * Check ZFS specific permissions to memory map a section of a file.
+ *
+ * IN: ip - inode of the file to mmap
+ * off - file offset
+ * addrp - start address in memory region
+ * len - length of memory region
+ * vm_flags- address flags
+ *
+ * RETURN: 0 if success
+ * error code if failure
+ */
+/*ARGSUSED*/
+int
+zfs_map(struct inode *ip, offset_t off, caddr_t *addrp, size_t len,
+ unsigned long vm_flags)
+{
+ znode_t *zp = ITOZ(ip);
+ zfsvfs_t *zfsvfs = ITOZSB(ip);
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ if ((vm_flags & VM_WRITE) && (zp->z_pflags &
+ (ZFS_IMMUTABLE | ZFS_READONLY | ZFS_APPENDONLY))) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EPERM));
+ }
+
+ if ((vm_flags & (VM_READ | VM_EXEC)) &&
+ (zp->z_pflags & ZFS_AV_QUARANTINED)) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EACCES));
+ }
+
+ if (off < 0 || len > MAXOFFSET_T - off) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(ENXIO));
+ }
+
+ ZFS_EXIT(zfsvfs);
+ return (0);
+}
+
+/*
+ * convoff - converts the given lock data (l_start, l_whence) to be
+ * relative to the given whence.
+ */
+int
+convoff(struct inode *ip, flock64_t *lckdat, int whence, offset_t offset)
+{
+ vattr_t vap;
+ int error;
+
+ if ((lckdat->l_whence == SEEK_END) || (whence == SEEK_END)) {
+ if ((error = zfs_getattr(ip, &vap, 0, CRED())))
+ return (error);
+ }
+
+ switch (lckdat->l_whence) {
+ case SEEK_CUR:
+ lckdat->l_start += offset;
+ break;
+ case SEEK_END:
+ lckdat->l_start += vap.va_size;
+ /* FALLTHRU */
+ case SEEK_SET:
+ break;
+ default:
+ return (SET_ERROR(EINVAL));
+ }
+
+ if (lckdat->l_start < 0)
+ return (SET_ERROR(EINVAL));
+
+ switch (whence) {
+ case SEEK_CUR:
+ lckdat->l_start -= offset;
+ break;
+ case SEEK_END:
+ lckdat->l_start -= vap.va_size;
+ /* FALLTHRU */
+ case SEEK_SET:
+ break;
+ default:
+ return (SET_ERROR(EINVAL));
+ }
+
+ lckdat->l_whence = (short)whence;
+ return (0);
+}
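+
+/*
+ * For example, converting a SEEK_END description to SEEK_SET on a
+ * 100-byte file: with l_whence == SEEK_END and l_start == -10, the first
+ * switch rebases l_start to 90, the second switch (whence == SEEK_SET)
+ * subtracts nothing, and the caller is left with the absolute offset 90.
+ */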
+
+/*
+ * Free or allocate space in a file. Currently, this function only
+ * supports the `F_FREESP' command. However, this command is somewhat
+ * misnamed, as its functionality includes the ability to allocate as
+ * well as free space.
+ *
+ * IN: ip - inode of file to free data in.
+ * cmd - action to take (only F_FREESP supported).
+ * bfp - section of file to free/alloc.
+ * flag - current file open mode flags.
+ * offset - current file offset.
+ * cr - credentials of caller.
+ *
+ * RETURN: 0 on success, error code on failure.
+ *
+ * Timestamps:
+ * ip - ctime|mtime updated
+ */
+/* ARGSUSED */
+int
+zfs_space(struct inode *ip, int cmd, flock64_t *bfp, int flag,
+ offset_t offset, cred_t *cr)
+{
+ znode_t *zp = ITOZ(ip);
+ zfsvfs_t *zfsvfs = ITOZSB(ip);
+ uint64_t off, len;
+ int error;
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ if (cmd != F_FREESP) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EINVAL));
+ }
+
+ /*
+ * Callers might not be able to detect properly that we are read-only,
+ * so check it explicitly here.
+ */
+ if (zfs_is_readonly(zfsvfs)) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EROFS));
+ }
+
+ if ((error = convoff(ip, bfp, SEEK_SET, offset))) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ if (bfp->l_len < 0) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EINVAL));
+ }
+
+ /*
+ * Permissions aren't checked on Solaris because on this OS
+ * zfs_space() can only be called with an opened file handle.
+ * On Linux we can get here through truncate_range() which
+ * operates directly on inodes, so we need to check access rights.
+ */
+ if ((error = zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr))) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ off = bfp->l_start;
+ len = bfp->l_len; /* 0 means from off to end of file */
+
+ error = zfs_freesp(zp, off, len, flag, TRUE);
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+/*ARGSUSED*/
+int
+zfs_fid(struct inode *ip, fid_t *fidp)
+{
+ znode_t *zp = ITOZ(ip);
+ zfsvfs_t *zfsvfs = ITOZSB(ip);
+ uint32_t gen;
+ uint64_t gen64;
+ uint64_t object = zp->z_id;
+ zfid_short_t *zfid;
+ int size, i, error;
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs),
+ &gen64, sizeof (uint64_t))) != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ gen = (uint32_t)gen64;
+
+ size = SHORT_FID_LEN;
+
+ zfid = (zfid_short_t *)fidp;
+
+ zfid->zf_len = size;
+
+ for (i = 0; i < sizeof (zfid->zf_object); i++)
+ zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
+
+ /* Must have a non-zero generation number to distinguish from .zfs */
+ if (gen == 0)
+ gen = 1;
+ for (i = 0; i < sizeof (zfid->zf_gen); i++)
+ zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));
+
+ ZFS_EXIT(zfsvfs);
+ return (0);
+}
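+
+/*
+ * For example, object number 0x1234 with generation 7 is packed one byte
+ * at a time in little-endian order: zf_object[] begins { 0x34, 0x12, 0, ... }
+ * and zf_gen[] begins { 0x07, 0, ... }, so the fid decodes identically on
+ * hosts of either endianness.
+ */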
+
+/*ARGSUSED*/
+int
+zfs_getsecattr(struct inode *ip, vsecattr_t *vsecp, int flag, cred_t *cr)
+{
+ znode_t *zp = ITOZ(ip);
+ zfsvfs_t *zfsvfs = ITOZSB(ip);
+ int error;
+ boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+ error = zfs_getacl(zp, vsecp, skipaclchk, cr);
+ ZFS_EXIT(zfsvfs);
+
+ return (error);
+}
+
+/*ARGSUSED*/
+int
+zfs_setsecattr(struct inode *ip, vsecattr_t *vsecp, int flag, cred_t *cr)
+{
+ znode_t *zp = ITOZ(ip);
+ zfsvfs_t *zfsvfs = ITOZSB(ip);
+ int error;
+ boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
+ zilog_t *zilog = zfsvfs->z_log;
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ error = zfs_setacl(zp, vsecp, skipaclchk, cr);
+
+ if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ zil_commit(zilog, 0);
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+#ifdef HAVE_UIO_ZEROCOPY
+/*
+ * The smallest read we may consider to loan out an arcbuf.
+ * This must be a power of 2.
+ */
+int zcr_blksz_min = (1 << 10); /* 1K */
+/*
+ * If set to less than the file block size, allow loaning out of an
+ * arcbuf for a partial block read. This must be a power of 2.
+ */
+int zcr_blksz_max = (1 << 17); /* 128K */
+
+/*ARGSUSED*/
+static int
+zfs_reqzcbuf(struct inode *ip, enum uio_rw ioflag, xuio_t *xuio, cred_t *cr)
+{
+ znode_t *zp = ITOZ(ip);
+ zfsvfs_t *zfsvfs = ITOZSB(ip);
+ int max_blksz = zfsvfs->z_max_blksz;
+ uio_t *uio = &xuio->xu_uio;
+ ssize_t size = uio->uio_resid;
+ offset_t offset = uio->uio_loffset;
+ int blksz;
+ int fullblk, i;
+ arc_buf_t *abuf;
+ ssize_t maxsize;
+ int preamble, postamble;
+
+ if (xuio->xu_type != UIOTYPE_ZEROCOPY)
+ return (SET_ERROR(EINVAL));
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+ switch (ioflag) {
+ case UIO_WRITE:
+ /*
+ * Loan out an arc_buf for write if write size is bigger than
+ * max_blksz, and the file's block size is also max_blksz.
+ */
+ blksz = max_blksz;
+ if (size < blksz || zp->z_blksz != blksz) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EINVAL));
+ }
+ /*
+ * Caller requests buffers for write before knowing where the
+ * write offset might be (e.g. NFS TCP write).
+ */
+ if (offset == -1) {
+ preamble = 0;
+ } else {
+ preamble = P2PHASE(offset, blksz);
+ if (preamble) {
+ preamble = blksz - preamble;
+ size -= preamble;
+ }
+ }
+
+ postamble = P2PHASE(size, blksz);
+ size -= postamble;
+
+ fullblk = size / blksz;
+ (void) dmu_xuio_init(xuio,
+ (preamble != 0) + fullblk + (postamble != 0));
+
+ /*
+ * Have to fix iov base/len for partial buffers. They
+ * currently represent full arc_buf's.
+ */
+ if (preamble) {
+ /* data begins in the middle of the arc_buf */
+ abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
+ blksz);
+ ASSERT(abuf);
+ (void) dmu_xuio_add(xuio, abuf,
+ blksz - preamble, preamble);
+ }
+
+ for (i = 0; i < fullblk; i++) {
+ abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
+ blksz);
+ ASSERT(abuf);
+ (void) dmu_xuio_add(xuio, abuf, 0, blksz);
+ }
+
+ if (postamble) {
+ /* data ends in the middle of the arc_buf */
+ abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
+ blksz);
+ ASSERT(abuf);
+ (void) dmu_xuio_add(xuio, abuf, 0, postamble);
+ }
+ break;
+ case UIO_READ:
+ /*
+ * Loan out an arc_buf for read if the read size is larger than
+ * the current file block size. Block alignment is not
+ * considered. Partial arc_buf will be loaned out for read.
+ */
+ blksz = zp->z_blksz;
+ if (blksz < zcr_blksz_min)
+ blksz = zcr_blksz_min;
+ if (blksz > zcr_blksz_max)
+ blksz = zcr_blksz_max;
+ /* avoid potential complexity of dealing with it */
+ if (blksz > max_blksz) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EINVAL));
+ }
+
+ maxsize = zp->z_size - uio->uio_loffset;
+ if (size > maxsize)
+ size = maxsize;
+
+ if (size < blksz) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EINVAL));
+ }
+ break;
+ default:
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EINVAL));
+ }
+
+ uio->uio_extflg = UIO_XUIO;
+ XUIO_XUZC_RW(xuio) = ioflag;
+ ZFS_EXIT(zfsvfs);
+ return (0);
+}
+
+/*ARGSUSED*/
+static int
+zfs_retzcbuf(struct inode *ip, xuio_t *xuio, cred_t *cr)
+{
+ int i;
+ arc_buf_t *abuf;
+ int ioflag = XUIO_XUZC_RW(xuio);
+
+ ASSERT(xuio->xu_type == UIOTYPE_ZEROCOPY);
+
+ i = dmu_xuio_cnt(xuio);
+ while (i-- > 0) {
+ abuf = dmu_xuio_arcbuf(xuio, i);
+ /*
+		 * If abuf == NULL, it must be a write buffer
+ * that has been returned in zfs_write().
+ */
+ if (abuf)
+ dmu_return_arcbuf(abuf);
+ ASSERT(abuf || ioflag == UIO_WRITE);
+ }
+
+ dmu_xuio_fini(xuio);
+ return (0);
+}
+#endif /* HAVE_UIO_ZEROCOPY */
+
+#if defined(_KERNEL)
+EXPORT_SYMBOL(zfs_open);
+EXPORT_SYMBOL(zfs_close);
+EXPORT_SYMBOL(zfs_read);
+EXPORT_SYMBOL(zfs_write);
+EXPORT_SYMBOL(zfs_access);
+EXPORT_SYMBOL(zfs_lookup);
+EXPORT_SYMBOL(zfs_create);
+EXPORT_SYMBOL(zfs_tmpfile);
+EXPORT_SYMBOL(zfs_remove);
+EXPORT_SYMBOL(zfs_mkdir);
+EXPORT_SYMBOL(zfs_rmdir);
+EXPORT_SYMBOL(zfs_readdir);
+EXPORT_SYMBOL(zfs_fsync);
+EXPORT_SYMBOL(zfs_getattr);
+EXPORT_SYMBOL(zfs_getattr_fast);
+EXPORT_SYMBOL(zfs_setattr);
+EXPORT_SYMBOL(zfs_rename);
+EXPORT_SYMBOL(zfs_symlink);
+EXPORT_SYMBOL(zfs_readlink);
+EXPORT_SYMBOL(zfs_link);
+EXPORT_SYMBOL(zfs_inactive);
+EXPORT_SYMBOL(zfs_space);
+EXPORT_SYMBOL(zfs_fid);
+EXPORT_SYMBOL(zfs_getsecattr);
+EXPORT_SYMBOL(zfs_setsecattr);
+EXPORT_SYMBOL(zfs_getpage);
+EXPORT_SYMBOL(zfs_putpage);
+EXPORT_SYMBOL(zfs_dirty_inode);
+EXPORT_SYMBOL(zfs_map);
+
+/* BEGIN CSTYLED */
+module_param(zfs_delete_blocks, ulong, 0644);
+MODULE_PARM_DESC(zfs_delete_blocks, "Delete files larger than N blocks async");
+module_param(zfs_read_chunk_size, ulong, 0644);
+MODULE_PARM_DESC(zfs_read_chunk_size, "Bytes to read per chunk");
+/* END CSTYLED */
+
+#endif
diff --git a/module/os/linux/zfs/zfs_znode.c b/module/os/linux/zfs/zfs_znode.c
new file mode 100644
index 000000000..549c701a0
--- /dev/null
+++ b/module/os/linux/zfs/zfs_znode.c
@@ -0,0 +1,2234 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
+ */
+
+/* Portions Copyright 2007 Jeremy Teo */
+
+#ifdef _KERNEL
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/time.h>
+#include <sys/sysmacros.h>
+#include <sys/mntent.h>
+#include <sys/u8_textprep.h>
+#include <sys/dsl_dataset.h>
+#include <sys/vfs.h>
+#include <sys/vnode.h>
+#include <sys/file.h>
+#include <sys/kmem.h>
+#include <sys/errno.h>
+#include <sys/mode.h>
+#include <sys/atomic.h>
+#include <sys/zfs_dir.h>
+#include <sys/zfs_acl.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zfs_rlock.h>
+#include <sys/zfs_fuid.h>
+#include <sys/zfs_vnops.h>
+#include <sys/zfs_ctldir.h>
+#include <sys/dnode.h>
+#include <sys/fs/zfs.h>
+#include <sys/zpl.h>
+#endif /* _KERNEL */
+
+#include <sys/dmu.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_tx.h>
+#include <sys/refcount.h>
+#include <sys/stat.h>
+#include <sys/zap.h>
+#include <sys/zfs_znode.h>
+#include <sys/sa.h>
+#include <sys/zfs_sa.h>
+#include <sys/zfs_stat.h>
+
+#include "zfs_prop.h"
+#include "zfs_comutil.h"
+
+/*
+ * Functions needed for userland (i.e., libzpool) are not put under
+ * #ifdef _KERNEL; the rest of the functions have dependencies
+ * (such as VFS logic) that will not compile easily in userland.
+ */
+#ifdef _KERNEL
+
+static kmem_cache_t *znode_cache = NULL;
+static kmem_cache_t *znode_hold_cache = NULL;
+unsigned int zfs_object_mutex_size = ZFS_OBJ_MTX_SZ;
+
+/*
+ * This is used by the test suite so that it can delay znodes from being
+ * freed in order to inspect the unlinked set.
+ */
+int zfs_unlink_suspend_progress = 0;
+
+/*
+ * This callback is invoked when acquiring a RL_WRITER or RL_APPEND lock on
+ * z_rangelock. It will modify the offset and length of the lock to reflect
+ * znode-specific information, and convert RL_APPEND to RL_WRITER. This is
+ * called with the rangelock_t's rl_lock held, which avoids races.
+ */
+static void
+zfs_rangelock_cb(locked_range_t *new, void *arg)
+{
+ znode_t *zp = arg;
+
+ /*
+ * If in append mode, convert to writer and lock starting at the
+ * current end of file.
+ */
+ if (new->lr_type == RL_APPEND) {
+ new->lr_offset = zp->z_size;
+ new->lr_type = RL_WRITER;
+ }
+
+ /*
+ * If we need to grow the block size then lock the whole file range.
+ */
+ uint64_t end_size = MAX(zp->z_size, new->lr_offset + new->lr_length);
+ if (end_size > zp->z_blksz && (!ISP2(zp->z_blksz) ||
+ zp->z_blksz < ZTOZSB(zp)->z_max_blksz)) {
+ new->lr_offset = 0;
+ new->lr_length = UINT64_MAX;
+ }
+}
+
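+/*
+ * The callback above is registered against each znode's rangelock when
+ * the znode is constructed (see zfs_znode_cache_constructor() below):
+ *
+ *	rangelock_init(&zp->z_rangelock, zfs_rangelock_cb, zp);
+ */
+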
+/*ARGSUSED*/
+static int
+zfs_znode_cache_constructor(void *buf, void *arg, int kmflags)
+{
+ znode_t *zp = buf;
+
+ inode_init_once(ZTOI(zp));
+ list_link_init(&zp->z_link_node);
+
+ mutex_init(&zp->z_lock, NULL, MUTEX_DEFAULT, NULL);
+ rw_init(&zp->z_parent_lock, NULL, RW_DEFAULT, NULL);
+ rw_init(&zp->z_name_lock, NULL, RW_NOLOCKDEP, NULL);
+ mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL);
+ rw_init(&zp->z_xattr_lock, NULL, RW_DEFAULT, NULL);
+
+ rangelock_init(&zp->z_rangelock, zfs_rangelock_cb, zp);
+
+ zp->z_dirlocks = NULL;
+ zp->z_acl_cached = NULL;
+ zp->z_xattr_cached = NULL;
+ zp->z_xattr_parent = 0;
+ zp->z_moved = B_FALSE;
+ return (0);
+}
+
+/*ARGSUSED*/
+static void
+zfs_znode_cache_destructor(void *buf, void *arg)
+{
+ znode_t *zp = buf;
+
+ ASSERT(!list_link_active(&zp->z_link_node));
+ mutex_destroy(&zp->z_lock);
+ rw_destroy(&zp->z_parent_lock);
+ rw_destroy(&zp->z_name_lock);
+ mutex_destroy(&zp->z_acl_lock);
+ rw_destroy(&zp->z_xattr_lock);
+ rangelock_fini(&zp->z_rangelock);
+
+ ASSERT(zp->z_dirlocks == NULL);
+ ASSERT(zp->z_acl_cached == NULL);
+ ASSERT(zp->z_xattr_cached == NULL);
+}
+
+static int
+zfs_znode_hold_cache_constructor(void *buf, void *arg, int kmflags)
+{
+ znode_hold_t *zh = buf;
+
+ mutex_init(&zh->zh_lock, NULL, MUTEX_DEFAULT, NULL);
+ zfs_refcount_create(&zh->zh_refcount);
+ zh->zh_obj = ZFS_NO_OBJECT;
+
+ return (0);
+}
+
+static void
+zfs_znode_hold_cache_destructor(void *buf, void *arg)
+{
+ znode_hold_t *zh = buf;
+
+ mutex_destroy(&zh->zh_lock);
+ zfs_refcount_destroy(&zh->zh_refcount);
+}
+
+void
+zfs_znode_init(void)
+{
+	/*
+	 * Initialize the znode cache. The KMC_SLAB hint is used so that
+	 * the cache is backed by the Linux slab allocator, which ensures
+	 * that any wait_on_bit() operations on the related inode operate
+	 * properly.
+	 */
+ ASSERT(znode_cache == NULL);
+ znode_cache = kmem_cache_create("zfs_znode_cache",
+ sizeof (znode_t), 0, zfs_znode_cache_constructor,
+ zfs_znode_cache_destructor, NULL, NULL, NULL, KMC_SLAB);
+
+ ASSERT(znode_hold_cache == NULL);
+ znode_hold_cache = kmem_cache_create("zfs_znode_hold_cache",
+ sizeof (znode_hold_t), 0, zfs_znode_hold_cache_constructor,
+ zfs_znode_hold_cache_destructor, NULL, NULL, NULL, 0);
+}
+
+void
+zfs_znode_fini(void)
+{
+	/*
+	 * Clean up the znode caches.
+	 */
+ if (znode_cache)
+ kmem_cache_destroy(znode_cache);
+ znode_cache = NULL;
+
+ if (znode_hold_cache)
+ kmem_cache_destroy(znode_hold_cache);
+ znode_hold_cache = NULL;
+}
+
+/*
+ * The zfs_znode_hold_enter() / zfs_znode_hold_exit() functions are used to
+ * serialize access to a znode and its SA buffer while the object is being
+ * created or destroyed. This kind of locking would normally reside in the
+ * znode itself but in this case that's impossible because the znode and SA
+ * buffer may not yet exist. Therefore the locking is handled externally
+ * with an array of mutexes and AVL trees which contain per-object locks.
+ *
+ * In zfs_znode_hold_enter() a per-object lock is created as needed, inserted
+ * into the correct AVL tree, and finally the per-object lock is held. In
+ * zfs_znode_hold_exit() the process is reversed. The per-object lock is
+ * released, removed from the AVL tree, and destroyed if there are no waiters.
+ *
+ * This scheme has two important properties:
+ *
+ * 1) No memory allocations are performed while holding one of the
+ *    z_hold_locks. This ensures evict(), which can be called from direct
+ *    memory reclaim, will never block waiting on a z_hold_locks entry
+ *    which just happens to have hashed to the same index.
+ *
+ * 2) All locks used to serialize access to an object are per-object and
+ *    never shared. This minimizes lock contention without creating a
+ *    large number of dedicated locks.
+ *
+ * On the downside it does require znode_hold_t structures to be frequently
+ * allocated and freed. However, because these are backed by a kmem cache
+ * and very short lived this cost is minimal.
+ */
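+/*
+ * Illustrative usage (a sketch of the pattern used by zfs_mknode() and
+ * zfs_zget() below; "obj" is the object number being created/destroyed):
+ *
+ *	znode_hold_t *zh = zfs_znode_hold_enter(zfsvfs, obj);
+ *	... create or destroy the znode and its SA buffer ...
+ *	zfs_znode_hold_exit(zfsvfs, zh);
+ */
+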
+int
+zfs_znode_hold_compare(const void *a, const void *b)
+{
+ const znode_hold_t *zh_a = (const znode_hold_t *)a;
+ const znode_hold_t *zh_b = (const znode_hold_t *)b;
+
+ return (AVL_CMP(zh_a->zh_obj, zh_b->zh_obj));
+}
+
+boolean_t
+zfs_znode_held(zfsvfs_t *zfsvfs, uint64_t obj)
+{
+ znode_hold_t *zh, search;
+ int i = ZFS_OBJ_HASH(zfsvfs, obj);
+ boolean_t held;
+
+ search.zh_obj = obj;
+
+ mutex_enter(&zfsvfs->z_hold_locks[i]);
+ zh = avl_find(&zfsvfs->z_hold_trees[i], &search, NULL);
+ held = (zh && MUTEX_HELD(&zh->zh_lock)) ? B_TRUE : B_FALSE;
+ mutex_exit(&zfsvfs->z_hold_locks[i]);
+
+ return (held);
+}
+
+static znode_hold_t *
+zfs_znode_hold_enter(zfsvfs_t *zfsvfs, uint64_t obj)
+{
+ znode_hold_t *zh, *zh_new, search;
+ int i = ZFS_OBJ_HASH(zfsvfs, obj);
+ boolean_t found = B_FALSE;
+
+ zh_new = kmem_cache_alloc(znode_hold_cache, KM_SLEEP);
+ zh_new->zh_obj = obj;
+ search.zh_obj = obj;
+
+ mutex_enter(&zfsvfs->z_hold_locks[i]);
+ zh = avl_find(&zfsvfs->z_hold_trees[i], &search, NULL);
+ if (likely(zh == NULL)) {
+ zh = zh_new;
+ avl_add(&zfsvfs->z_hold_trees[i], zh);
+ } else {
+ ASSERT3U(zh->zh_obj, ==, obj);
+ found = B_TRUE;
+ }
+ zfs_refcount_add(&zh->zh_refcount, NULL);
+ mutex_exit(&zfsvfs->z_hold_locks[i]);
+
+ if (found == B_TRUE)
+ kmem_cache_free(znode_hold_cache, zh_new);
+
+ ASSERT(MUTEX_NOT_HELD(&zh->zh_lock));
+ ASSERT3S(zfs_refcount_count(&zh->zh_refcount), >, 0);
+ mutex_enter(&zh->zh_lock);
+
+ return (zh);
+}
+
+static void
+zfs_znode_hold_exit(zfsvfs_t *zfsvfs, znode_hold_t *zh)
+{
+ int i = ZFS_OBJ_HASH(zfsvfs, zh->zh_obj);
+ boolean_t remove = B_FALSE;
+
+ ASSERT(zfs_znode_held(zfsvfs, zh->zh_obj));
+ ASSERT3S(zfs_refcount_count(&zh->zh_refcount), >, 0);
+ mutex_exit(&zh->zh_lock);
+
+ mutex_enter(&zfsvfs->z_hold_locks[i]);
+ if (zfs_refcount_remove(&zh->zh_refcount, NULL) == 0) {
+ avl_remove(&zfsvfs->z_hold_trees[i], zh);
+ remove = B_TRUE;
+ }
+ mutex_exit(&zfsvfs->z_hold_locks[i]);
+
+ if (remove == B_TRUE)
+ kmem_cache_free(znode_hold_cache, zh);
+}
+
+static void
+zfs_znode_sa_init(zfsvfs_t *zfsvfs, znode_t *zp,
+ dmu_buf_t *db, dmu_object_type_t obj_type, sa_handle_t *sa_hdl)
+{
+ ASSERT(zfs_znode_held(zfsvfs, zp->z_id));
+
+ mutex_enter(&zp->z_lock);
+
+ ASSERT(zp->z_sa_hdl == NULL);
+ ASSERT(zp->z_acl_cached == NULL);
+ if (sa_hdl == NULL) {
+ VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, zp,
+ SA_HDL_SHARED, &zp->z_sa_hdl));
+ } else {
+ zp->z_sa_hdl = sa_hdl;
+ sa_set_userp(sa_hdl, zp);
+ }
+
+ zp->z_is_sa = (obj_type == DMU_OT_SA) ? B_TRUE : B_FALSE;
+
+ mutex_exit(&zp->z_lock);
+}
+
+void
+zfs_znode_dmu_fini(znode_t *zp)
+{
+ ASSERT(zfs_znode_held(ZTOZSB(zp), zp->z_id) || zp->z_unlinked ||
+ RW_WRITE_HELD(&ZTOZSB(zp)->z_teardown_inactive_lock));
+
+ sa_handle_destroy(zp->z_sa_hdl);
+ zp->z_sa_hdl = NULL;
+}
+
+/*
+ * Called by new_inode() to allocate a new inode.
+ */
+int
+zfs_inode_alloc(struct super_block *sb, struct inode **ip)
+{
+ znode_t *zp;
+
+ zp = kmem_cache_alloc(znode_cache, KM_SLEEP);
+ *ip = ZTOI(zp);
+
+ return (0);
+}
+
+/*
+ * Called in multiple places when an inode should be destroyed.
+ */
+void
+zfs_inode_destroy(struct inode *ip)
+{
+ znode_t *zp = ITOZ(ip);
+ zfsvfs_t *zfsvfs = ZTOZSB(zp);
+
+ mutex_enter(&zfsvfs->z_znodes_lock);
+ if (list_link_active(&zp->z_link_node)) {
+ list_remove(&zfsvfs->z_all_znodes, zp);
+ zfsvfs->z_nr_znodes--;
+ }
+ mutex_exit(&zfsvfs->z_znodes_lock);
+
+ if (zp->z_acl_cached) {
+ zfs_acl_free(zp->z_acl_cached);
+ zp->z_acl_cached = NULL;
+ }
+
+ if (zp->z_xattr_cached) {
+ nvlist_free(zp->z_xattr_cached);
+ zp->z_xattr_cached = NULL;
+ }
+
+ kmem_cache_free(znode_cache, zp);
+}
+
+static void
+zfs_inode_set_ops(zfsvfs_t *zfsvfs, struct inode *ip)
+{
+ uint64_t rdev = 0;
+
+ switch (ip->i_mode & S_IFMT) {
+ case S_IFREG:
+ ip->i_op = &zpl_inode_operations;
+ ip->i_fop = &zpl_file_operations;
+ ip->i_mapping->a_ops = &zpl_address_space_operations;
+ break;
+
+ case S_IFDIR:
+ ip->i_op = &zpl_dir_inode_operations;
+ ip->i_fop = &zpl_dir_file_operations;
+ ITOZ(ip)->z_zn_prefetch = B_TRUE;
+ break;
+
+ case S_IFLNK:
+ ip->i_op = &zpl_symlink_inode_operations;
+ break;
+
+	/*
+	 * rdev is only stored in the SA for device files.
+	 */
+ case S_IFCHR:
+ case S_IFBLK:
+ (void) sa_lookup(ITOZ(ip)->z_sa_hdl, SA_ZPL_RDEV(zfsvfs), &rdev,
+ sizeof (rdev));
+ /*FALLTHROUGH*/
+ case S_IFIFO:
+ case S_IFSOCK:
+ init_special_inode(ip, ip->i_mode, rdev);
+ ip->i_op = &zpl_special_inode_operations;
+ break;
+
+ default:
+ zfs_panic_recover("inode %llu has invalid mode: 0x%x\n",
+ (u_longlong_t)ip->i_ino, ip->i_mode);
+
+ /* Assume the inode is a file and attempt to continue */
+ ip->i_mode = S_IFREG | 0644;
+ ip->i_op = &zpl_inode_operations;
+ ip->i_fop = &zpl_file_operations;
+ ip->i_mapping->a_ops = &zpl_address_space_operations;
+ break;
+ }
+}
+
+void
+zfs_set_inode_flags(znode_t *zp, struct inode *ip)
+{
+ /*
+ * Linux and Solaris have different sets of file attributes, so we
+ * restrict this conversion to the intersection of the two.
+ */
+#ifdef HAVE_INODE_SET_FLAGS
+ unsigned int flags = 0;
+ if (zp->z_pflags & ZFS_IMMUTABLE)
+ flags |= S_IMMUTABLE;
+ if (zp->z_pflags & ZFS_APPENDONLY)
+ flags |= S_APPEND;
+
+ inode_set_flags(ip, flags, S_IMMUTABLE|S_APPEND);
+#else
+ if (zp->z_pflags & ZFS_IMMUTABLE)
+ ip->i_flags |= S_IMMUTABLE;
+ else
+ ip->i_flags &= ~S_IMMUTABLE;
+
+ if (zp->z_pflags & ZFS_APPENDONLY)
+ ip->i_flags |= S_APPEND;
+ else
+ ip->i_flags &= ~S_APPEND;
+#endif
+}
+
+/*
+ * Update the embedded inode given the znode. We should work toward
+ * eliminating this function as soon as possible by removing values
+ * which are duplicated between the znode and inode. If the generic
+ * inode has the correct field it should be used, and the ZFS code
+ * updated to access the inode. This can be done incrementally.
+ */
+void
+zfs_inode_update(znode_t *zp)
+{
+ zfsvfs_t *zfsvfs;
+ struct inode *ip;
+ uint32_t blksize;
+ u_longlong_t i_blocks;
+
+ ASSERT(zp != NULL);
+ zfsvfs = ZTOZSB(zp);
+ ip = ZTOI(zp);
+
+ /* Skip .zfs control nodes which do not exist on disk. */
+ if (zfsctl_is_node(ip))
+ return;
+
+ dmu_object_size_from_db(sa_get_db(zp->z_sa_hdl), &blksize, &i_blocks);
+
+ spin_lock(&ip->i_lock);
+ ip->i_blocks = i_blocks;
+ i_size_write(ip, zp->z_size);
+ spin_unlock(&ip->i_lock);
+}
+
+/*
+ * Construct a znode+inode and initialize.
+ *
+ * This does not do a call to dmu_set_user(); that is
+ * up to the caller, in case you don't want to
+ * return the znode.
+ */
+static znode_t *
+zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz,
+ dmu_object_type_t obj_type, sa_handle_t *hdl)
+{
+ znode_t *zp;
+ struct inode *ip;
+ uint64_t mode;
+ uint64_t parent;
+ uint64_t tmp_gen;
+ uint64_t links;
+ uint64_t z_uid, z_gid;
+ uint64_t atime[2], mtime[2], ctime[2];
+ uint64_t projid = ZFS_DEFAULT_PROJID;
+ sa_bulk_attr_t bulk[11];
+ int count = 0;
+
+ ASSERT(zfsvfs != NULL);
+
+ ip = new_inode(zfsvfs->z_sb);
+ if (ip == NULL)
+ return (NULL);
+
+ zp = ITOZ(ip);
+ ASSERT(zp->z_dirlocks == NULL);
+ ASSERT3P(zp->z_acl_cached, ==, NULL);
+ ASSERT3P(zp->z_xattr_cached, ==, NULL);
+ zp->z_unlinked = B_FALSE;
+ zp->z_atime_dirty = B_FALSE;
+ zp->z_moved = B_FALSE;
+ zp->z_is_mapped = B_FALSE;
+ zp->z_is_ctldir = B_FALSE;
+ zp->z_is_stale = B_FALSE;
+ zp->z_suspended = B_FALSE;
+ zp->z_sa_hdl = NULL;
+ zp->z_mapcnt = 0;
+ zp->z_id = db->db_object;
+ zp->z_blksz = blksz;
+ zp->z_seq = 0x7A4653;
+ zp->z_sync_cnt = 0;
+
+ zfs_znode_sa_init(zfsvfs, zp, db, obj_type, hdl);
+
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL, &tmp_gen, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
+ &zp->z_size, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, &links, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+ &zp->z_pflags, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL,
+ &parent, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, &z_uid, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, &z_gid, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
+
+ if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count) != 0 || tmp_gen == 0 ||
+ (dmu_objset_projectquota_enabled(zfsvfs->z_os) &&
+ (zp->z_pflags & ZFS_PROJID) &&
+ sa_lookup(zp->z_sa_hdl, SA_ZPL_PROJID(zfsvfs), &projid, 8) != 0)) {
+ if (hdl == NULL)
+ sa_handle_destroy(zp->z_sa_hdl);
+ zp->z_sa_hdl = NULL;
+ goto error;
+ }
+
+ zp->z_projid = projid;
+ zp->z_mode = ip->i_mode = mode;
+ ip->i_generation = (uint32_t)tmp_gen;
+ ip->i_blkbits = SPA_MINBLOCKSHIFT;
+ set_nlink(ip, (uint32_t)links);
+ zfs_uid_write(ip, z_uid);
+ zfs_gid_write(ip, z_gid);
+ zfs_set_inode_flags(zp, ip);
+
+ /* Cache the xattr parent id */
+ if (zp->z_pflags & ZFS_XATTR)
+ zp->z_xattr_parent = parent;
+
+ ZFS_TIME_DECODE(&ip->i_atime, atime);
+ ZFS_TIME_DECODE(&ip->i_mtime, mtime);
+ ZFS_TIME_DECODE(&ip->i_ctime, ctime);
+
+ ip->i_ino = zp->z_id;
+ zfs_inode_update(zp);
+ zfs_inode_set_ops(zfsvfs, ip);
+
+ /*
+ * The only way insert_inode_locked() can fail is if the ip->i_ino
+ * number is already hashed for this super block. This can never
+ * happen because the inode numbers map 1:1 with the object numbers.
+ *
+	 * The one exception is rolling back a mounted file system, but in
+	 * this case all the active inodes are unhashed during the rollback.
+ */
+ VERIFY3S(insert_inode_locked(ip), ==, 0);
+
+ mutex_enter(&zfsvfs->z_znodes_lock);
+ list_insert_tail(&zfsvfs->z_all_znodes, zp);
+ zfsvfs->z_nr_znodes++;
+ membar_producer();
+ mutex_exit(&zfsvfs->z_znodes_lock);
+
+ unlock_new_inode(ip);
+ return (zp);
+
+error:
+ iput(ip);
+ return (NULL);
+}
+
+/*
+ * Safely mark an inode dirty. Inodes which are part of a read-only
+ * file system or snapshot may not be dirtied.
+ */
+void
+zfs_mark_inode_dirty(struct inode *ip)
+{
+ zfsvfs_t *zfsvfs = ITOZSB(ip);
+
+ if (zfs_is_readonly(zfsvfs) || dmu_objset_is_snapshot(zfsvfs->z_os))
+ return;
+
+ mark_inode_dirty(ip);
+}
+
+static uint64_t empty_xattr;
+static uint64_t pad[4];
+static zfs_acl_phys_t acl_phys;
+/*
+ * Create a new DMU object to hold a zfs znode.
+ *
+ * IN: dzp - parent directory for new znode
+ * vap - file attributes for new znode
+ * tx - dmu transaction id for zap operations
+ * cr - credentials of caller
+ * flag - flags:
+ * IS_ROOT_NODE - new object will be root
+ *		IS_TMPFILE	- new object is a temporary (O_TMPFILE) file
+ * IS_XATTR - new object is an attribute
+ * acl_ids - ACL related attributes
+ *
+ * OUT: zpp - allocated znode (set to dzp if IS_ROOT_NODE)
+ *
+ */
+void
+zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
+ uint_t flag, znode_t **zpp, zfs_acl_ids_t *acl_ids)
+{
+ uint64_t crtime[2], atime[2], mtime[2], ctime[2];
+ uint64_t mode, size, links, parent, pflags;
+ uint64_t projid = ZFS_DEFAULT_PROJID;
+ uint64_t rdev = 0;
+ zfsvfs_t *zfsvfs = ZTOZSB(dzp);
+ dmu_buf_t *db;
+ inode_timespec_t now;
+ uint64_t gen, obj;
+ int bonuslen;
+ int dnodesize;
+ sa_handle_t *sa_hdl;
+ dmu_object_type_t obj_type;
+ sa_bulk_attr_t *sa_attrs;
+ int cnt = 0;
+ zfs_acl_locator_cb_t locate = { 0 };
+ znode_hold_t *zh;
+
+ if (zfsvfs->z_replay) {
+ obj = vap->va_nodeid;
+ now = vap->va_ctime; /* see zfs_replay_create() */
+ gen = vap->va_nblocks; /* ditto */
+ dnodesize = vap->va_fsid; /* ditto */
+ } else {
+ obj = 0;
+ gethrestime(&now);
+ gen = dmu_tx_get_txg(tx);
+ dnodesize = dmu_objset_dnodesize(zfsvfs->z_os);
+ }
+
+ if (dnodesize == 0)
+ dnodesize = DNODE_MIN_SIZE;
+
+ obj_type = zfsvfs->z_use_sa ? DMU_OT_SA : DMU_OT_ZNODE;
+
+ bonuslen = (obj_type == DMU_OT_SA) ?
+ DN_BONUS_SIZE(dnodesize) : ZFS_OLD_ZNODE_PHYS_SIZE;
+
+	/*
+	 * Create a new DMU object. There's currently no mechanism for
+	 * pre-reading the blocks that will be needed to allocate a new
+	 * object, so we accept the small chance that there will be an
+	 * i/o error and we will fail one of the assertions below.
+	 */
+ if (S_ISDIR(vap->va_mode)) {
+ if (zfsvfs->z_replay) {
+ VERIFY0(zap_create_claim_norm_dnsize(zfsvfs->z_os, obj,
+ zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
+ obj_type, bonuslen, dnodesize, tx));
+ } else {
+ obj = zap_create_norm_dnsize(zfsvfs->z_os,
+ zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
+ obj_type, bonuslen, dnodesize, tx);
+ }
+ } else {
+ if (zfsvfs->z_replay) {
+ VERIFY0(dmu_object_claim_dnsize(zfsvfs->z_os, obj,
+ DMU_OT_PLAIN_FILE_CONTENTS, 0,
+ obj_type, bonuslen, dnodesize, tx));
+ } else {
+ obj = dmu_object_alloc_dnsize(zfsvfs->z_os,
+ DMU_OT_PLAIN_FILE_CONTENTS, 0,
+ obj_type, bonuslen, dnodesize, tx);
+ }
+ }
+
+ zh = zfs_znode_hold_enter(zfsvfs, obj);
+ VERIFY0(sa_buf_hold(zfsvfs->z_os, obj, NULL, &db));
+
+ /*
+ * If this is the root, fix up the half-initialized parent pointer
+ * to reference the just-allocated physical data area.
+ */
+ if (flag & IS_ROOT_NODE) {
+ dzp->z_id = obj;
+ }
+
+ /*
+ * If parent is an xattr, so am I.
+ */
+ if (dzp->z_pflags & ZFS_XATTR) {
+ flag |= IS_XATTR;
+ }
+
+ if (zfsvfs->z_use_fuids)
+ pflags = ZFS_ARCHIVE | ZFS_AV_MODIFIED;
+ else
+ pflags = 0;
+
+ if (S_ISDIR(vap->va_mode)) {
+ size = 2; /* contents ("." and "..") */
+ links = 2;
+ } else {
+ size = 0;
+ links = (flag & IS_TMPFILE) ? 0 : 1;
+ }
+
+ if (S_ISBLK(vap->va_mode) || S_ISCHR(vap->va_mode))
+ rdev = vap->va_rdev;
+
+ parent = dzp->z_id;
+ mode = acl_ids->z_mode;
+ if (flag & IS_XATTR)
+ pflags |= ZFS_XATTR;
+
+ if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode)) {
+ /*
+ * With ZFS_PROJID flag, we can easily know whether there is
+ * project ID stored on disk or not. See zfs_space_delta_cb().
+ */
+ if (obj_type != DMU_OT_ZNODE &&
+ dmu_objset_projectquota_enabled(zfsvfs->z_os))
+ pflags |= ZFS_PROJID;
+
+ /*
+ * Inherit project ID from parent if required.
+ */
+ projid = zfs_inherit_projid(dzp);
+ if (dzp->z_pflags & ZFS_PROJINHERIT)
+ pflags |= ZFS_PROJINHERIT;
+ }
+
+ /*
+ * No execs denied will be determined when zfs_mode_compute() is called.
+ */
+ pflags |= acl_ids->z_aclp->z_hints &
+ (ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE|ZFS_ACL_AUTO_INHERIT|
+ ZFS_ACL_DEFAULTED|ZFS_ACL_PROTECTED);
+
+ ZFS_TIME_ENCODE(&now, crtime);
+ ZFS_TIME_ENCODE(&now, ctime);
+
+ if (vap->va_mask & ATTR_ATIME) {
+ ZFS_TIME_ENCODE(&vap->va_atime, atime);
+ } else {
+ ZFS_TIME_ENCODE(&now, atime);
+ }
+
+ if (vap->va_mask & ATTR_MTIME) {
+ ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
+ } else {
+ ZFS_TIME_ENCODE(&now, mtime);
+ }
+
+ /* Now add in all of the "SA" attributes */
+ VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, NULL, SA_HDL_SHARED,
+ &sa_hdl));
+
+	/*
+	 * Set up the array of attributes to be replaced/set on the new file.
+	 *
+	 * The order for DMU_OT_ZNODE is critical since it needs to be
+	 * constructed in the old znode_phys_t format. Don't change this
+	 * ordering.
+	 */
+ sa_attrs = kmem_alloc(sizeof (sa_bulk_attr_t) * ZPL_END, KM_SLEEP);
+
+ if (obj_type == DMU_OT_ZNODE) {
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs),
+ NULL, &atime, 16);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs),
+ NULL, &mtime, 16);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs),
+ NULL, &ctime, 16);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs),
+ NULL, &crtime, 16);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs),
+ NULL, &gen, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs),
+ NULL, &mode, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs),
+ NULL, &size, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs),
+ NULL, &parent, 8);
+ } else {
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs),
+ NULL, &mode, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs),
+ NULL, &size, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs),
+ NULL, &gen, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs),
+ NULL, &acl_ids->z_fuid, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs),
+ NULL, &acl_ids->z_fgid, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs),
+ NULL, &parent, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs),
+ NULL, &pflags, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs),
+ NULL, &atime, 16);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs),
+ NULL, &mtime, 16);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs),
+ NULL, &ctime, 16);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs),
+ NULL, &crtime, 16);
+ }
+
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_LINKS(zfsvfs), NULL, &links, 8);
+
+ if (obj_type == DMU_OT_ZNODE) {
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_XATTR(zfsvfs), NULL,
+ &empty_xattr, 8);
+ } else if (dmu_objset_projectquota_enabled(zfsvfs->z_os) &&
+ pflags & ZFS_PROJID) {
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PROJID(zfsvfs),
+ NULL, &projid, 8);
+ }
+ if (obj_type == DMU_OT_ZNODE ||
+ (S_ISBLK(vap->va_mode) || S_ISCHR(vap->va_mode))) {
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_RDEV(zfsvfs),
+ NULL, &rdev, 8);
+ }
+ if (obj_type == DMU_OT_ZNODE) {
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs),
+ NULL, &pflags, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs), NULL,
+ &acl_ids->z_fuid, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs), NULL,
+ &acl_ids->z_fgid, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PAD(zfsvfs), NULL, pad,
+ sizeof (uint64_t) * 4);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ZNODE_ACL(zfsvfs), NULL,
+ &acl_phys, sizeof (zfs_acl_phys_t));
+ } else if (acl_ids->z_aclp->z_version >= ZFS_ACL_VERSION_FUID) {
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_COUNT(zfsvfs), NULL,
+ &acl_ids->z_aclp->z_acl_count, 8);
+ locate.cb_aclp = acl_ids->z_aclp;
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_ACES(zfsvfs),
+ zfs_acl_data_locator, &locate,
+ acl_ids->z_aclp->z_acl_bytes);
+ mode = zfs_mode_compute(mode, acl_ids->z_aclp, &pflags,
+ acl_ids->z_fuid, acl_ids->z_fgid);
+ }
+
+ VERIFY(sa_replace_all_by_template(sa_hdl, sa_attrs, cnt, tx) == 0);
+
+ if (!(flag & IS_ROOT_NODE)) {
+ /*
+ * The call to zfs_znode_alloc() may fail if memory is low
+ * via the call path: alloc_inode() -> inode_init_always() ->
+ * security_inode_alloc() -> inode_alloc_security(). Since
+		 * the existing code is written such that zfs_mknode() cannot
+		 * fail, retry until sufficient memory has been reclaimed.
+ */
+ do {
+ *zpp = zfs_znode_alloc(zfsvfs, db, 0, obj_type, sa_hdl);
+ } while (*zpp == NULL);
+
+ VERIFY(*zpp != NULL);
+ VERIFY(dzp != NULL);
+ } else {
+ /*
+ * If we are creating the root node, the "parent" we
+ * passed in is the znode for the root.
+ */
+ *zpp = dzp;
+
+ (*zpp)->z_sa_hdl = sa_hdl;
+ }
+
+ (*zpp)->z_pflags = pflags;
+ (*zpp)->z_mode = ZTOI(*zpp)->i_mode = mode;
+ (*zpp)->z_dnodesize = dnodesize;
+ (*zpp)->z_projid = projid;
+
+ if (obj_type == DMU_OT_ZNODE ||
+ acl_ids->z_aclp->z_version < ZFS_ACL_VERSION_FUID) {
+ VERIFY0(zfs_aclset_common(*zpp, acl_ids->z_aclp, cr, tx));
+ }
+ kmem_free(sa_attrs, sizeof (sa_bulk_attr_t) * ZPL_END);
+ zfs_znode_hold_exit(zfsvfs, zh);
+}
+
+/*
+ * Update in-core attributes. It is assumed the caller will be doing an
+ * sa_bulk_update to push the changes out.
+ */
+void
+zfs_xvattr_set(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx)
+{
+ xoptattr_t *xoap;
+ boolean_t update_inode = B_FALSE;
+
+ xoap = xva_getxoptattr(xvap);
+ ASSERT(xoap);
+
+ if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) {
+ uint64_t times[2];
+ ZFS_TIME_ENCODE(&xoap->xoa_createtime, times);
+ (void) sa_update(zp->z_sa_hdl, SA_ZPL_CRTIME(ZTOZSB(zp)),
+ &times, sizeof (times), tx);
+ XVA_SET_RTN(xvap, XAT_CREATETIME);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
+ ZFS_ATTR_SET(zp, ZFS_READONLY, xoap->xoa_readonly,
+ zp->z_pflags, tx);
+ XVA_SET_RTN(xvap, XAT_READONLY);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
+ ZFS_ATTR_SET(zp, ZFS_HIDDEN, xoap->xoa_hidden,
+ zp->z_pflags, tx);
+ XVA_SET_RTN(xvap, XAT_HIDDEN);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
+ ZFS_ATTR_SET(zp, ZFS_SYSTEM, xoap->xoa_system,
+ zp->z_pflags, tx);
+ XVA_SET_RTN(xvap, XAT_SYSTEM);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
+ ZFS_ATTR_SET(zp, ZFS_ARCHIVE, xoap->xoa_archive,
+ zp->z_pflags, tx);
+ XVA_SET_RTN(xvap, XAT_ARCHIVE);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
+ ZFS_ATTR_SET(zp, ZFS_IMMUTABLE, xoap->xoa_immutable,
+ zp->z_pflags, tx);
+ XVA_SET_RTN(xvap, XAT_IMMUTABLE);
+
+ update_inode = B_TRUE;
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
+ ZFS_ATTR_SET(zp, ZFS_NOUNLINK, xoap->xoa_nounlink,
+ zp->z_pflags, tx);
+ XVA_SET_RTN(xvap, XAT_NOUNLINK);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
+ ZFS_ATTR_SET(zp, ZFS_APPENDONLY, xoap->xoa_appendonly,
+ zp->z_pflags, tx);
+ XVA_SET_RTN(xvap, XAT_APPENDONLY);
+
+ update_inode = B_TRUE;
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
+ ZFS_ATTR_SET(zp, ZFS_NODUMP, xoap->xoa_nodump,
+ zp->z_pflags, tx);
+ XVA_SET_RTN(xvap, XAT_NODUMP);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
+ ZFS_ATTR_SET(zp, ZFS_OPAQUE, xoap->xoa_opaque,
+ zp->z_pflags, tx);
+ XVA_SET_RTN(xvap, XAT_OPAQUE);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
+ ZFS_ATTR_SET(zp, ZFS_AV_QUARANTINED,
+ xoap->xoa_av_quarantined, zp->z_pflags, tx);
+ XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
+ ZFS_ATTR_SET(zp, ZFS_AV_MODIFIED, xoap->xoa_av_modified,
+ zp->z_pflags, tx);
+ XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) {
+ zfs_sa_set_scanstamp(zp, xvap, tx);
+ XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
+ ZFS_ATTR_SET(zp, ZFS_REPARSE, xoap->xoa_reparse,
+ zp->z_pflags, tx);
+ XVA_SET_RTN(xvap, XAT_REPARSE);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) {
+ ZFS_ATTR_SET(zp, ZFS_OFFLINE, xoap->xoa_offline,
+ zp->z_pflags, tx);
+ XVA_SET_RTN(xvap, XAT_OFFLINE);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) {
+ ZFS_ATTR_SET(zp, ZFS_SPARSE, xoap->xoa_sparse,
+ zp->z_pflags, tx);
+ XVA_SET_RTN(xvap, XAT_SPARSE);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) {
+ ZFS_ATTR_SET(zp, ZFS_PROJINHERIT, xoap->xoa_projinherit,
+ zp->z_pflags, tx);
+ XVA_SET_RTN(xvap, XAT_PROJINHERIT);
+ }
+
+ if (update_inode)
+ zfs_set_inode_flags(zp, ZTOI(zp));
+}
+
+int
+zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp)
+{
+ dmu_object_info_t doi;
+ dmu_buf_t *db;
+ znode_t *zp;
+ znode_hold_t *zh;
+ int err;
+ sa_handle_t *hdl;
+
+ *zpp = NULL;
+
+again:
+ zh = zfs_znode_hold_enter(zfsvfs, obj_num);
+
+ err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db);
+ if (err) {
+ zfs_znode_hold_exit(zfsvfs, zh);
+ return (err);
+ }
+
+ dmu_object_info_from_db(db, &doi);
+ if (doi.doi_bonus_type != DMU_OT_SA &&
+ (doi.doi_bonus_type != DMU_OT_ZNODE ||
+ (doi.doi_bonus_type == DMU_OT_ZNODE &&
+ doi.doi_bonus_size < sizeof (znode_phys_t)))) {
+ sa_buf_rele(db, NULL);
+ zfs_znode_hold_exit(zfsvfs, zh);
+ return (SET_ERROR(EINVAL));
+ }
+
+ hdl = dmu_buf_get_user(db);
+ if (hdl != NULL) {
+ zp = sa_get_userdata(hdl);
+
+		/*
+		 * Since "SA" does immediate eviction we
+		 * should never find an SA handle that doesn't
+		 * know about the znode.
+		 */
+
+ ASSERT3P(zp, !=, NULL);
+
+ mutex_enter(&zp->z_lock);
+ ASSERT3U(zp->z_id, ==, obj_num);
+ /*
+ * If igrab() returns NULL the VFS has independently
+ * determined the inode should be evicted and has
+ * called iput_final() to start the eviction process.
+ * The SA handle is still valid but because the VFS
+ * requires that the eviction succeed we must drop
+ * our locks and references to allow the eviction to
+ * complete. The zfs_zget() may then be retried.
+ *
+ * This unlikely case could be optimized by registering
+ * a sops->drop_inode() callback. The callback would
+ * need to detect the active SA hold thereby informing
+ * the VFS that this inode should not be evicted.
+ */
+ if (igrab(ZTOI(zp)) == NULL) {
+ mutex_exit(&zp->z_lock);
+ sa_buf_rele(db, NULL);
+ zfs_znode_hold_exit(zfsvfs, zh);
+ /* inode might need this to finish evict */
+ cond_resched();
+ goto again;
+ }
+ *zpp = zp;
+ err = 0;
+ mutex_exit(&zp->z_lock);
+ sa_buf_rele(db, NULL);
+ zfs_znode_hold_exit(zfsvfs, zh);
+ return (err);
+ }
+
+	/*
+	 * Not found; create a new znode/vnode, but only if the file exists.
+	 *
+	 * There is a small window where zfs_vget() could
+	 * find this object while a file create is still in
+	 * progress. This is checked for in zfs_znode_alloc().
+	 *
+	 * If zfs_znode_alloc() fails it will drop the hold on the
+	 * bonus buffer.
+	 */
+ zp = zfs_znode_alloc(zfsvfs, db, doi.doi_data_block_size,
+ doi.doi_bonus_type, NULL);
+ if (zp == NULL) {
+ err = SET_ERROR(ENOENT);
+ } else {
+ *zpp = zp;
+ }
+ zfs_znode_hold_exit(zfsvfs, zh);
+ return (err);
+}
+
+int
+zfs_rezget(znode_t *zp)
+{
+ zfsvfs_t *zfsvfs = ZTOZSB(zp);
+ dmu_object_info_t doi;
+ dmu_buf_t *db;
+ uint64_t obj_num = zp->z_id;
+ uint64_t mode;
+ uint64_t links;
+ sa_bulk_attr_t bulk[10];
+ int err;
+ int count = 0;
+ uint64_t gen;
+ uint64_t z_uid, z_gid;
+ uint64_t atime[2], mtime[2], ctime[2];
+ uint64_t projid = ZFS_DEFAULT_PROJID;
+ znode_hold_t *zh;
+
+ /*
+	 * Skip ctldir znodes, otherwise they will always get invalidated.
+	 * This will cause odd behaviour for mounted snapdirs. In particular,
+	 * on Linux >= 3.18 d_invalidate will detach the mountpoint and
+	 * prevent anyone from automounting it again as long as someone is
+	 * still using the detached mount.
+ */
+ if (zp->z_is_ctldir)
+ return (0);
+
+ zh = zfs_znode_hold_enter(zfsvfs, obj_num);
+
+ mutex_enter(&zp->z_acl_lock);
+ if (zp->z_acl_cached) {
+ zfs_acl_free(zp->z_acl_cached);
+ zp->z_acl_cached = NULL;
+ }
+ mutex_exit(&zp->z_acl_lock);
+
+ rw_enter(&zp->z_xattr_lock, RW_WRITER);
+ if (zp->z_xattr_cached) {
+ nvlist_free(zp->z_xattr_cached);
+ zp->z_xattr_cached = NULL;
+ }
+ rw_exit(&zp->z_xattr_lock);
+
+ ASSERT(zp->z_sa_hdl == NULL);
+ err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db);
+ if (err) {
+ zfs_znode_hold_exit(zfsvfs, zh);
+ return (err);
+ }
+
+ dmu_object_info_from_db(db, &doi);
+ if (doi.doi_bonus_type != DMU_OT_SA &&
+ (doi.doi_bonus_type != DMU_OT_ZNODE ||
+ (doi.doi_bonus_type == DMU_OT_ZNODE &&
+ doi.doi_bonus_size < sizeof (znode_phys_t)))) {
+ sa_buf_rele(db, NULL);
+ zfs_znode_hold_exit(zfsvfs, zh);
+ return (SET_ERROR(EINVAL));
+ }
+
+ zfs_znode_sa_init(zfsvfs, zp, db, doi.doi_bonus_type, NULL);
+
+ /* reload cached values */
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL,
+ &gen, sizeof (gen));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
+ &zp->z_size, sizeof (zp->z_size));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL,
+ &links, sizeof (links));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+ &zp->z_pflags, sizeof (zp->z_pflags));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
+ &z_uid, sizeof (z_uid));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL,
+ &z_gid, sizeof (z_gid));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
+ &mode, sizeof (mode));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
+ &atime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
+ &mtime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
+ &ctime, 16);
+
+ if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) {
+ zfs_znode_dmu_fini(zp);
+ zfs_znode_hold_exit(zfsvfs, zh);
+ return (SET_ERROR(EIO));
+ }
+
+ if (dmu_objset_projectquota_enabled(zfsvfs->z_os)) {
+ err = sa_lookup(zp->z_sa_hdl, SA_ZPL_PROJID(zfsvfs),
+ &projid, 8);
+ if (err != 0 && err != ENOENT) {
+ zfs_znode_dmu_fini(zp);
+ zfs_znode_hold_exit(zfsvfs, zh);
+ return (SET_ERROR(err));
+ }
+ }
+
+ zp->z_projid = projid;
+ zp->z_mode = ZTOI(zp)->i_mode = mode;
+ zfs_uid_write(ZTOI(zp), z_uid);
+ zfs_gid_write(ZTOI(zp), z_gid);
+
+ ZFS_TIME_DECODE(&ZTOI(zp)->i_atime, atime);
+ ZFS_TIME_DECODE(&ZTOI(zp)->i_mtime, mtime);
+ ZFS_TIME_DECODE(&ZTOI(zp)->i_ctime, ctime);
+
+ if ((uint32_t)gen != ZTOI(zp)->i_generation) {
+ zfs_znode_dmu_fini(zp);
+ zfs_znode_hold_exit(zfsvfs, zh);
+ return (SET_ERROR(EIO));
+ }
+
+ set_nlink(ZTOI(zp), (uint32_t)links);
+ zfs_set_inode_flags(zp, ZTOI(zp));
+
+ zp->z_blksz = doi.doi_data_block_size;
+ zp->z_atime_dirty = B_FALSE;
+ zfs_inode_update(zp);
+
+ /*
+ * If the file has zero links, then it has been unlinked on the send
+ * side and it must be in the received unlinked set.
+ * We call zfs_znode_dmu_fini() now to prevent any accesses to the
+ * stale data and to prevent automatic removal of the file in
+ * zfs_zinactive(). The file will be removed either when it is removed
+ * on the send side and the next incremental stream is received or
+ * when the unlinked set gets processed.
+ */
+ zp->z_unlinked = (ZTOI(zp)->i_nlink == 0);
+ if (zp->z_unlinked)
+ zfs_znode_dmu_fini(zp);
+
+ zfs_znode_hold_exit(zfsvfs, zh);
+
+ return (0);
+}
+
+void
+zfs_znode_delete(znode_t *zp, dmu_tx_t *tx)
+{
+ zfsvfs_t *zfsvfs = ZTOZSB(zp);
+ objset_t *os = zfsvfs->z_os;
+ uint64_t obj = zp->z_id;
+ uint64_t acl_obj = zfs_external_acl(zp);
+ znode_hold_t *zh;
+
+ zh = zfs_znode_hold_enter(zfsvfs, obj);
+ if (acl_obj) {
+ VERIFY(!zp->z_is_sa);
+ VERIFY(0 == dmu_object_free(os, acl_obj, tx));
+ }
+ VERIFY(0 == dmu_object_free(os, obj, tx));
+ zfs_znode_dmu_fini(zp);
+ zfs_znode_hold_exit(zfsvfs, zh);
+}
+
+void
+zfs_zinactive(znode_t *zp)
+{
+ zfsvfs_t *zfsvfs = ZTOZSB(zp);
+ uint64_t z_id = zp->z_id;
+ znode_hold_t *zh;
+
+ ASSERT(zp->z_sa_hdl);
+
+ /*
+	 * Don't allow a zfs_zget() while we're trying to release this znode.
+ */
+ zh = zfs_znode_hold_enter(zfsvfs, z_id);
+
+ mutex_enter(&zp->z_lock);
+
+ /*
+ * If this was the last reference to a file with no links, remove
+ * the file from the file system unless the file system is mounted
+ * read-only. That can happen, for example, if the file system was
+ * originally read-write, the file was opened, then unlinked and
+ * the file system was made read-only before the file was finally
+ * closed. The file will remain in the unlinked set.
+ */
+ if (zp->z_unlinked) {
+ ASSERT(!zfsvfs->z_issnap);
+ if (!zfs_is_readonly(zfsvfs) && !zfs_unlink_suspend_progress) {
+ mutex_exit(&zp->z_lock);
+ zfs_znode_hold_exit(zfsvfs, zh);
+ zfs_rmnode(zp);
+ return;
+ }
+ }
+
+ mutex_exit(&zp->z_lock);
+ zfs_znode_dmu_fini(zp);
+
+ zfs_znode_hold_exit(zfsvfs, zh);
+}
+
+#if defined(HAVE_INODE_TIMESPEC64_TIMES)
+#define zfs_compare_timespec timespec64_compare
+#else
+#define zfs_compare_timespec timespec_compare
+#endif
+
+/*
+ * Determine whether the znode's atime must be updated. The logic mostly
+ * duplicates the Linux kernel's relatime_need_update() functionality.
+ * This function is only called if the underlying filesystem actually has
+ * atime updates enabled.
+ */
+boolean_t
+zfs_relatime_need_update(const struct inode *ip)
+{
+ inode_timespec_t now;
+
+ gethrestime(&now);
+ /*
+ * In relatime mode, only update the atime if the previous atime
+ * is earlier than either the ctime or mtime or if at least a day
+ * has passed since the last update of atime.
+ */
+ if (zfs_compare_timespec(&ip->i_mtime, &ip->i_atime) >= 0)
+ return (B_TRUE);
+
+ if (zfs_compare_timespec(&ip->i_ctime, &ip->i_atime) >= 0)
+ return (B_TRUE);
+
+ if ((hrtime_t)now.tv_sec - (hrtime_t)ip->i_atime.tv_sec >= 24*60*60)
+ return (B_TRUE);
+
+ return (B_FALSE);
+}
+
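+/*
+ * Worked example (illustrative dates): with atime = Jan 1 and mtime or
+ * ctime = Jan 2, the file was modified after it was last read, so B_TRUE
+ * is returned. With atime = Jan 2 and mtime = ctime = Jan 1, B_TRUE is
+ * only returned once the atime is at least a day old.
+ */
+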
+/*
+ * Prepare to update znode time stamps.
+ *
+ * IN: zp - znode requiring timestamp update
+ * flag - ATTR_MTIME, ATTR_CTIME flags
+ *
+ * OUT: zp - z_seq
+ * mtime - new mtime
+ * ctime - new ctime
+ *
+ * Note: We don't update atime here, because we rely on the Linux VFS to
+ * handle atime updates.
+ */
+void
+zfs_tstamp_update_setup(znode_t *zp, uint_t flag, uint64_t mtime[2],
+ uint64_t ctime[2])
+{
+ inode_timespec_t now;
+
+ gethrestime(&now);
+
+ zp->z_seq++;
+
+ if (flag & ATTR_MTIME) {
+ ZFS_TIME_ENCODE(&now, mtime);
+ ZFS_TIME_DECODE(&(ZTOI(zp)->i_mtime), mtime);
+ if (ZTOZSB(zp)->z_use_fuids) {
+ zp->z_pflags |= (ZFS_ARCHIVE |
+ ZFS_AV_MODIFIED);
+ }
+ }
+
+ if (flag & ATTR_CTIME) {
+ ZFS_TIME_ENCODE(&now, ctime);
+ ZFS_TIME_DECODE(&(ZTOI(zp)->i_ctime), ctime);
+ if (ZTOZSB(zp)->z_use_fuids)
+ zp->z_pflags |= ZFS_ARCHIVE;
+ }
+}
+
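+/*
+ * Illustrative usage (a sketch mirroring zfs_freesp() below):
+ *
+ *	uint64_t mtime[2], ctime[2];
+ *	zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime);
+ *	... then push mtime/ctime out via sa_bulk_update() ...
+ */
+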
+/*
+ * Grow the block size for a file.
+ *
+ * IN:	zp	- znode of file whose block size is to be grown.
+ * size - requested block size
+ * tx - open transaction.
+ *
+ * NOTE: this function assumes that the znode is write locked.
+ */
+void
+zfs_grow_blocksize(znode_t *zp, uint64_t size, dmu_tx_t *tx)
+{
+ int error;
+ u_longlong_t dummy;
+
+ if (size <= zp->z_blksz)
+ return;
+ /*
+ * If the file size is already greater than the current blocksize,
+ * we will not grow. If there is more than one block in a file,
+ * the blocksize cannot change.
+ */
+ if (zp->z_blksz && zp->z_size > zp->z_blksz)
+ return;
+
+ error = dmu_object_set_blocksize(ZTOZSB(zp)->z_os, zp->z_id,
+ size, 0, tx);
+
+ if (error == ENOTSUP)
+ return;
+ ASSERT0(error);
+
+ /* What blocksize did we actually get? */
+ dmu_object_size_from_db(sa_get_db(zp->z_sa_hdl), &zp->z_blksz, &dummy);
+}
+
+/*
+ * Increase the file length
+ *
+ * IN:	zp	- znode of file to extend.
+ * end - new end-of-file
+ *
+ * RETURN: 0 on success, error code on failure
+ */
+static int
+zfs_extend(znode_t *zp, uint64_t end)
+{
+ zfsvfs_t *zfsvfs = ZTOZSB(zp);
+ dmu_tx_t *tx;
+ locked_range_t *lr;
+ uint64_t newblksz;
+ int error;
+
+ /*
+ * We will change zp_size, lock the whole file.
+ */
+ lr = rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER);
+
+ /*
+ * Nothing to do if file already at desired length.
+ */
+ if (end <= zp->z_size) {
+ rangelock_exit(lr);
+ return (0);
+ }
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+ zfs_sa_upgrade_txholds(tx, zp);
+ if (end > zp->z_blksz &&
+ (!ISP2(zp->z_blksz) || zp->z_blksz < zfsvfs->z_max_blksz)) {
+ /*
+ * We are growing the file past the current block size.
+ */
+ if (zp->z_blksz > ZTOZSB(zp)->z_max_blksz) {
+ /*
+ * File's blocksize is already larger than the
+ * "recordsize" property. Only let it grow to
+ * the next power of 2.
+ */
+ ASSERT(!ISP2(zp->z_blksz));
+ newblksz = MIN(end, 1 << highbit64(zp->z_blksz));
+ } else {
+ newblksz = MIN(end, ZTOZSB(zp)->z_max_blksz);
+ }
+ dmu_tx_hold_write(tx, zp->z_id, 0, newblksz);
+ } else {
+ newblksz = 0;
+ }
+
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ dmu_tx_abort(tx);
+ rangelock_exit(lr);
+ return (error);
+ }
+
+ if (newblksz)
+ zfs_grow_blocksize(zp, newblksz, tx);
+
+ zp->z_size = end;
+
+ VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(ZTOZSB(zp)),
+ &zp->z_size, sizeof (zp->z_size), tx));
+
+ rangelock_exit(lr);
+
+ dmu_tx_commit(tx);
+
+ return (0);
+}
+
+/*
+ * zfs_zero_partial_page - Modeled after update_pages() but
+ * with different arguments and semantics for use by zfs_freesp().
+ *
+ * Zeroes a piece of a single page cache entry for zp at offset
+ * start and length len.
+ *
+ * Caller must acquire a range lock on the file for the region
+ * being zeroed so that the ARC and page cache stay in sync.
+ */
+static void
+zfs_zero_partial_page(znode_t *zp, uint64_t start, uint64_t len)
+{
+ struct address_space *mp = ZTOI(zp)->i_mapping;
+ struct page *pp;
+ int64_t off;
+ void *pb;
+
+ ASSERT((start & PAGE_MASK) == ((start + len - 1) & PAGE_MASK));
+
+ off = start & (PAGE_SIZE - 1);
+ start &= PAGE_MASK;
+
+ pp = find_lock_page(mp, start >> PAGE_SHIFT);
+ if (pp) {
+ if (mapping_writably_mapped(mp))
+ flush_dcache_page(pp);
+
+ pb = kmap(pp);
+ bzero(pb + off, len);
+ kunmap(pp);
+
+ if (mapping_writably_mapped(mp))
+ flush_dcache_page(pp);
+
+ mark_page_accessed(pp);
+ SetPageUptodate(pp);
+ ClearPageError(pp);
+ unlock_page(pp);
+ put_page(pp);
+ }
+}
+
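+/*
+ * Illustrative call from a hole-punching path (a sketch; the offsets are
+ * hypothetical and the required range lock is shown):
+ *
+ *	lr = rangelock_enter(&zp->z_rangelock, off, len, RL_WRITER);
+ *	zfs_zero_partial_page(zp, off, page_len);
+ *	rangelock_exit(lr);
+ */
+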
+/*
+ * Free space in a file.
+ *
+ * IN: zp - znode of file to free data in.
+ * off - start of section to free.
+ * len - length of section to free.
+ *
+ * RETURN: 0 on success, error code on failure
+ */
+static int
+zfs_free_range(znode_t *zp, uint64_t off, uint64_t len)
+{
+ zfsvfs_t *zfsvfs = ZTOZSB(zp);
+ locked_range_t *lr;
+ int error;
+
+ /*
+ * Lock the range being freed.
+ */
+ lr = rangelock_enter(&zp->z_rangelock, off, len, RL_WRITER);
+
+ /*
+ * Nothing to do if file already at desired length.
+ */
+ if (off >= zp->z_size) {
+ rangelock_exit(lr);
+ return (0);
+ }
+
+ if (off + len > zp->z_size)
+ len = zp->z_size - off;
+
+ error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, off, len);
+
+ /*
+ * Zero partial page cache entries. This must be done under a
+ * range lock in order to keep the ARC and page cache in sync.
+ */
+ if (zp->z_is_mapped) {
+ loff_t first_page, last_page, page_len;
+ loff_t first_page_offset, last_page_offset;
+
+ /* first possible full page in hole */
+ first_page = (off + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ /* last page of hole */
+ last_page = (off + len) >> PAGE_SHIFT;
+
+ /* offset of first_page */
+ first_page_offset = first_page << PAGE_SHIFT;
+ /* offset of last_page */
+ last_page_offset = last_page << PAGE_SHIFT;
+
+ /* truncate whole pages */
+ if (last_page_offset > first_page_offset) {
+ truncate_inode_pages_range(ZTOI(zp)->i_mapping,
+ first_page_offset, last_page_offset - 1);
+ }
+
+ /* truncate sub-page ranges */
+ if (first_page > last_page) {
+ /* entire punched area within a single page */
+ zfs_zero_partial_page(zp, off, len);
+ } else {
+ /* beginning of punched area at the end of a page */
+ page_len = first_page_offset - off;
+ if (page_len > 0)
+ zfs_zero_partial_page(zp, off, page_len);
+
+ /* end of punched area at the beginning of a page */
+ page_len = off + len - last_page_offset;
+ if (page_len > 0)
+ zfs_zero_partial_page(zp, last_page_offset,
+ page_len);
+ }
+ }
+ rangelock_exit(lr);
+
+ return (error);
+}
+
+/*
+ * Truncate a file
+ *
+ * IN: zp - znode of file to free data in.
+ * end - new end-of-file.
+ *
+ * RETURN: 0 on success, error code on failure
+ */
+static int
+zfs_trunc(znode_t *zp, uint64_t end)
+{
+ zfsvfs_t *zfsvfs = ZTOZSB(zp);
+ dmu_tx_t *tx;
+ locked_range_t *lr;
+ int error;
+ sa_bulk_attr_t bulk[2];
+ int count = 0;
+
+ /*
+ * We will change zp_size, lock the whole file.
+ */
+ lr = rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER);
+
+ /*
+ * Nothing to do if file already at desired length.
+ */
+ if (end >= zp->z_size) {
+ rangelock_exit(lr);
+ return (0);
+ }
+
+ error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, end,
+ DMU_OBJECT_END);
+ if (error) {
+ rangelock_exit(lr);
+ return (error);
+ }
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+ zfs_sa_upgrade_txholds(tx, zp);
+ dmu_tx_mark_netfree(tx);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ dmu_tx_abort(tx);
+ rangelock_exit(lr);
+ return (error);
+ }
+
+ zp->z_size = end;
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs),
+ NULL, &zp->z_size, sizeof (zp->z_size));
+
+ if (end == 0) {
+ zp->z_pflags &= ~ZFS_SPARSE;
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
+ NULL, &zp->z_pflags, 8);
+ }
+ VERIFY(sa_bulk_update(zp->z_sa_hdl, bulk, count, tx) == 0);
+
+ dmu_tx_commit(tx);
+ rangelock_exit(lr);
+
+ return (0);
+}
+
+/*
+ * Free space in a file
+ *
+ * IN: zp - znode of file to free data in.
+ * off - start of range
+ *		len	- length of range to free (0 => from off to EOF)
+ * flag - current file open mode flags.
+ * log - TRUE if this action should be logged
+ *
+ * RETURN: 0 on success, error code on failure
+ */
+int
+zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log)
+{
+ dmu_tx_t *tx;
+ zfsvfs_t *zfsvfs = ZTOZSB(zp);
+ zilog_t *zilog = zfsvfs->z_log;
+ uint64_t mode;
+ uint64_t mtime[2], ctime[2];
+ sa_bulk_attr_t bulk[3];
+ int count = 0;
+ int error;
+
+ if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), &mode,
+ sizeof (mode))) != 0)
+ return (error);
+
+ if (off > zp->z_size) {
+ error = zfs_extend(zp, off+len);
+ if (error == 0 && log)
+ goto log;
+ goto out;
+ }
+
+ if (len == 0) {
+ error = zfs_trunc(zp, off);
+ } else {
+ if ((error = zfs_free_range(zp, off, len)) == 0 &&
+ off + len > zp->z_size)
+ error = zfs_extend(zp, off+len);
+ }
+ if (error || !log)
+ goto out;
+log:
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+ zfs_sa_upgrade_txholds(tx, zp);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ dmu_tx_abort(tx);
+ goto out;
+ }
+
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, ctime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
+ NULL, &zp->z_pflags, 8);
+ zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime);
+ error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
+ ASSERT(error == 0);
+
+ zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len);
+
+ dmu_tx_commit(tx);
+
+ zfs_inode_update(zp);
+ error = 0;
+
+out:
+ /*
+ * Truncate the page cache - for file truncate operations, use
+ * the purpose-built API for truncations. For punching operations,
+ * the truncation is handled under a range lock in zfs_free_range.
+ */
+ if (len == 0)
+ truncate_setsize(ZTOI(zp), off);
+ return (error);
+}
+
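+/*
+ * Illustrative calls (a sketch; the sizes are hypothetical):
+ *
+ *	error = zfs_freesp(zp, 4096, 0, 0, B_TRUE);	.. truncate to 4 KiB
+ *	error = zfs_freesp(zp, off, len, 0, B_TRUE);	.. free (punch) a range
+ */
+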
+void
+zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
+{
+ struct super_block *sb;
+ zfsvfs_t *zfsvfs;
+ uint64_t moid, obj, sa_obj, version;
+ uint64_t sense = ZFS_CASE_SENSITIVE;
+ uint64_t norm = 0;
+ nvpair_t *elem;
+ int size;
+ int error;
+ int i;
+ znode_t *rootzp = NULL;
+ vattr_t vattr;
+ znode_t *zp;
+ zfs_acl_ids_t acl_ids;
+
+	/*
+	 * First, attempt to create the master node. In an empty objset
+	 * there are no blocks to read, and thus there can be no i/o
+	 * errors (which we assert below).
+	 */
+ moid = MASTER_NODE_OBJ;
+ error = zap_create_claim(os, moid, DMU_OT_MASTER_NODE,
+ DMU_OT_NONE, 0, tx);
+ ASSERT(error == 0);
+
+ /*
+ * Set starting attributes.
+ */
+ version = zfs_zpl_version_map(spa_version(dmu_objset_spa(os)));
+ elem = NULL;
+ while ((elem = nvlist_next_nvpair(zplprops, elem)) != NULL) {
+ /* For the moment we expect all zpl props to be uint64_ts */
+ uint64_t val;
+ char *name;
+
+ ASSERT(nvpair_type(elem) == DATA_TYPE_UINT64);
+ VERIFY(nvpair_value_uint64(elem, &val) == 0);
+ name = nvpair_name(elem);
+ if (strcmp(name, zfs_prop_to_name(ZFS_PROP_VERSION)) == 0) {
+ if (val < version)
+ version = val;
+ } else {
+ error = zap_update(os, moid, name, 8, 1, &val, tx);
+ }
+ ASSERT(error == 0);
+ if (strcmp(name, zfs_prop_to_name(ZFS_PROP_NORMALIZE)) == 0)
+ norm = val;
+ else if (strcmp(name, zfs_prop_to_name(ZFS_PROP_CASE)) == 0)
+ sense = val;
+ }
+ ASSERT(version != 0);
+ error = zap_update(os, moid, ZPL_VERSION_STR, 8, 1, &version, tx);
+
+ /*
+ * Create zap object used for SA attribute registration
+ */
+
+ if (version >= ZPL_VERSION_SA) {
+ sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE,
+ DMU_OT_NONE, 0, tx);
+ error = zap_add(os, moid, ZFS_SA_ATTRS, 8, 1, &sa_obj, tx);
+ ASSERT(error == 0);
+ } else {
+ sa_obj = 0;
+ }
+ /*
+ * Create a delete queue.
+ */
+ obj = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx);
+
+ error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &obj, tx);
+ ASSERT(error == 0);
+
+ /*
+ * Create root znode. Create minimal znode/inode/zfsvfs/sb
+ * to allow zfs_mknode to work.
+ */
+ vattr.va_mask = ATTR_MODE|ATTR_UID|ATTR_GID;
+ vattr.va_mode = S_IFDIR|0755;
+ vattr.va_uid = crgetuid(cr);
+ vattr.va_gid = crgetgid(cr);
+
+ rootzp = kmem_cache_alloc(znode_cache, KM_SLEEP);
+ rootzp->z_unlinked = B_FALSE;
+ rootzp->z_atime_dirty = B_FALSE;
+ rootzp->z_moved = B_FALSE;
+ rootzp->z_is_sa = USE_SA(version, os);
+ rootzp->z_pflags = 0;
+
+ zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
+ zfsvfs->z_os = os;
+ zfsvfs->z_parent = zfsvfs;
+ zfsvfs->z_version = version;
+ zfsvfs->z_use_fuids = USE_FUIDS(version, os);
+ zfsvfs->z_use_sa = USE_SA(version, os);
+ zfsvfs->z_norm = norm;
+
+ sb = kmem_zalloc(sizeof (struct super_block), KM_SLEEP);
+ sb->s_fs_info = zfsvfs;
+
+ ZTOI(rootzp)->i_sb = sb;
+
+ error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END,
+ &zfsvfs->z_attr_table);
+
+ ASSERT(error == 0);
+
+ /*
+ * Fold case on file systems that are always or sometimes case
+ * insensitive.
+ */
+ if (sense == ZFS_CASE_INSENSITIVE || sense == ZFS_CASE_MIXED)
+ zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER;
+
+ mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
+ list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
+ offsetof(znode_t, z_link_node));
+
+ size = MIN(1 << (highbit64(zfs_object_mutex_size)-1), ZFS_OBJ_MTX_MAX);
+ zfsvfs->z_hold_size = size;
+ zfsvfs->z_hold_trees = vmem_zalloc(sizeof (avl_tree_t) * size,
+ KM_SLEEP);
+ zfsvfs->z_hold_locks = vmem_zalloc(sizeof (kmutex_t) * size, KM_SLEEP);
+ for (i = 0; i != size; i++) {
+ avl_create(&zfsvfs->z_hold_trees[i], zfs_znode_hold_compare,
+ sizeof (znode_hold_t), offsetof(znode_hold_t, zh_node));
+ mutex_init(&zfsvfs->z_hold_locks[i], NULL, MUTEX_DEFAULT, NULL);
+ }
+
+ VERIFY(0 == zfs_acl_ids_create(rootzp, IS_ROOT_NODE, &vattr,
+ cr, NULL, &acl_ids));
+ zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, &acl_ids);
+ ASSERT3P(zp, ==, rootzp);
+ error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &rootzp->z_id, tx);
+ ASSERT(error == 0);
+ zfs_acl_ids_free(&acl_ids);
+
+ atomic_set(&ZTOI(rootzp)->i_count, 0);
+ sa_handle_destroy(rootzp->z_sa_hdl);
+ kmem_cache_free(znode_cache, rootzp);
+
+ for (i = 0; i != size; i++) {
+ avl_destroy(&zfsvfs->z_hold_trees[i]);
+ mutex_destroy(&zfsvfs->z_hold_locks[i]);
+ }
+
+ mutex_destroy(&zfsvfs->z_znodes_lock);
+
+ vmem_free(zfsvfs->z_hold_trees, sizeof (avl_tree_t) * size);
+ vmem_free(zfsvfs->z_hold_locks, sizeof (kmutex_t) * size);
+ kmem_free(sb, sizeof (struct super_block));
+ kmem_free(zfsvfs, sizeof (zfsvfs_t));
+}
+#endif /* _KERNEL */
+
+static int
+zfs_sa_setup(objset_t *osp, sa_attr_type_t **sa_table)
+{
+ uint64_t sa_obj = 0;
+ int error;
+
+ error = zap_lookup(osp, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, &sa_obj);
+ if (error != 0 && error != ENOENT)
+ return (error);
+
+ error = sa_setup(osp, sa_obj, zfs_attr_table, ZPL_END, sa_table);
+ return (error);
+}
+
+static int
+zfs_grab_sa_handle(objset_t *osp, uint64_t obj, sa_handle_t **hdlp,
+ dmu_buf_t **db, void *tag)
+{
+ dmu_object_info_t doi;
+ int error;
+
+ if ((error = sa_buf_hold(osp, obj, tag, db)) != 0)
+ return (error);
+
+ dmu_object_info_from_db(*db, &doi);
+ if ((doi.doi_bonus_type != DMU_OT_SA &&
+ doi.doi_bonus_type != DMU_OT_ZNODE) ||
+ (doi.doi_bonus_type == DMU_OT_ZNODE &&
+ doi.doi_bonus_size < sizeof (znode_phys_t))) {
+ sa_buf_rele(*db, tag);
+ return (SET_ERROR(ENOTSUP));
+ }
+
+ error = sa_handle_get(osp, obj, NULL, SA_HDL_PRIVATE, hdlp);
+ if (error != 0) {
+ sa_buf_rele(*db, tag);
+ return (error);
+ }
+
+ return (0);
+}
+
+void
+zfs_release_sa_handle(sa_handle_t *hdl, dmu_buf_t *db, void *tag)
+{
+ sa_handle_destroy(hdl);
+ sa_buf_rele(db, tag);
+}
+
+/*
+ * Given an object number, return its parent object number and whether
+ * or not the object is an extended attribute directory.
+ */
+static int
+zfs_obj_to_pobj(objset_t *osp, sa_handle_t *hdl, sa_attr_type_t *sa_table,
+ uint64_t *pobjp, int *is_xattrdir)
+{
+ uint64_t parent;
+ uint64_t pflags;
+ uint64_t mode;
+ uint64_t parent_mode;
+ sa_bulk_attr_t bulk[3];
+ sa_handle_t *sa_hdl;
+ dmu_buf_t *sa_db;
+ int count = 0;
+ int error;
+
+ SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_PARENT], NULL,
+ &parent, sizeof (parent));
+ SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_FLAGS], NULL,
+ &pflags, sizeof (pflags));
+ SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL,
+ &mode, sizeof (mode));
+
+ if ((error = sa_bulk_lookup(hdl, bulk, count)) != 0)
+ return (error);
+
+ /*
+ * When a link is removed its parent pointer is not changed and will
+ * be invalid. There are two cases where a link is removed but the
+	 * file stays around: when it goes to the delete queue and when there
+	 * are additional links.
+ */
+ error = zfs_grab_sa_handle(osp, parent, &sa_hdl, &sa_db, FTAG);
+ if (error != 0)
+ return (error);
+
+ error = sa_lookup(sa_hdl, ZPL_MODE, &parent_mode, sizeof (parent_mode));
+ zfs_release_sa_handle(sa_hdl, sa_db, FTAG);
+ if (error != 0)
+ return (error);
+
+ *is_xattrdir = ((pflags & ZFS_XATTR) != 0) && S_ISDIR(mode);
+
+ /*
+ * Extended attributes can be applied to files, directories, etc.
+ * Otherwise the parent must be a directory.
+ */
+ if (!*is_xattrdir && !S_ISDIR(parent_mode))
+ return (SET_ERROR(EINVAL));
+
+ *pobjp = parent;
+
+ return (0);
+}
+
+/*
+ * Given an object number, return some ZPL-level statistics.
+ */
+static int
+zfs_obj_to_stats_impl(sa_handle_t *hdl, sa_attr_type_t *sa_table,
+ zfs_stat_t *sb)
+{
+ sa_bulk_attr_t bulk[4];
+ int count = 0;
+
+ SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL,
+ &sb->zs_mode, sizeof (sb->zs_mode));
+ SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_GEN], NULL,
+ &sb->zs_gen, sizeof (sb->zs_gen));
+ SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_LINKS], NULL,
+ &sb->zs_links, sizeof (sb->zs_links));
+ SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_CTIME], NULL,
+ &sb->zs_ctime, sizeof (sb->zs_ctime));
+
+ return (sa_bulk_lookup(hdl, bulk, count));
+}
+
+static int
+zfs_obj_to_path_impl(objset_t *osp, uint64_t obj, sa_handle_t *hdl,
+ sa_attr_type_t *sa_table, char *buf, int len)
+{
+ sa_handle_t *sa_hdl;
+ sa_handle_t *prevhdl = NULL;
+ dmu_buf_t *prevdb = NULL;
+ dmu_buf_t *sa_db = NULL;
+ char *path = buf + len - 1;
+ int error;
+
+ *path = '\0';
+ sa_hdl = hdl;
+
+ uint64_t deleteq_obj;
+ VERIFY0(zap_lookup(osp, MASTER_NODE_OBJ,
+ ZFS_UNLINKED_SET, sizeof (uint64_t), 1, &deleteq_obj));
+ error = zap_lookup_int(osp, deleteq_obj, obj);
+ if (error == 0) {
+ return (ESTALE);
+ } else if (error != ENOENT) {
+ return (error);
+ }
+ error = 0;
+
+ for (;;) {
+ uint64_t pobj = 0;
+ char component[MAXNAMELEN + 2];
+ size_t complen;
+ int is_xattrdir = 0;
+
+ if (prevdb)
+ zfs_release_sa_handle(prevhdl, prevdb, FTAG);
+
+ if ((error = zfs_obj_to_pobj(osp, sa_hdl, sa_table, &pobj,
+ &is_xattrdir)) != 0)
+ break;
+
+ if (pobj == obj) {
+ if (path[0] != '/')
+ *--path = '/';
+ break;
+ }
+
+ component[0] = '/';
+ if (is_xattrdir) {
+ (void) sprintf(component + 1, "<xattrdir>");
+ } else {
+ error = zap_value_search(osp, pobj, obj,
+ ZFS_DIRENT_OBJ(-1ULL), component + 1);
+ if (error != 0)
+ break;
+ }
+
+ complen = strlen(component);
+ path -= complen;
+ ASSERT(path >= buf);
+ bcopy(component, path, complen);
+ obj = pobj;
+
+ if (sa_hdl != hdl) {
+ prevhdl = sa_hdl;
+ prevdb = sa_db;
+ }
+ error = zfs_grab_sa_handle(osp, obj, &sa_hdl, &sa_db, FTAG);
+ if (error != 0) {
+ sa_hdl = prevhdl;
+ sa_db = prevdb;
+ break;
+ }
+ }
+
+ if (sa_hdl != NULL && sa_hdl != hdl) {
+ ASSERT(sa_db != NULL);
+ zfs_release_sa_handle(sa_hdl, sa_db, FTAG);
+ }
+
+ if (error == 0)
+ (void) memmove(buf, path, buf + len - path);
+
+ return (error);
+}
+
+int
+zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len)
+{
+ sa_attr_type_t *sa_table;
+ sa_handle_t *hdl;
+ dmu_buf_t *db;
+ int error;
+
+ error = zfs_sa_setup(osp, &sa_table);
+ if (error != 0)
+ return (error);
+
+ error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG);
+ if (error != 0)
+ return (error);
+
+ error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len);
+
+ zfs_release_sa_handle(hdl, db, FTAG);
+ return (error);
+}
+
+int
+zfs_obj_to_stats(objset_t *osp, uint64_t obj, zfs_stat_t *sb,
+ char *buf, int len)
+{
+ char *path = buf + len - 1;
+ sa_attr_type_t *sa_table;
+ sa_handle_t *hdl;
+ dmu_buf_t *db;
+ int error;
+
+ *path = '\0';
+
+ error = zfs_sa_setup(osp, &sa_table);
+ if (error != 0)
+ return (error);
+
+ error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG);
+ if (error != 0)
+ return (error);
+
+ error = zfs_obj_to_stats_impl(hdl, sa_table, sb);
+ if (error != 0) {
+ zfs_release_sa_handle(hdl, db, FTAG);
+ return (error);
+ }
+
+ error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len);
+
+ zfs_release_sa_handle(hdl, db, FTAG);
+ return (error);
+}
+
+#if defined(_KERNEL)
+EXPORT_SYMBOL(zfs_create_fs);
+EXPORT_SYMBOL(zfs_obj_to_path);
+
+/* CSTYLED */
+module_param(zfs_object_mutex_size, uint, 0644);
+MODULE_PARM_DESC(zfs_object_mutex_size, "Size of znode hold array");
+module_param(zfs_unlink_suspend_progress, int, 0644);
+MODULE_PARM_DESC(zfs_unlink_suspend_progress, "Set to prevent async unlinks "
+"(debug - leaks space into the unlinked set)");
+#endif
diff --git a/module/os/linux/zfs/zio_crypt.c b/module/os/linux/zfs/zio_crypt.c
new file mode 100644
index 000000000..486622c8a
--- /dev/null
+++ b/module/os/linux/zfs/zio_crypt.c
@@ -0,0 +1,2036 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2017, Datto, Inc. All rights reserved.
+ */
+
+#include <sys/zio_crypt.h>
+#include <sys/dmu.h>
+#include <sys/dmu_objset.h>
+#include <sys/dnode.h>
+#include <sys/fs/zfs.h>
+#include <sys/zio.h>
+#include <sys/zil.h>
+#include <sys/sha2.h>
+#include <sys/hkdf.h>
+#include <sys/qat.h>
+
+/*
+ * This file is responsible for handling all of the details of generating
+ * encryption parameters and performing encryption and authentication.
+ *
+ * BLOCK ENCRYPTION PARAMETERS:
+ * Encryption / Authentication Algorithm Suite (crypt):
+ * The encryption algorithm, mode, and key length we are going to use. We
+ * currently support AES in either GCM or CCM modes with 128, 192, and 256 bit
+ * keys. All authentication is currently done with SHA512-HMAC.
+ *
+ * Plaintext:
+ * The unencrypted data that we want to encrypt.
+ *
+ * Initialization Vector (IV):
+ * An initialization vector for the encryption algorithms. This is used to
+ * "tweak" the encryption algorithms so that two blocks of the same data are
+ * encrypted into different ciphertext outputs, thus obfuscating block patterns.
+ * The supported encryption modes (AES-GCM and AES-CCM) require that an IV is
+ * never reused with the same encryption key. This value is stored unencrypted
+ * and must simply be provided to the decryption function. We use a 96 bit IV
+ * (as recommended by NIST) for all block encryption. For non-dedup blocks we
+ * derive the IV randomly. The first 64 bits of the IV are stored in the second
+ * word of DVA[2] and the remaining 32 bits are stored in the upper 32 bits of
+ * blk_fill. This is safe because encrypted blocks can't use the upper 32 bits
+ * of blk_fill. We only encrypt level 0 blocks, which normally have a fill count
+ * of 1. The only exception is for DMU_OT_DNODE objects, where the fill count of
+ * level 0 blocks is the number of allocated dnodes in that block. The on-disk
+ * format supports at most 2^15 slots per L0 dnode block, because the maximum
+ * block size is 16MB (2^24). In either case, for level 0 blocks this number
+ * will still be smaller than UINT32_MAX so it is safe to store the IV in the
+ * top 32 bits of blk_fill, while leaving the bottom 32 bits of the fill count
+ * for the dnode code.
+ *
+ * Master key:
+ * This is the most important secret data of an encrypted dataset. It is used
+ * along with the salt to generate the actual encryption keys via HKDF. We
+ * do not use the master key to directly encrypt any data because there are
+ * theoretical limits on how much data can actually be safely encrypted with
+ * any encryption mode. The master key is stored encrypted on disk with the
+ * user's wrapping key. Its length is determined by the encryption algorithm.
+ * For details on how this is stored see the block comment in dsl_crypt.c
+ *
+ * Salt:
+ * Used as an input to the HKDF function, along with the master key. We use a
+ * 64 bit salt, stored unencrypted in the first word of DVA[2]. Any given salt
+ * can be used for encrypting many blocks, so we cache the current salt and the
+ * associated derived key in zio_crypt_t so we do not need to derive it again
+ * needlessly.
+ *
+ * Encryption Key:
+ * A secret binary key, generated from an HKDF function used to encrypt and
+ * decrypt data.
+ *
+ * Message Authentication Code (MAC):
+ * The MAC is an output of authenticated encryption modes such as AES-GCM and
+ * AES-CCM. Its purpose is to ensure that an attacker cannot modify encrypted
+ * data on disk and return garbage to the application. Effectively, it is a
+ * checksum that cannot be reproduced by an attacker. We store the MAC in the
+ * second 128 bits of blk_cksum, leaving the first 128 bits for a truncated
+ * regular checksum of the ciphertext which can be used for scrubbing.
+ *
+ * OBJECT AUTHENTICATION:
+ * Some object types, such as DMU_OT_MASTER_NODE, cannot be encrypted because
+ * they contain some info that always needs to be readable. To prevent this
+ * data from being altered, we authenticate this data using SHA512-HMAC. This
+ * will produce a MAC (similar to the one produced via encryption) which can
+ * be used to verify the object was not modified. HMACs do not require key
+ * rotation or IVs, so we can keep up to the full 3 copies of authenticated
+ * data.
+ *
+ * ZIL ENCRYPTION:
+ * ZIL blocks have their bp written to disk ahead of the associated data, so we
+ * cannot store the MAC there as we normally do. For these blocks the MAC is
+ * stored in the embedded checksum within the zil_chain_t header. The salt and
+ * IV are generated for the block on bp allocation instead of at encryption
+ * time. In addition, ZIL blocks have some pieces that must be left in plaintext
+ * for claiming even though all of the sensitive user data still needs to be
+ * encrypted. The function zio_crypt_init_uios_zil() handles parsing which
+ * pieces of the block need to be encrypted. All data that is not encrypted is
+ * authenticated using the AAD mechanisms that the supported encryption modes
+ * provide for. In order to preserve the semantics of the ZIL for encrypted
+ * datasets, the ZIL is not protected at the objset level as described below.
+ *
+ * DNODE ENCRYPTION:
+ * Similarly to ZIL blocks, the core part of each dnode_phys_t needs to be left
+ * in plaintext for scrubbing and claiming, but the bonus buffers might contain
+ * sensitive user data. The function zio_crypt_init_uios_dnode() handles parsing
+ * which pieces of the block need to be encrypted. For more details about
+ * dnode authentication and encryption, see zio_crypt_init_uios_dnode().
+ *
+ * OBJECT SET AUTHENTICATION:
+ * Up to this point, everything we have encrypted and authenticated has been
+ * at level 0 (or -2 for the ZIL). If we did not do any further work the
+ * on-disk format would be susceptible to attacks that deleted or rearranged
+ * the order of level 0 blocks. Ideally, the cleanest solution would be to
+ * maintain a tree of authentication MACs going up the bp tree. However, this
+ * presents a problem for raw sends. Send files do not send information about
+ * indirect blocks so there would be no convenient way to transfer the MACs and
+ * they cannot be recalculated on the receive side without the master key which
+ * would defeat one of the purposes of raw sends in the first place. Instead,
+ * for the indirect levels of the bp tree, we use a regular SHA512 of the MACs
+ * from the level below. We also include some portable fields from blk_prop such
+ * as the lsize and compression algorithm to prevent the data from being
+ * misinterpreted.
+ *
+ * At the objset level, we maintain 2 separate 256 bit MACs in the
+ * objset_phys_t. The first one is "portable" and is the logical root of the
+ * MAC tree maintained in the metadnode's bps. The second is "local" and is
+ * used as the root MAC for the user accounting objects, which are also not
+ * transferred via "zfs send". The portable MAC is sent in the DRR_BEGIN payload
+ * of the send file. The useraccounting code ensures that the useraccounting
+ * info is not present upon a receive, so the local MAC can simply be cleared
+ * out at that time. For more info about objset_phys_t authentication, see
+ * zio_crypt_do_objset_hmacs().
+ *
+ * CONSIDERATIONS FOR DEDUP:
+ * In order for dedup to work, blocks that we want to dedup with one another
+ * need to use the same IV and encryption key, so that they will have the same
+ * ciphertext. Normally, one should never reuse an IV with the same encryption
+ * key or else AES-GCM and AES-CCM can both actually leak the plaintext of both
+ * blocks. In this case, however, since we are using the same plaintext as
+ * well, all that we end up with is a duplicate of the original ciphertext we
+ * already had. As a result, an attacker with read access to the raw disk will
+ * be able to tell which blocks are the same but this information is given away
+ * by dedup anyway. In order to get the same IVs and encryption keys for
+ * equivalent blocks of data we use an HMAC of the plaintext. We use an HMAC
+ * here so that a reproducible checksum of the plaintext is never available to
+ * the attacker. The HMAC key is kept alongside the master key, encrypted on
+ * disk. The first 64 bits of the HMAC are used in place of the random salt, and
+ * the next 96 bits are used as the IV. As a result of this mechanism, dedup
+ * will only work within a clone family since encrypted dedup requires use of
+ * the same master and HMAC keys.
+ */
+
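+/*
+ * Illustrative summary (a sketch, not normative) of where the encryption
+ * parameters described above live in an encrypted blkptr_t:
+ *
+ *	DVA[2].dva_word[0]		64 bit salt
+ *	DVA[2].dva_word[1]		low 64 bits of the 96 bit IV
+ *	blk_fill (upper 32 bits)	high 32 bits of the IV (IV2)
+ *	blk_cksum.zc_word[0..1]		truncated checksum of the ciphertext
+ *	blk_cksum.zc_word[2..3]		128 bit MAC
+ */
+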
+/*
+ * After encrypting many blocks with the same key we may start to run up
+ * against the theoretical limits of how much data can securely be encrypted
+ * with a single key using the supported encryption modes. The most obvious
+ * limitation is that our risk of generating 2 equivalent 96 bit IVs increases
+ * the more IVs we generate (which both GCM and CCM modes strictly forbid).
+ * This risk actually grows surprisingly quickly over time according to the
+ * Birthday Problem. With a total IV space of 2^96, and assuming we have
+ * generated n IVs with a cryptographically secure RNG, the approximate
+ * probability p(n) of at least one collision is given as:
+ *
+ * p(n) ~= 1 - e^(-n*(n-1)/(2*(2^96)))
+ *
+ * [http://www.math.cornell.edu/~mec/2008-2009/TianyiZheng/Birthday.html]
+ *
+ * Assuming that we want to ensure that p(n) never goes over 1 / 1 trillion,
+ * we must not write more than 398,065,730 blocks with the same encryption key.
+ * Therefore, we rotate our keys after 400,000,000 blocks have been written by
+ * generating a new random 64 bit salt for our HKDF encryption key generation
+ * function.
+ */
+#define ZFS_KEY_MAX_SALT_USES_DEFAULT 400000000
+#define ZFS_CURRENT_MAX_SALT_USES \
+ (MIN(zfs_key_max_salt_uses, ZFS_KEY_MAX_SALT_USES_DEFAULT))
+unsigned long zfs_key_max_salt_uses = ZFS_KEY_MAX_SALT_USES_DEFAULT;
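+
+/*
+ * Worked check of the bound above (illustrative only): for small exponents,
+ * 1 - e^(-x) ~= x, so p(n) ~= n*(n-1)/(2*(2^96)) ~= n^2/(2^97). Solving
+ * p(n) = 10^-12 for n gives
+ *
+ *	n = sqrt((2^97) * 10^-12) ~= 3.98e8
+ *
+ * which matches the 398,065,730 block figure quoted above.
+ */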
+
+typedef struct blkptr_auth_buf {
+ uint64_t bab_prop; /* blk_prop - portable mask */
+ uint8_t bab_mac[ZIO_DATA_MAC_LEN]; /* MAC from blk_cksum */
+ uint64_t bab_pad; /* reserved for future use */
+} blkptr_auth_buf_t;
+
+zio_crypt_info_t zio_crypt_table[ZIO_CRYPT_FUNCTIONS] = {
+ {"", ZC_TYPE_NONE, 0, "inherit"},
+ {"", ZC_TYPE_NONE, 0, "on"},
+ {"", ZC_TYPE_NONE, 0, "off"},
+ {SUN_CKM_AES_CCM, ZC_TYPE_CCM, 16, "aes-128-ccm"},
+ {SUN_CKM_AES_CCM, ZC_TYPE_CCM, 24, "aes-192-ccm"},
+ {SUN_CKM_AES_CCM, ZC_TYPE_CCM, 32, "aes-256-ccm"},
+ {SUN_CKM_AES_GCM, ZC_TYPE_GCM, 16, "aes-128-gcm"},
+ {SUN_CKM_AES_GCM, ZC_TYPE_GCM, 24, "aes-192-gcm"},
+ {SUN_CKM_AES_GCM, ZC_TYPE_GCM, 32, "aes-256-gcm"}
+};
+
+void
+zio_crypt_key_destroy(zio_crypt_key_t *key)
+{
+ rw_destroy(&key->zk_salt_lock);
+
+ /* free crypto templates */
+ crypto_destroy_ctx_template(key->zk_current_tmpl);
+ crypto_destroy_ctx_template(key->zk_hmac_tmpl);
+
+ /* zero out sensitive data */
+ bzero(key, sizeof (zio_crypt_key_t));
+}
+
+int
+zio_crypt_key_init(uint64_t crypt, zio_crypt_key_t *key)
+{
+ int ret;
+ crypto_mechanism_t mech;
+ uint_t keydata_len;
+
+ ASSERT(key != NULL);
+ ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS);
+
+ keydata_len = zio_crypt_table[crypt].ci_keylen;
+ bzero(key, sizeof (zio_crypt_key_t));
+
+ /* fill keydata buffers and salt with random data */
+ ret = random_get_bytes((uint8_t *)&key->zk_guid, sizeof (uint64_t));
+ if (ret != 0)
+ goto error;
+
+ ret = random_get_bytes(key->zk_master_keydata, keydata_len);
+ if (ret != 0)
+ goto error;
+
+ ret = random_get_bytes(key->zk_hmac_keydata, SHA512_HMAC_KEYLEN);
+ if (ret != 0)
+ goto error;
+
+ ret = random_get_bytes(key->zk_salt, ZIO_DATA_SALT_LEN);
+ if (ret != 0)
+ goto error;
+
+ /* derive the current key from the master key */
+ ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0,
+ key->zk_salt, ZIO_DATA_SALT_LEN, key->zk_current_keydata,
+ keydata_len);
+ if (ret != 0)
+ goto error;
+
+ /* initialize keys for the ICP */
+ key->zk_current_key.ck_format = CRYPTO_KEY_RAW;
+ key->zk_current_key.ck_data = key->zk_current_keydata;
+ key->zk_current_key.ck_length = CRYPTO_BYTES2BITS(keydata_len);
+
+ key->zk_hmac_key.ck_format = CRYPTO_KEY_RAW;
+	key->zk_hmac_key.ck_data = key->zk_hmac_keydata;
+ key->zk_hmac_key.ck_length = CRYPTO_BYTES2BITS(SHA512_HMAC_KEYLEN);
+
+ /*
+ * Initialize the crypto templates. It's ok if this fails because
+ * this is just an optimization.
+ */
+ mech.cm_type = crypto_mech2id(zio_crypt_table[crypt].ci_mechname);
+ ret = crypto_create_ctx_template(&mech, &key->zk_current_key,
+ &key->zk_current_tmpl, KM_SLEEP);
+ if (ret != CRYPTO_SUCCESS)
+ key->zk_current_tmpl = NULL;
+
+ mech.cm_type = crypto_mech2id(SUN_CKM_SHA512_HMAC);
+ ret = crypto_create_ctx_template(&mech, &key->zk_hmac_key,
+ &key->zk_hmac_tmpl, KM_SLEEP);
+ if (ret != CRYPTO_SUCCESS)
+ key->zk_hmac_tmpl = NULL;
+
+ key->zk_crypt = crypt;
+ key->zk_version = ZIO_CRYPT_KEY_CURRENT_VERSION;
+ key->zk_salt_count = 0;
+ rw_init(&key->zk_salt_lock, NULL, RW_DEFAULT, NULL);
+
+ return (0);
+
+error:
+ zio_crypt_key_destroy(key);
+ return (ret);
+}
+
+static int
+zio_crypt_key_change_salt(zio_crypt_key_t *key)
+{
+ int ret = 0;
+ uint8_t salt[ZIO_DATA_SALT_LEN];
+ crypto_mechanism_t mech;
+ uint_t keydata_len = zio_crypt_table[key->zk_crypt].ci_keylen;
+
+ /* generate a new salt */
+ ret = random_get_bytes(salt, ZIO_DATA_SALT_LEN);
+ if (ret != 0)
+ goto error;
+
+ rw_enter(&key->zk_salt_lock, RW_WRITER);
+
+ /* someone beat us to the salt rotation, just unlock and return */
+ if (key->zk_salt_count < ZFS_CURRENT_MAX_SALT_USES)
+ goto out_unlock;
+
+ /* derive the current key from the master key and the new salt */
+ ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0,
+ salt, ZIO_DATA_SALT_LEN, key->zk_current_keydata, keydata_len);
+ if (ret != 0)
+ goto out_unlock;
+
+ /* assign the salt and reset the usage count */
+ bcopy(salt, key->zk_salt, ZIO_DATA_SALT_LEN);
+ key->zk_salt_count = 0;
+
+	/* destroy the old context template and create the new one */
+	mech.cm_type = crypto_mech2id(
+	    zio_crypt_table[key->zk_crypt].ci_mechname);
+	crypto_destroy_ctx_template(key->zk_current_tmpl);
+ ret = crypto_create_ctx_template(&mech, &key->zk_current_key,
+ &key->zk_current_tmpl, KM_SLEEP);
+ if (ret != CRYPTO_SUCCESS)
+ key->zk_current_tmpl = NULL;
+
+ rw_exit(&key->zk_salt_lock);
+
+ return (0);
+
+out_unlock:
+ rw_exit(&key->zk_salt_lock);
+error:
+ return (ret);
+}
+
+/* See comment above zfs_key_max_salt_uses definition for details */
+int
+zio_crypt_key_get_salt(zio_crypt_key_t *key, uint8_t *salt)
+{
+ int ret;
+ boolean_t salt_change;
+
+ rw_enter(&key->zk_salt_lock, RW_READER);
+
+ bcopy(key->zk_salt, salt, ZIO_DATA_SALT_LEN);
+ salt_change = (atomic_inc_64_nv(&key->zk_salt_count) >=
+ ZFS_CURRENT_MAX_SALT_USES);
+
+ rw_exit(&key->zk_salt_lock);
+
+ if (salt_change) {
+ ret = zio_crypt_key_change_salt(key);
+ if (ret != 0)
+ goto error;
+ }
+
+ return (0);
+
+error:
+ return (ret);
+}
+
+/*
+ * This function handles all encryption and decryption in zfs. When
+ * encrypting, it expects puio to reference the plaintext and cuio to
+ * reference the ciphertext. cuio must have enough space for the
+ * ciphertext + room for a MAC. datalen should be the length of the
+ * plaintext / ciphertext alone.
+ */
+static int
+zio_do_crypt_uio(boolean_t encrypt, uint64_t crypt, crypto_key_t *key,
+ crypto_ctx_template_t tmpl, uint8_t *ivbuf, uint_t datalen,
+ uio_t *puio, uio_t *cuio, uint8_t *authbuf, uint_t auth_len)
+{
+ int ret;
+ crypto_data_t plaindata, cipherdata;
+ CK_AES_CCM_PARAMS ccmp;
+ CK_AES_GCM_PARAMS gcmp;
+ crypto_mechanism_t mech;
+ zio_crypt_info_t crypt_info;
+ uint_t plain_full_len, maclen;
+
+ ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS);
+ ASSERT3U(key->ck_format, ==, CRYPTO_KEY_RAW);
+
+ /* lookup the encryption info */
+ crypt_info = zio_crypt_table[crypt];
+
+ /* the mac will always be the last iovec_t in the cipher uio */
+ maclen = cuio->uio_iov[cuio->uio_iovcnt - 1].iov_len;
+
+ ASSERT(maclen <= ZIO_DATA_MAC_LEN);
+
+ /* setup encryption mechanism (same as crypt) */
+ mech.cm_type = crypto_mech2id(crypt_info.ci_mechname);
+
+ /*
+ * Strangely, the ICP requires that plain_full_len must include
+ * the MAC length when decrypting, even though the UIO does not
+ * need to have the extra space allocated.
+ */
+ if (encrypt) {
+ plain_full_len = datalen;
+ } else {
+ plain_full_len = datalen + maclen;
+ }
+
+ /*
+ * setup encryption params (currently only AES CCM and AES GCM
+ * are supported)
+ */
+ if (crypt_info.ci_crypt_type == ZC_TYPE_CCM) {
+ ccmp.ulNonceSize = ZIO_DATA_IV_LEN;
+ ccmp.ulAuthDataSize = auth_len;
+ ccmp.authData = authbuf;
+ ccmp.ulMACSize = maclen;
+ ccmp.nonce = ivbuf;
+ ccmp.ulDataSize = plain_full_len;
+
+ mech.cm_param = (char *)(&ccmp);
+ mech.cm_param_len = sizeof (CK_AES_CCM_PARAMS);
+ } else {
+ gcmp.ulIvLen = ZIO_DATA_IV_LEN;
+ gcmp.ulIvBits = CRYPTO_BYTES2BITS(ZIO_DATA_IV_LEN);
+ gcmp.ulAADLen = auth_len;
+ gcmp.pAAD = authbuf;
+ gcmp.ulTagBits = CRYPTO_BYTES2BITS(maclen);
+ gcmp.pIv = ivbuf;
+
+ mech.cm_param = (char *)(&gcmp);
+ mech.cm_param_len = sizeof (CK_AES_GCM_PARAMS);
+ }
+
+ /* populate the cipher and plain data structs. */
+ plaindata.cd_format = CRYPTO_DATA_UIO;
+ plaindata.cd_offset = 0;
+ plaindata.cd_uio = puio;
+ plaindata.cd_miscdata = NULL;
+ plaindata.cd_length = plain_full_len;
+
+ cipherdata.cd_format = CRYPTO_DATA_UIO;
+ cipherdata.cd_offset = 0;
+ cipherdata.cd_uio = cuio;
+ cipherdata.cd_miscdata = NULL;
+ cipherdata.cd_length = datalen + maclen;
+
+ /* perform the actual encryption */
+ if (encrypt) {
+ ret = crypto_encrypt(&mech, &plaindata, key, tmpl, &cipherdata,
+ NULL);
+ if (ret != CRYPTO_SUCCESS) {
+ ret = SET_ERROR(EIO);
+ goto error;
+ }
+ } else {
+ ret = crypto_decrypt(&mech, &cipherdata, key, tmpl, &plaindata,
+ NULL);
+ if (ret != CRYPTO_SUCCESS) {
+ ASSERT3U(ret, ==, CRYPTO_INVALID_MAC);
+ ret = SET_ERROR(ECKSUM);
+ goto error;
+ }
+ }
+
+ return (0);
+
+error:
+ return (ret);
+}
+
+int
+zio_crypt_key_wrap(crypto_key_t *cwkey, zio_crypt_key_t *key, uint8_t *iv,
+ uint8_t *mac, uint8_t *keydata_out, uint8_t *hmac_keydata_out)
+{
+ int ret;
+ uio_t puio, cuio;
+ uint64_t aad[3];
+ iovec_t plain_iovecs[2], cipher_iovecs[3];
+ uint64_t crypt = key->zk_crypt;
+ uint_t enc_len, keydata_len, aad_len;
+
+ ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS);
+ ASSERT3U(cwkey->ck_format, ==, CRYPTO_KEY_RAW);
+
+ keydata_len = zio_crypt_table[crypt].ci_keylen;
+
+ /* generate iv for wrapping the master and hmac key */
+ ret = random_get_pseudo_bytes(iv, WRAPPING_IV_LEN);
+ if (ret != 0)
+ goto error;
+
+ /* initialize uio_ts */
+ plain_iovecs[0].iov_base = key->zk_master_keydata;
+ plain_iovecs[0].iov_len = keydata_len;
+ plain_iovecs[1].iov_base = key->zk_hmac_keydata;
+ plain_iovecs[1].iov_len = SHA512_HMAC_KEYLEN;
+
+ cipher_iovecs[0].iov_base = keydata_out;
+ cipher_iovecs[0].iov_len = keydata_len;
+ cipher_iovecs[1].iov_base = hmac_keydata_out;
+ cipher_iovecs[1].iov_len = SHA512_HMAC_KEYLEN;
+ cipher_iovecs[2].iov_base = mac;
+ cipher_iovecs[2].iov_len = WRAPPING_MAC_LEN;
+
+ /*
+ * Although we don't support writing to the old format, we do
+ * support rewrapping the key so that the user can move and
+ * quarantine datasets on the old format.
+ */
+ if (key->zk_version == 0) {
+ aad_len = sizeof (uint64_t);
+ aad[0] = LE_64(key->zk_guid);
+ } else {
+ ASSERT3U(key->zk_version, ==, ZIO_CRYPT_KEY_CURRENT_VERSION);
+ aad_len = sizeof (uint64_t) * 3;
+ aad[0] = LE_64(key->zk_guid);
+ aad[1] = LE_64(crypt);
+ aad[2] = LE_64(key->zk_version);
+ }
+
+ enc_len = zio_crypt_table[crypt].ci_keylen + SHA512_HMAC_KEYLEN;
+ puio.uio_iov = plain_iovecs;
+ puio.uio_iovcnt = 2;
+ puio.uio_segflg = UIO_SYSSPACE;
+ cuio.uio_iov = cipher_iovecs;
+ cuio.uio_iovcnt = 3;
+ cuio.uio_segflg = UIO_SYSSPACE;
+
+ /* encrypt the keys and store the resulting ciphertext and mac */
+ ret = zio_do_crypt_uio(B_TRUE, crypt, cwkey, NULL, iv, enc_len,
+ &puio, &cuio, (uint8_t *)aad, aad_len);
+ if (ret != 0)
+ goto error;
+
+ return (0);
+
+error:
+ return (ret);
+}
+
+int
+zio_crypt_key_unwrap(crypto_key_t *cwkey, uint64_t crypt, uint64_t version,
+ uint64_t guid, uint8_t *keydata, uint8_t *hmac_keydata, uint8_t *iv,
+ uint8_t *mac, zio_crypt_key_t *key)
+{
+ int ret;
+ crypto_mechanism_t mech;
+ uio_t puio, cuio;
+ uint64_t aad[3];
+ iovec_t plain_iovecs[2], cipher_iovecs[3];
+ uint_t enc_len, keydata_len, aad_len;
+
+ ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS);
+ ASSERT3U(cwkey->ck_format, ==, CRYPTO_KEY_RAW);
+
+ rw_init(&key->zk_salt_lock, NULL, RW_DEFAULT, NULL);
+
+ keydata_len = zio_crypt_table[crypt].ci_keylen;
+
+ /* initialize uio_ts */
+ plain_iovecs[0].iov_base = key->zk_master_keydata;
+ plain_iovecs[0].iov_len = keydata_len;
+ plain_iovecs[1].iov_base = key->zk_hmac_keydata;
+ plain_iovecs[1].iov_len = SHA512_HMAC_KEYLEN;
+
+ cipher_iovecs[0].iov_base = keydata;
+ cipher_iovecs[0].iov_len = keydata_len;
+ cipher_iovecs[1].iov_base = hmac_keydata;
+ cipher_iovecs[1].iov_len = SHA512_HMAC_KEYLEN;
+ cipher_iovecs[2].iov_base = mac;
+ cipher_iovecs[2].iov_len = WRAPPING_MAC_LEN;
+
+ if (version == 0) {
+ aad_len = sizeof (uint64_t);
+ aad[0] = LE_64(guid);
+ } else {
+ ASSERT3U(version, ==, ZIO_CRYPT_KEY_CURRENT_VERSION);
+ aad_len = sizeof (uint64_t) * 3;
+ aad[0] = LE_64(guid);
+ aad[1] = LE_64(crypt);
+ aad[2] = LE_64(version);
+ }
+
+ enc_len = keydata_len + SHA512_HMAC_KEYLEN;
+ puio.uio_iov = plain_iovecs;
+ puio.uio_segflg = UIO_SYSSPACE;
+ puio.uio_iovcnt = 2;
+ cuio.uio_iov = cipher_iovecs;
+ cuio.uio_iovcnt = 3;
+ cuio.uio_segflg = UIO_SYSSPACE;
+
+ /* decrypt the keys and store the result in the output buffers */
+ ret = zio_do_crypt_uio(B_FALSE, crypt, cwkey, NULL, iv, enc_len,
+ &puio, &cuio, (uint8_t *)aad, aad_len);
+ if (ret != 0)
+ goto error;
+
+ /* generate a fresh salt */
+ ret = random_get_bytes(key->zk_salt, ZIO_DATA_SALT_LEN);
+ if (ret != 0)
+ goto error;
+
+ /* derive the current key from the master key */
+ ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0,
+ key->zk_salt, ZIO_DATA_SALT_LEN, key->zk_current_keydata,
+ keydata_len);
+ if (ret != 0)
+ goto error;
+
+ /* initialize keys for ICP */
+ key->zk_current_key.ck_format = CRYPTO_KEY_RAW;
+ key->zk_current_key.ck_data = key->zk_current_keydata;
+ key->zk_current_key.ck_length = CRYPTO_BYTES2BITS(keydata_len);
+
+ key->zk_hmac_key.ck_format = CRYPTO_KEY_RAW;
+ key->zk_hmac_key.ck_data = key->zk_hmac_keydata;
+ key->zk_hmac_key.ck_length = CRYPTO_BYTES2BITS(SHA512_HMAC_KEYLEN);
+
+ /*
+ * Initialize the crypto templates. It's ok if this fails because
+ * this is just an optimization.
+ */
+ mech.cm_type = crypto_mech2id(zio_crypt_table[crypt].ci_mechname);
+ ret = crypto_create_ctx_template(&mech, &key->zk_current_key,
+ &key->zk_current_tmpl, KM_SLEEP);
+ if (ret != CRYPTO_SUCCESS)
+ key->zk_current_tmpl = NULL;
+
+ mech.cm_type = crypto_mech2id(SUN_CKM_SHA512_HMAC);
+ ret = crypto_create_ctx_template(&mech, &key->zk_hmac_key,
+ &key->zk_hmac_tmpl, KM_SLEEP);
+ if (ret != CRYPTO_SUCCESS)
+ key->zk_hmac_tmpl = NULL;
+
+ key->zk_crypt = crypt;
+ key->zk_version = version;
+ key->zk_guid = guid;
+ key->zk_salt_count = 0;
+
+ return (0);
+
+error:
+ zio_crypt_key_destroy(key);
+ return (ret);
+}
+
+int
+zio_crypt_generate_iv(uint8_t *ivbuf)
+{
+ int ret;
+
+ /* randomly generate the IV */
+ ret = random_get_pseudo_bytes(ivbuf, ZIO_DATA_IV_LEN);
+ if (ret != 0)
+ goto error;
+
+ return (0);
+
+error:
+ bzero(ivbuf, ZIO_DATA_IV_LEN);
+ return (ret);
+}
+
+int
+zio_crypt_do_hmac(zio_crypt_key_t *key, uint8_t *data, uint_t datalen,
+ uint8_t *digestbuf, uint_t digestlen)
+{
+ int ret;
+ crypto_mechanism_t mech;
+ crypto_data_t in_data, digest_data;
+ uint8_t raw_digestbuf[SHA512_DIGEST_LENGTH];
+
+ ASSERT3U(digestlen, <=, SHA512_DIGEST_LENGTH);
+
+ /* initialize sha512-hmac mechanism and crypto data */
+ mech.cm_type = crypto_mech2id(SUN_CKM_SHA512_HMAC);
+ mech.cm_param = NULL;
+ mech.cm_param_len = 0;
+
+ /* initialize the crypto data */
+ in_data.cd_format = CRYPTO_DATA_RAW;
+ in_data.cd_offset = 0;
+ in_data.cd_length = datalen;
+ in_data.cd_raw.iov_base = (char *)data;
+ in_data.cd_raw.iov_len = in_data.cd_length;
+
+ digest_data.cd_format = CRYPTO_DATA_RAW;
+ digest_data.cd_offset = 0;
+ digest_data.cd_length = SHA512_DIGEST_LENGTH;
+ digest_data.cd_raw.iov_base = (char *)raw_digestbuf;
+ digest_data.cd_raw.iov_len = digest_data.cd_length;
+
+ /* generate the hmac */
+ ret = crypto_mac(&mech, &in_data, &key->zk_hmac_key, key->zk_hmac_tmpl,
+ &digest_data, NULL);
+ if (ret != CRYPTO_SUCCESS) {
+ ret = SET_ERROR(EIO);
+ goto error;
+ }
+
+ bcopy(raw_digestbuf, digestbuf, digestlen);
+
+ return (0);
+
+error:
+ bzero(digestbuf, digestlen);
+ return (ret);
+}
+
+int
+zio_crypt_generate_iv_salt_dedup(zio_crypt_key_t *key, uint8_t *data,
+ uint_t datalen, uint8_t *ivbuf, uint8_t *salt)
+{
+ int ret;
+ uint8_t digestbuf[SHA512_DIGEST_LENGTH];
+
+ ret = zio_crypt_do_hmac(key, data, datalen,
+ digestbuf, SHA512_DIGEST_LENGTH);
+ if (ret != 0)
+ return (ret);
+
+ bcopy(digestbuf, salt, ZIO_DATA_SALT_LEN);
+ bcopy(digestbuf + ZIO_DATA_SALT_LEN, ivbuf, ZIO_DATA_IV_LEN);
+
+ return (0);
+}
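+
+/*
+ * Note (illustrative): SHA512_DIGEST_LENGTH is 64 bytes, so the split above
+ * consumes only the first 20 bytes of the digest; bytes 0-7 become the
+ * 64 bit salt and bytes 8-19 become the 96 bit IV, matching the dedup
+ * layout described in the file comment.
+ */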
+
+/*
+ * The following functions are used to encode and decode encryption parameters
+ * into blkptr_t and zil_header_t. The ICP wants to use these parameters as
+ * byte strings, which normally means that these strings would not need to deal
+ * with byteswapping at all. However, both blkptr_t and zil_header_t may be
+ * byteswapped by lower layers and so we must "undo" that byteswap here upon
+ * decoding and encoding in a non-native byteorder. These functions require
+ * that the byteorder bit is correct before being called.
+ */
+void
+zio_crypt_encode_params_bp(blkptr_t *bp, uint8_t *salt, uint8_t *iv)
+{
+ uint64_t val64;
+ uint32_t val32;
+
+ ASSERT(BP_IS_ENCRYPTED(bp));
+
+ if (!BP_SHOULD_BYTESWAP(bp)) {
+ bcopy(salt, &bp->blk_dva[2].dva_word[0], sizeof (uint64_t));
+ bcopy(iv, &bp->blk_dva[2].dva_word[1], sizeof (uint64_t));
+ bcopy(iv + sizeof (uint64_t), &val32, sizeof (uint32_t));
+ BP_SET_IV2(bp, val32);
+ } else {
+ bcopy(salt, &val64, sizeof (uint64_t));
+ bp->blk_dva[2].dva_word[0] = BSWAP_64(val64);
+
+ bcopy(iv, &val64, sizeof (uint64_t));
+ bp->blk_dva[2].dva_word[1] = BSWAP_64(val64);
+
+ bcopy(iv + sizeof (uint64_t), &val32, sizeof (uint32_t));
+ BP_SET_IV2(bp, BSWAP_32(val32));
+ }
+}
+
+void
+zio_crypt_decode_params_bp(const blkptr_t *bp, uint8_t *salt, uint8_t *iv)
+{
+ uint64_t val64;
+ uint32_t val32;
+
+ ASSERT(BP_IS_PROTECTED(bp));
+
+ /* for convenience, so callers don't need to check */
+ if (BP_IS_AUTHENTICATED(bp)) {
+ bzero(salt, ZIO_DATA_SALT_LEN);
+ bzero(iv, ZIO_DATA_IV_LEN);
+ return;
+ }
+
+ if (!BP_SHOULD_BYTESWAP(bp)) {
+ bcopy(&bp->blk_dva[2].dva_word[0], salt, sizeof (uint64_t));
+ bcopy(&bp->blk_dva[2].dva_word[1], iv, sizeof (uint64_t));
+
+ val32 = (uint32_t)BP_GET_IV2(bp);
+ bcopy(&val32, iv + sizeof (uint64_t), sizeof (uint32_t));
+ } else {
+ val64 = BSWAP_64(bp->blk_dva[2].dva_word[0]);
+ bcopy(&val64, salt, sizeof (uint64_t));
+
+ val64 = BSWAP_64(bp->blk_dva[2].dva_word[1]);
+ bcopy(&val64, iv, sizeof (uint64_t));
+
+ val32 = BSWAP_32((uint32_t)BP_GET_IV2(bp));
+ bcopy(&val32, iv + sizeof (uint64_t), sizeof (uint32_t));
+ }
+}
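+
+/*
+ * Illustrative round-trip of the helpers above (a sketch, not part of the
+ * implementation), for some initialized zio_crypt_key_t *key and a bp that
+ * has already been marked encrypted:
+ *
+ *	uint8_t salt[ZIO_DATA_SALT_LEN], iv[ZIO_DATA_IV_LEN];
+ *	uint8_t salt2[ZIO_DATA_SALT_LEN], iv2[ZIO_DATA_IV_LEN];
+ *
+ *	VERIFY0(zio_crypt_key_get_salt(key, salt));
+ *	VERIFY0(zio_crypt_generate_iv(iv));
+ *	zio_crypt_encode_params_bp(bp, salt, iv);
+ *	zio_crypt_decode_params_bp(bp, salt2, iv2);
+ *	ASSERT0(bcmp(salt, salt2, ZIO_DATA_SALT_LEN));
+ *	ASSERT0(bcmp(iv, iv2, ZIO_DATA_IV_LEN));
+ */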
+
+void
+zio_crypt_encode_mac_bp(blkptr_t *bp, uint8_t *mac)
+{
+ uint64_t val64;
+
+ ASSERT(BP_USES_CRYPT(bp));
+ ASSERT3U(BP_GET_TYPE(bp), !=, DMU_OT_OBJSET);
+
+ if (!BP_SHOULD_BYTESWAP(bp)) {
+ bcopy(mac, &bp->blk_cksum.zc_word[2], sizeof (uint64_t));
+ bcopy(mac + sizeof (uint64_t), &bp->blk_cksum.zc_word[3],
+ sizeof (uint64_t));
+ } else {
+ bcopy(mac, &val64, sizeof (uint64_t));
+ bp->blk_cksum.zc_word[2] = BSWAP_64(val64);
+
+ bcopy(mac + sizeof (uint64_t), &val64, sizeof (uint64_t));
+ bp->blk_cksum.zc_word[3] = BSWAP_64(val64);
+ }
+}
+
+void
+zio_crypt_decode_mac_bp(const blkptr_t *bp, uint8_t *mac)
+{
+ uint64_t val64;
+
+ ASSERT(BP_USES_CRYPT(bp) || BP_IS_HOLE(bp));
+
+ /* for convenience, so callers don't need to check */
+ if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
+ bzero(mac, ZIO_DATA_MAC_LEN);
+ return;
+ }
+
+ if (!BP_SHOULD_BYTESWAP(bp)) {
+ bcopy(&bp->blk_cksum.zc_word[2], mac, sizeof (uint64_t));
+ bcopy(&bp->blk_cksum.zc_word[3], mac + sizeof (uint64_t),
+ sizeof (uint64_t));
+ } else {
+ val64 = BSWAP_64(bp->blk_cksum.zc_word[2]);
+ bcopy(&val64, mac, sizeof (uint64_t));
+
+ val64 = BSWAP_64(bp->blk_cksum.zc_word[3]);
+ bcopy(&val64, mac + sizeof (uint64_t), sizeof (uint64_t));
+ }
+}
+
+void
+zio_crypt_encode_mac_zil(void *data, uint8_t *mac)
+{
+ zil_chain_t *zilc = data;
+
+ bcopy(mac, &zilc->zc_eck.zec_cksum.zc_word[2], sizeof (uint64_t));
+ bcopy(mac + sizeof (uint64_t), &zilc->zc_eck.zec_cksum.zc_word[3],
+ sizeof (uint64_t));
+}
+
+void
+zio_crypt_decode_mac_zil(const void *data, uint8_t *mac)
+{
+ /*
+ * The ZIL MAC is embedded in the block it protects, which will
+ * not have been byteswapped by the time this function has been called.
+ * As a result, we don't need to worry about byteswapping the MAC.
+ */
+ const zil_chain_t *zilc = data;
+
+ bcopy(&zilc->zc_eck.zec_cksum.zc_word[2], mac, sizeof (uint64_t));
+ bcopy(&zilc->zc_eck.zec_cksum.zc_word[3], mac + sizeof (uint64_t),
+ sizeof (uint64_t));
+}
+
+/*
+ * This routine takes a block of dnodes (src_abd) and copies only the bonus
+ * buffers to the same offsets in the dst buffer. datalen should be the size
+ * of both the src_abd and the dst buffer (not just the length of the bonus
+ * buffers).
+ */
+void
+zio_crypt_copy_dnode_bonus(abd_t *src_abd, uint8_t *dst, uint_t datalen)
+{
+ uint_t i, max_dnp = datalen >> DNODE_SHIFT;
+ uint8_t *src;
+ dnode_phys_t *dnp, *sdnp, *ddnp;
+
+ src = abd_borrow_buf_copy(src_abd, datalen);
+
+ sdnp = (dnode_phys_t *)src;
+ ddnp = (dnode_phys_t *)dst;
+
+ for (i = 0; i < max_dnp; i += sdnp[i].dn_extra_slots + 1) {
+ dnp = &sdnp[i];
+ if (dnp->dn_type != DMU_OT_NONE &&
+ DMU_OT_IS_ENCRYPTED(dnp->dn_bonustype) &&
+ dnp->dn_bonuslen != 0) {
+ bcopy(DN_BONUS(dnp), DN_BONUS(&ddnp[i]),
+ DN_MAX_BONUS_LEN(dnp));
+ }
+ }
+
+ abd_return_buf(src_abd, src, datalen);
+}
+
+/*
+ * This function decides which fields from blk_prop are included in
+ * the various on-disk MAC algorithms.
+ */
+static void
+zio_crypt_bp_zero_nonportable_blkprop(blkptr_t *bp, uint64_t version)
+{
+ /*
+ * Version 0 did not properly zero out all non-portable fields
+ * as it should have done. We maintain this code so that we can
+ * do read-only imports of pools on this version.
+ */
+ if (version == 0) {
+ BP_SET_DEDUP(bp, 0);
+ BP_SET_CHECKSUM(bp, 0);
+ BP_SET_PSIZE(bp, SPA_MINBLOCKSIZE);
+ return;
+ }
+
+ ASSERT3U(version, ==, ZIO_CRYPT_KEY_CURRENT_VERSION);
+
+ /*
+ * The hole_birth feature might set these fields even if this bp
+ * is a hole. We zero them out here to guarantee that raw sends
+ * will function with or without the feature.
+ */
+ if (BP_IS_HOLE(bp)) {
+ bp->blk_prop = 0ULL;
+ return;
+ }
+
+ /*
+ * At L0 we want to verify these fields to ensure that data blocks
+ * can not be reinterpreted. For instance, we do not want an attacker
+ * to trick us into returning raw lz4 compressed data to the user
+ * by modifying the compression bits. At higher levels, we cannot
+ * enforce this policy since raw sends do not convey any information
+ * about indirect blocks, so these values might be different on the
+ * receive side. Fortunately, this does not open any new attack
+ * vectors, since any alterations that can be made to a higher level
+ * bp must still verify the correct order of the layer below it.
+ */
+ if (BP_GET_LEVEL(bp) != 0) {
+ BP_SET_BYTEORDER(bp, 0);
+ BP_SET_COMPRESS(bp, 0);
+
+ /*
+ * psize cannot be set to zero or it will trigger
+ * asserts, but the value doesn't really matter as
+ * long as it is constant.
+ */
+ BP_SET_PSIZE(bp, SPA_MINBLOCKSIZE);
+ }
+
+ BP_SET_DEDUP(bp, 0);
+ BP_SET_CHECKSUM(bp, 0);
+}
+
+static void
+zio_crypt_bp_auth_init(uint64_t version, boolean_t should_bswap, blkptr_t *bp,
+ blkptr_auth_buf_t *bab, uint_t *bab_len)
+{
+ blkptr_t tmpbp = *bp;
+
+ if (should_bswap)
+ byteswap_uint64_array(&tmpbp, sizeof (blkptr_t));
+
+ ASSERT(BP_USES_CRYPT(&tmpbp) || BP_IS_HOLE(&tmpbp));
+ ASSERT0(BP_IS_EMBEDDED(&tmpbp));
+
+ zio_crypt_decode_mac_bp(&tmpbp, bab->bab_mac);
+
+ /*
+ * We always MAC blk_prop in LE to ensure portability. This
+	 * must be done after decoding the mac, since the non-portable
+	 * fields (including the byteorder bit) get zeroed out here.
+ */
+ zio_crypt_bp_zero_nonportable_blkprop(&tmpbp, version);
+ bab->bab_prop = LE_64(tmpbp.blk_prop);
+ bab->bab_pad = 0ULL;
+
+ /* version 0 did not include the padding */
+ *bab_len = sizeof (blkptr_auth_buf_t);
+ if (version == 0)
+ *bab_len -= sizeof (uint64_t);
+}
+
+static int
+zio_crypt_bp_do_hmac_updates(crypto_context_t ctx, uint64_t version,
+ boolean_t should_bswap, blkptr_t *bp)
+{
+ int ret;
+ uint_t bab_len;
+ blkptr_auth_buf_t bab;
+ crypto_data_t cd;
+
+ zio_crypt_bp_auth_init(version, should_bswap, bp, &bab, &bab_len);
+ cd.cd_format = CRYPTO_DATA_RAW;
+ cd.cd_offset = 0;
+ cd.cd_length = bab_len;
+ cd.cd_raw.iov_base = (char *)&bab;
+ cd.cd_raw.iov_len = cd.cd_length;
+
+ ret = crypto_mac_update(ctx, &cd, NULL);
+ if (ret != CRYPTO_SUCCESS) {
+ ret = SET_ERROR(EIO);
+ goto error;
+ }
+
+ return (0);
+
+error:
+ return (ret);
+}
+
+static void
+zio_crypt_bp_do_indirect_checksum_updates(SHA2_CTX *ctx, uint64_t version,
+ boolean_t should_bswap, blkptr_t *bp)
+{
+ uint_t bab_len;
+ blkptr_auth_buf_t bab;
+
+ zio_crypt_bp_auth_init(version, should_bswap, bp, &bab, &bab_len);
+ SHA2Update(ctx, &bab, bab_len);
+}
+
+static void
+zio_crypt_bp_do_aad_updates(uint8_t **aadp, uint_t *aad_len, uint64_t version,
+ boolean_t should_bswap, blkptr_t *bp)
+{
+ uint_t bab_len;
+ blkptr_auth_buf_t bab;
+
+ zio_crypt_bp_auth_init(version, should_bswap, bp, &bab, &bab_len);
+ bcopy(&bab, *aadp, bab_len);
+ *aadp += bab_len;
+ *aad_len += bab_len;
+}
+
+static int
+zio_crypt_do_dnode_hmac_updates(crypto_context_t ctx, uint64_t version,
+ boolean_t should_bswap, dnode_phys_t *dnp)
+{
+ int ret, i;
+ dnode_phys_t *adnp;
+ boolean_t le_bswap = (should_bswap == ZFS_HOST_BYTEORDER);
+ crypto_data_t cd;
+ uint8_t tmp_dncore[offsetof(dnode_phys_t, dn_blkptr)];
+
+ cd.cd_format = CRYPTO_DATA_RAW;
+ cd.cd_offset = 0;
+
+ /* authenticate the core dnode (masking out non-portable bits) */
+ bcopy(dnp, tmp_dncore, sizeof (tmp_dncore));
+ adnp = (dnode_phys_t *)tmp_dncore;
+ if (le_bswap) {
+ adnp->dn_datablkszsec = BSWAP_16(adnp->dn_datablkszsec);
+ adnp->dn_bonuslen = BSWAP_16(adnp->dn_bonuslen);
+ adnp->dn_maxblkid = BSWAP_64(adnp->dn_maxblkid);
+ adnp->dn_used = BSWAP_64(adnp->dn_used);
+ }
+ adnp->dn_flags &= DNODE_CRYPT_PORTABLE_FLAGS_MASK;
+ adnp->dn_used = 0;
+
+ cd.cd_length = sizeof (tmp_dncore);
+ cd.cd_raw.iov_base = (char *)adnp;
+ cd.cd_raw.iov_len = cd.cd_length;
+
+ ret = crypto_mac_update(ctx, &cd, NULL);
+ if (ret != CRYPTO_SUCCESS) {
+ ret = SET_ERROR(EIO);
+ goto error;
+ }
+
+ for (i = 0; i < dnp->dn_nblkptr; i++) {
+ ret = zio_crypt_bp_do_hmac_updates(ctx, version,
+ should_bswap, &dnp->dn_blkptr[i]);
+ if (ret != 0)
+ goto error;
+ }
+
+ if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
+ ret = zio_crypt_bp_do_hmac_updates(ctx, version,
+ should_bswap, DN_SPILL_BLKPTR(dnp));
+ if (ret != 0)
+ goto error;
+ }
+
+ return (0);
+
+error:
+ return (ret);
+}
+
+/*
+ * objset_phys_t blocks introduce a number of exceptions to the normal
+ * authentication process. objset_phys_t's contain 2 separate HMACS for
+ * protecting the integrity of their data. The portable_mac protects the
+ * metadnode. This MAC can be sent with a raw send and protects against
+ * reordering of data within the metadnode. The local_mac protects the user
+ * accounting objects which are not sent from one system to another.
+ *
+ * In addition, objset blocks are the only blocks that can be modified and
+ * written to disk without the key loaded under certain circumstances. During
+ * zil_claim() we need to be able to update the zil_header_t to complete
+ * claiming log blocks and during raw receives we need to write out the
+ * portable_mac from the send file. Both of these actions are possible
+ * because these fields are not protected by either MAC so neither one will
+ * need to modify the MACs without the key. However, when the modified blocks
+ * are written out they will be byteswapped into the host machine's native
+ * endianness which will modify fields protected by the MAC. As a result, MAC
+ * calculation for objset blocks works slightly differently from other block
+ * types. Where other block types MAC the data in whatever endianness is
+ * written to disk, objset blocks always MAC little endian version of their
+ * values. In the code, should_bswap is the value from BP_SHOULD_BYTESWAP()
+ * and le_bswap indicates whether a byteswap is needed to get this block
+ * into little endian format.
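+ *
+ * As an illustrative summary (not normative), the code below computes:
+ *
+ *	portable_mac = HMAC(os_type, portable os_flags, metadnode)
+ *	local_mac    = HMAC(non-portable os_flags, userused dnode,
+ *	    groupused dnode[, projectused dnode])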
+ */
+int
+zio_crypt_do_objset_hmacs(zio_crypt_key_t *key, void *data, uint_t datalen,
+ boolean_t should_bswap, uint8_t *portable_mac, uint8_t *local_mac)
+{
+ int ret;
+ crypto_mechanism_t mech;
+ crypto_context_t ctx;
+ crypto_data_t cd;
+ objset_phys_t *osp = data;
+ uint64_t intval;
+ boolean_t le_bswap = (should_bswap == ZFS_HOST_BYTEORDER);
+ uint8_t raw_portable_mac[SHA512_DIGEST_LENGTH];
+ uint8_t raw_local_mac[SHA512_DIGEST_LENGTH];
+
+ /* initialize HMAC mechanism */
+ mech.cm_type = crypto_mech2id(SUN_CKM_SHA512_HMAC);
+ mech.cm_param = NULL;
+ mech.cm_param_len = 0;
+
+ cd.cd_format = CRYPTO_DATA_RAW;
+ cd.cd_offset = 0;
+
+ /* calculate the portable MAC from the portable fields and metadnode */
+ ret = crypto_mac_init(&mech, &key->zk_hmac_key, NULL, &ctx, NULL);
+ if (ret != CRYPTO_SUCCESS) {
+ ret = SET_ERROR(EIO);
+ goto error;
+ }
+
+ /* add in the os_type */
+ intval = (le_bswap) ? osp->os_type : BSWAP_64(osp->os_type);
+ cd.cd_length = sizeof (uint64_t);
+ cd.cd_raw.iov_base = (char *)&intval;
+ cd.cd_raw.iov_len = cd.cd_length;
+
+ ret = crypto_mac_update(ctx, &cd, NULL);
+ if (ret != CRYPTO_SUCCESS) {
+ ret = SET_ERROR(EIO);
+ goto error;
+ }
+
+ /* add in the portable os_flags */
+ intval = osp->os_flags;
+ if (should_bswap)
+ intval = BSWAP_64(intval);
+ intval &= OBJSET_CRYPT_PORTABLE_FLAGS_MASK;
+ if (!ZFS_HOST_BYTEORDER)
+ intval = BSWAP_64(intval);
+
+ cd.cd_length = sizeof (uint64_t);
+ cd.cd_raw.iov_base = (char *)&intval;
+ cd.cd_raw.iov_len = cd.cd_length;
+
+ ret = crypto_mac_update(ctx, &cd, NULL);
+ if (ret != CRYPTO_SUCCESS) {
+ ret = SET_ERROR(EIO);
+ goto error;
+ }
+
+ /* add in fields from the metadnode */
+ ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version,
+ should_bswap, &osp->os_meta_dnode);
+ if (ret)
+ goto error;
+
+ /* store the final digest in a temporary buffer and copy what we need */
+ cd.cd_length = SHA512_DIGEST_LENGTH;
+ cd.cd_raw.iov_base = (char *)raw_portable_mac;
+ cd.cd_raw.iov_len = cd.cd_length;
+
+ ret = crypto_mac_final(ctx, &cd, NULL);
+ if (ret != CRYPTO_SUCCESS) {
+ ret = SET_ERROR(EIO);
+ goto error;
+ }
+
+ bcopy(raw_portable_mac, portable_mac, ZIO_OBJSET_MAC_LEN);
+
+ /*
+ * The local MAC protects the user, group and project accounting.
+ * If these objects are not present, the local MAC is zeroed out.
+ */
+ if ((datalen >= OBJSET_PHYS_SIZE_V3 &&
+ osp->os_userused_dnode.dn_type == DMU_OT_NONE &&
+ osp->os_groupused_dnode.dn_type == DMU_OT_NONE &&
+ osp->os_projectused_dnode.dn_type == DMU_OT_NONE) ||
+ (datalen >= OBJSET_PHYS_SIZE_V2 &&
+ osp->os_userused_dnode.dn_type == DMU_OT_NONE &&
+ osp->os_groupused_dnode.dn_type == DMU_OT_NONE) ||
+ (datalen <= OBJSET_PHYS_SIZE_V1)) {
+ bzero(local_mac, ZIO_OBJSET_MAC_LEN);
+ return (0);
+ }
+
+ /* calculate the local MAC from the userused and groupused dnodes */
+ ret = crypto_mac_init(&mech, &key->zk_hmac_key, NULL, &ctx, NULL);
+ if (ret != CRYPTO_SUCCESS) {
+ ret = SET_ERROR(EIO);
+ goto error;
+ }
+
+ /* add in the non-portable os_flags */
+ intval = osp->os_flags;
+ if (should_bswap)
+ intval = BSWAP_64(intval);
+ intval &= ~OBJSET_CRYPT_PORTABLE_FLAGS_MASK;
+ if (!ZFS_HOST_BYTEORDER)
+ intval = BSWAP_64(intval);
+
+ cd.cd_length = sizeof (uint64_t);
+ cd.cd_raw.iov_base = (char *)&intval;
+ cd.cd_raw.iov_len = cd.cd_length;
+
+ ret = crypto_mac_update(ctx, &cd, NULL);
+ if (ret != CRYPTO_SUCCESS) {
+ ret = SET_ERROR(EIO);
+ goto error;
+ }
+
+ /* add in fields from the user accounting dnodes */
+ if (osp->os_userused_dnode.dn_type != DMU_OT_NONE) {
+ ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version,
+ should_bswap, &osp->os_userused_dnode);
+ if (ret)
+ goto error;
+ }
+
+ if (osp->os_groupused_dnode.dn_type != DMU_OT_NONE) {
+ ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version,
+ should_bswap, &osp->os_groupused_dnode);
+ if (ret)
+ goto error;
+ }
+
+ if (osp->os_projectused_dnode.dn_type != DMU_OT_NONE &&
+ datalen >= OBJSET_PHYS_SIZE_V3) {
+ ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version,
+ should_bswap, &osp->os_projectused_dnode);
+ if (ret)
+ goto error;
+ }
+
+ /* store the final digest in a temporary buffer and copy what we need */
+ cd.cd_length = SHA512_DIGEST_LENGTH;
+ cd.cd_raw.iov_base = (char *)raw_local_mac;
+ cd.cd_raw.iov_len = cd.cd_length;
+
+ ret = crypto_mac_final(ctx, &cd, NULL);
+ if (ret != CRYPTO_SUCCESS) {
+ ret = SET_ERROR(EIO);
+ goto error;
+ }
+
+ bcopy(raw_local_mac, local_mac, ZIO_OBJSET_MAC_LEN);
+
+ return (0);
+
+error:
+ bzero(portable_mac, ZIO_OBJSET_MAC_LEN);
+ bzero(local_mac, ZIO_OBJSET_MAC_LEN);
+ return (ret);
+}
+
+static void
+zio_crypt_destroy_uio(uio_t *uio)
+{
+ if (uio->uio_iov)
+ kmem_free(uio->uio_iov, uio->uio_iovcnt * sizeof (iovec_t));
+}
+
+/*
+ * This function parses an uncompressed indirect block and returns a checksum
+ * of all the portable fields from all of the contained bps. The portable
+ * fields are the MAC and all of the fields from blk_prop except for the dedup,
+ * checksum, and psize bits. For an explanation of the purpose of this, see
+ * the comment block on object set authentication.
+ */
+static int
+zio_crypt_do_indirect_mac_checksum_impl(boolean_t generate, void *buf,
+ uint_t datalen, uint64_t version, boolean_t byteswap, uint8_t *cksum)
+{
+ blkptr_t *bp;
+ int i, epb = datalen >> SPA_BLKPTRSHIFT;
+ SHA2_CTX ctx;
+ uint8_t digestbuf[SHA512_DIGEST_LENGTH];
+
+ /* checksum all of the MACs from the layer below */
+ SHA2Init(SHA512, &ctx);
+ for (i = 0, bp = buf; i < epb; i++, bp++) {
+		zio_crypt_bp_do_indirect_checksum_updates(&ctx, version,
+ byteswap, bp);
+ }
+ SHA2Final(digestbuf, &ctx);
+
+ if (generate) {
+ bcopy(digestbuf, cksum, ZIO_DATA_MAC_LEN);
+ return (0);
+ }
+
+ if (bcmp(digestbuf, cksum, ZIO_DATA_MAC_LEN) != 0)
+ return (SET_ERROR(ECKSUM));
+
+ return (0);
+}
+
+int
+zio_crypt_do_indirect_mac_checksum(boolean_t generate, void *buf,
+ uint_t datalen, boolean_t byteswap, uint8_t *cksum)
+{
+ int ret;
+
+ /*
+ * Unfortunately, callers of this function will not always have
+ * easy access to the on-disk format version. This info is
+ * normally found in the DSL Crypto Key, but the checksum-of-MACs
+ * is expected to be verifiable even when the key isn't loaded.
+ * Here, instead of doing a ZAP lookup for the version for each
+ * zio, we simply try both existing formats.
+ */
+ ret = zio_crypt_do_indirect_mac_checksum_impl(generate, buf,
+ datalen, ZIO_CRYPT_KEY_CURRENT_VERSION, byteswap, cksum);
+ if (ret == ECKSUM) {
+ ASSERT(!generate);
+ ret = zio_crypt_do_indirect_mac_checksum_impl(generate,
+ buf, datalen, 0, byteswap, cksum);
+ }
+
+ return (ret);
+}
+
+int
+zio_crypt_do_indirect_mac_checksum_abd(boolean_t generate, abd_t *abd,
+ uint_t datalen, boolean_t byteswap, uint8_t *cksum)
+{
+ int ret;
+ void *buf;
+
+ buf = abd_borrow_buf_copy(abd, datalen);
+ ret = zio_crypt_do_indirect_mac_checksum(generate, buf, datalen,
+ byteswap, cksum);
+ abd_return_buf(abd, buf, datalen);
+
+ return (ret);
+}
+
+/*
+ * Special case handling routine for encrypting / decrypting ZIL blocks.
+ * We do not check for the older ZIL chain because the encryption feature
+ * was not available before the newer ZIL chain was introduced. The goal
+ * here is to encrypt everything except the blkptr_t of a lr_write_t and
+ * the zil_chain_t header. Everything that is not encrypted is authenticated.
+ */
+static int
+zio_crypt_init_uios_zil(boolean_t encrypt, uint8_t *plainbuf,
+ uint8_t *cipherbuf, uint_t datalen, boolean_t byteswap, uio_t *puio,
+ uio_t *cuio, uint_t *enc_len, uint8_t **authbuf, uint_t *auth_len,
+ boolean_t *no_crypt)
+{
+ int ret;
+ uint64_t txtype, lr_len;
+ uint_t nr_src, nr_dst, crypt_len;
+ uint_t aad_len = 0, nr_iovecs = 0, total_len = 0;
+ iovec_t *src_iovecs = NULL, *dst_iovecs = NULL;
+ uint8_t *src, *dst, *slrp, *dlrp, *blkend, *aadp;
+ zil_chain_t *zilc;
+ lr_t *lr;
+ uint8_t *aadbuf = zio_buf_alloc(datalen);
+
+ /* cipherbuf always needs an extra iovec for the MAC */
+ if (encrypt) {
+ src = plainbuf;
+ dst = cipherbuf;
+ nr_src = 0;
+ nr_dst = 1;
+ } else {
+ src = cipherbuf;
+ dst = plainbuf;
+ nr_src = 1;
+ nr_dst = 0;
+ }
+
+ /* find the start and end record of the log block */
+ zilc = (zil_chain_t *)src;
+ slrp = src + sizeof (zil_chain_t);
+ aadp = aadbuf;
+ blkend = src + ((byteswap) ? BSWAP_64(zilc->zc_nused) : zilc->zc_nused);
+
+ /* calculate the number of encrypted iovecs we will need */
+ for (; slrp < blkend; slrp += lr_len) {
+ lr = (lr_t *)slrp;
+
+ if (!byteswap) {
+ txtype = lr->lrc_txtype;
+ lr_len = lr->lrc_reclen;
+ } else {
+ txtype = BSWAP_64(lr->lrc_txtype);
+ lr_len = BSWAP_64(lr->lrc_reclen);
+ }
+
+ nr_iovecs++;
+ if (txtype == TX_WRITE && lr_len != sizeof (lr_write_t))
+ nr_iovecs++;
+ }
+
+ nr_src += nr_iovecs;
+ nr_dst += nr_iovecs;
+
+ /* allocate the iovec arrays */
+ if (nr_src != 0) {
+ src_iovecs = kmem_alloc(nr_src * sizeof (iovec_t), KM_SLEEP);
+ if (src_iovecs == NULL) {
+ ret = SET_ERROR(ENOMEM);
+ goto error;
+ }
+ }
+
+ if (nr_dst != 0) {
+ dst_iovecs = kmem_alloc(nr_dst * sizeof (iovec_t), KM_SLEEP);
+ if (dst_iovecs == NULL) {
+ ret = SET_ERROR(ENOMEM);
+ goto error;
+ }
+ }
+
+ /*
+ * Copy the plain zil header over and authenticate everything except
+ * the checksum that will store our MAC. If we are writing the data
+ * the embedded checksum will not have been calculated yet, so we don't
+ * authenticate that.
+ */
+ bcopy(src, dst, sizeof (zil_chain_t));
+ bcopy(src, aadp, sizeof (zil_chain_t) - sizeof (zio_eck_t));
+ aadp += sizeof (zil_chain_t) - sizeof (zio_eck_t);
+ aad_len += sizeof (zil_chain_t) - sizeof (zio_eck_t);
+
+ /* loop over records again, filling in iovecs */
+ nr_iovecs = 0;
+ slrp = src + sizeof (zil_chain_t);
+ dlrp = dst + sizeof (zil_chain_t);
+
+ for (; slrp < blkend; slrp += lr_len, dlrp += lr_len) {
+ lr = (lr_t *)slrp;
+
+ if (!byteswap) {
+ txtype = lr->lrc_txtype;
+ lr_len = lr->lrc_reclen;
+ } else {
+ txtype = BSWAP_64(lr->lrc_txtype);
+ lr_len = BSWAP_64(lr->lrc_reclen);
+ }
+
+ /* copy the common lr_t */
+ bcopy(slrp, dlrp, sizeof (lr_t));
+ bcopy(slrp, aadp, sizeof (lr_t));
+ aadp += sizeof (lr_t);
+ aad_len += sizeof (lr_t);
+
+ ASSERT3P(src_iovecs, !=, NULL);
+ ASSERT3P(dst_iovecs, !=, NULL);
+
+ /*
+ * If this is a TX_WRITE record we want to encrypt everything
+		 * except the bp, if one exists. If the bp does exist we want
+		 * to authenticate it.
+ */
+ if (txtype == TX_WRITE) {
+ crypt_len = sizeof (lr_write_t) -
+ sizeof (lr_t) - sizeof (blkptr_t);
+ src_iovecs[nr_iovecs].iov_base = slrp + sizeof (lr_t);
+ src_iovecs[nr_iovecs].iov_len = crypt_len;
+ dst_iovecs[nr_iovecs].iov_base = dlrp + sizeof (lr_t);
+ dst_iovecs[nr_iovecs].iov_len = crypt_len;
+
+ /* copy the bp now since it will not be encrypted */
+ bcopy(slrp + sizeof (lr_write_t) - sizeof (blkptr_t),
+ dlrp + sizeof (lr_write_t) - sizeof (blkptr_t),
+ sizeof (blkptr_t));
+ bcopy(slrp + sizeof (lr_write_t) - sizeof (blkptr_t),
+ aadp, sizeof (blkptr_t));
+ aadp += sizeof (blkptr_t);
+ aad_len += sizeof (blkptr_t);
+ nr_iovecs++;
+ total_len += crypt_len;
+
+ if (lr_len != sizeof (lr_write_t)) {
+ crypt_len = lr_len - sizeof (lr_write_t);
+ src_iovecs[nr_iovecs].iov_base =
+ slrp + sizeof (lr_write_t);
+ src_iovecs[nr_iovecs].iov_len = crypt_len;
+ dst_iovecs[nr_iovecs].iov_base =
+ dlrp + sizeof (lr_write_t);
+ dst_iovecs[nr_iovecs].iov_len = crypt_len;
+ nr_iovecs++;
+ total_len += crypt_len;
+ }
+ } else {
+ crypt_len = lr_len - sizeof (lr_t);
+ src_iovecs[nr_iovecs].iov_base = slrp + sizeof (lr_t);
+ src_iovecs[nr_iovecs].iov_len = crypt_len;
+ dst_iovecs[nr_iovecs].iov_base = dlrp + sizeof (lr_t);
+ dst_iovecs[nr_iovecs].iov_len = crypt_len;
+ nr_iovecs++;
+ total_len += crypt_len;
+ }
+ }
+
+ *no_crypt = (nr_iovecs == 0);
+ *enc_len = total_len;
+ *authbuf = aadbuf;
+ *auth_len = aad_len;
+
+ if (encrypt) {
+ puio->uio_iov = src_iovecs;
+ puio->uio_iovcnt = nr_src;
+ cuio->uio_iov = dst_iovecs;
+ cuio->uio_iovcnt = nr_dst;
+ } else {
+ puio->uio_iov = dst_iovecs;
+ puio->uio_iovcnt = nr_dst;
+ cuio->uio_iov = src_iovecs;
+ cuio->uio_iovcnt = nr_src;
+ }
+
+ return (0);
+
+error:
+ zio_buf_free(aadbuf, datalen);
+ if (src_iovecs != NULL)
+ kmem_free(src_iovecs, nr_src * sizeof (iovec_t));
+ if (dst_iovecs != NULL)
+ kmem_free(dst_iovecs, nr_dst * sizeof (iovec_t));
+
+ *enc_len = 0;
+ *authbuf = NULL;
+ *auth_len = 0;
+ *no_crypt = B_FALSE;
+ puio->uio_iov = NULL;
+ puio->uio_iovcnt = 0;
+ cuio->uio_iov = NULL;
+ cuio->uio_iovcnt = 0;
+ return (ret);
+}
+
+/*
+ * Special case handling routine for encrypting / decrypting dnode blocks.
+ */
+static int
+zio_crypt_init_uios_dnode(boolean_t encrypt, uint64_t version,
+ uint8_t *plainbuf, uint8_t *cipherbuf, uint_t datalen, boolean_t byteswap,
+ uio_t *puio, uio_t *cuio, uint_t *enc_len, uint8_t **authbuf,
+ uint_t *auth_len, boolean_t *no_crypt)
+{
+ int ret;
+ uint_t nr_src, nr_dst, crypt_len;
+ uint_t aad_len = 0, nr_iovecs = 0, total_len = 0;
+ uint_t i, j, max_dnp = datalen >> DNODE_SHIFT;
+ iovec_t *src_iovecs = NULL, *dst_iovecs = NULL;
+ uint8_t *src, *dst, *aadp;
+ dnode_phys_t *dnp, *adnp, *sdnp, *ddnp;
+ uint8_t *aadbuf = zio_buf_alloc(datalen);
+
+ if (encrypt) {
+ src = plainbuf;
+ dst = cipherbuf;
+ nr_src = 0;
+ nr_dst = 1;
+ } else {
+ src = cipherbuf;
+ dst = plainbuf;
+ nr_src = 1;
+ nr_dst = 0;
+ }
+
+ sdnp = (dnode_phys_t *)src;
+ ddnp = (dnode_phys_t *)dst;
+ aadp = aadbuf;
+
+ /*
+ * Count the number of iovecs we will need to do the encryption by
+ * counting the number of bonus buffers that need to be encrypted.
+ */
+ for (i = 0; i < max_dnp; i += sdnp[i].dn_extra_slots + 1) {
+ /*
+ * This block may still be byteswapped. However, all of the
+ * values we use are either uint8_t's (for which byteswapping
+		 * is a noop) or only compared against zero, which works
+		 * regardless of whether or not we byteswap.
+ */
+ if (sdnp[i].dn_type != DMU_OT_NONE &&
+ DMU_OT_IS_ENCRYPTED(sdnp[i].dn_bonustype) &&
+ sdnp[i].dn_bonuslen != 0) {
+ nr_iovecs++;
+ }
+ }
+
+ nr_src += nr_iovecs;
+ nr_dst += nr_iovecs;
+
+ if (nr_src != 0) {
+ src_iovecs = kmem_alloc(nr_src * sizeof (iovec_t), KM_SLEEP);
+ if (src_iovecs == NULL) {
+ ret = SET_ERROR(ENOMEM);
+ goto error;
+ }
+ }
+
+ if (nr_dst != 0) {
+ dst_iovecs = kmem_alloc(nr_dst * sizeof (iovec_t), KM_SLEEP);
+ if (dst_iovecs == NULL) {
+ ret = SET_ERROR(ENOMEM);
+ goto error;
+ }
+ }
+
+ nr_iovecs = 0;
+
+ /*
+ * Iterate through the dnodes again, this time filling in the uios
+ * we allocated earlier. We also concatenate any data we want to
+ * authenticate onto aadbuf.
+ */
+ for (i = 0; i < max_dnp; i += sdnp[i].dn_extra_slots + 1) {
+ dnp = &sdnp[i];
+
+ /* copy over the core fields and blkptrs (kept as plaintext) */
+ bcopy(dnp, &ddnp[i], (uint8_t *)DN_BONUS(dnp) - (uint8_t *)dnp);
+
+ if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
+ bcopy(DN_SPILL_BLKPTR(dnp), DN_SPILL_BLKPTR(&ddnp[i]),
+ sizeof (blkptr_t));
+ }
+
+ /*
+ * Handle authenticated data. We authenticate everything in
+ * the dnode that can be brought over when we do a raw send.
+ * This includes all of the core fields as well as the MACs
+ * stored in the bp checksums and all of the portable bits
+ * from blk_prop. We include the dnode padding here in case it
+ * ever gets used in the future. Some dn_flags and dn_used are
+		 * not portable, so we mask those values out of the
+ * authenticated data.
+ */
+ crypt_len = offsetof(dnode_phys_t, dn_blkptr);
+ bcopy(dnp, aadp, crypt_len);
+ adnp = (dnode_phys_t *)aadp;
+ adnp->dn_flags &= DNODE_CRYPT_PORTABLE_FLAGS_MASK;
+ adnp->dn_used = 0;
+ aadp += crypt_len;
+ aad_len += crypt_len;
+
+ for (j = 0; j < dnp->dn_nblkptr; j++) {
+ zio_crypt_bp_do_aad_updates(&aadp, &aad_len,
+ version, byteswap, &dnp->dn_blkptr[j]);
+ }
+
+ if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
+ zio_crypt_bp_do_aad_updates(&aadp, &aad_len,
+ version, byteswap, DN_SPILL_BLKPTR(dnp));
+ }
+
+ /*
+ * If this bonus buffer needs to be encrypted, we prepare an
+ * iovec_t. The encryption / decryption functions will fill
+ * this in for us with the encrypted or decrypted data.
+ * Otherwise we add the bonus buffer to the authenticated
+ * data buffer and copy it over to the destination. The
+ * encrypted iovec extends to DN_MAX_BONUS_LEN(dnp) so that
+ * we can guarantee alignment with the AES block size
+ * (128 bits).
+ */
+ crypt_len = DN_MAX_BONUS_LEN(dnp);
+ if (dnp->dn_type != DMU_OT_NONE &&
+ DMU_OT_IS_ENCRYPTED(dnp->dn_bonustype) &&
+ dnp->dn_bonuslen != 0) {
+ ASSERT3U(nr_iovecs, <, nr_src);
+ ASSERT3U(nr_iovecs, <, nr_dst);
+ ASSERT3P(src_iovecs, !=, NULL);
+ ASSERT3P(dst_iovecs, !=, NULL);
+ src_iovecs[nr_iovecs].iov_base = DN_BONUS(dnp);
+ src_iovecs[nr_iovecs].iov_len = crypt_len;
+ dst_iovecs[nr_iovecs].iov_base = DN_BONUS(&ddnp[i]);
+ dst_iovecs[nr_iovecs].iov_len = crypt_len;
+
+ nr_iovecs++;
+ total_len += crypt_len;
+ } else {
+ bcopy(DN_BONUS(dnp), DN_BONUS(&ddnp[i]), crypt_len);
+ bcopy(DN_BONUS(dnp), aadp, crypt_len);
+ aadp += crypt_len;
+ aad_len += crypt_len;
+ }
+ }
+
+ *no_crypt = (nr_iovecs == 0);
+ *enc_len = total_len;
+ *authbuf = aadbuf;
+ *auth_len = aad_len;
+
+ if (encrypt) {
+ puio->uio_iov = src_iovecs;
+ puio->uio_iovcnt = nr_src;
+ cuio->uio_iov = dst_iovecs;
+ cuio->uio_iovcnt = nr_dst;
+ } else {
+ puio->uio_iov = dst_iovecs;
+ puio->uio_iovcnt = nr_dst;
+ cuio->uio_iov = src_iovecs;
+ cuio->uio_iovcnt = nr_src;
+ }
+
+ return (0);
+
+error:
+ zio_buf_free(aadbuf, datalen);
+ if (src_iovecs != NULL)
+ kmem_free(src_iovecs, nr_src * sizeof (iovec_t));
+ if (dst_iovecs != NULL)
+ kmem_free(dst_iovecs, nr_dst * sizeof (iovec_t));
+
+ *enc_len = 0;
+ *authbuf = NULL;
+ *auth_len = 0;
+ *no_crypt = B_FALSE;
+ puio->uio_iov = NULL;
+ puio->uio_iovcnt = 0;
+ cuio->uio_iov = NULL;
+ cuio->uio_iovcnt = 0;
+ return (ret);
+}
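The function above sizes the iovec arrays with one walk over the dnodes, then fills them with a second walk that must apply the exact same predicate, or the ASSERTs on nr_src/nr_dst fire. A minimal user-space sketch of that count-then-fill discipline; every name in it is illustrative, nothing is from the ZFS sources:

#include <stdio.h>
#include <stdlib.h>
#include <sys/uio.h>

/*
 * Count-then-fill: size the iovec array in one pass, then populate it in
 * a second pass using the same predicate. item_t and wants_crypt() are
 * illustrative names only; they do not exist in the ZFS sources.
 */
typedef struct {
	int encrypted;
	char payload[16];
} item_t;

static int
wants_crypt(const item_t *it)
{
	return (it->encrypted);
}

int
main(void)
{
	item_t items[4] = {
		{ 1, "alpha" }, { 0, "bravo" }, { 1, "charlie" }, { 0, "delta" }
	};
	unsigned int nr = 0, i, j = 0;
	struct iovec *iov;

	for (i = 0; i < 4; i++)			/* pass 1: count */
		if (wants_crypt(&items[i]))
			nr++;

	iov = calloc(nr, sizeof (struct iovec));
	if (iov == NULL)
		return (1);

	for (i = 0; i < 4; i++) {		/* pass 2: fill, same predicate */
		if (!wants_crypt(&items[i]))
			continue;
		iov[j].iov_base = items[i].payload;
		iov[j].iov_len = sizeof (items[i].payload);
		j++;
	}

	printf("%u of 4 items need crypto\n", j);
	free(iov);
	return (0);
}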
+
+static int
+zio_crypt_init_uios_normal(boolean_t encrypt, uint8_t *plainbuf,
+ uint8_t *cipherbuf, uint_t datalen, uio_t *puio, uio_t *cuio,
+ uint_t *enc_len)
+{
+ int ret;
+ uint_t nr_plain = 1, nr_cipher = 2;
+ iovec_t *plain_iovecs = NULL, *cipher_iovecs = NULL;
+
+ /* allocate the iovecs for the plain and cipher data */
+ plain_iovecs = kmem_alloc(nr_plain * sizeof (iovec_t),
+ KM_SLEEP);
+ if (!plain_iovecs) {
+ ret = SET_ERROR(ENOMEM);
+ goto error;
+ }
+
+ cipher_iovecs = kmem_alloc(nr_cipher * sizeof (iovec_t),
+ KM_SLEEP);
+ if (!cipher_iovecs) {
+ ret = SET_ERROR(ENOMEM);
+ goto error;
+ }
+
+ plain_iovecs[0].iov_base = plainbuf;
+ plain_iovecs[0].iov_len = datalen;
+ cipher_iovecs[0].iov_base = cipherbuf;
+ cipher_iovecs[0].iov_len = datalen;
+
+ *enc_len = datalen;
+ puio->uio_iov = plain_iovecs;
+ puio->uio_iovcnt = nr_plain;
+ cuio->uio_iov = cipher_iovecs;
+ cuio->uio_iovcnt = nr_cipher;
+
+ return (0);
+
+error:
+ if (plain_iovecs != NULL)
+ kmem_free(plain_iovecs, nr_plain * sizeof (iovec_t));
+ if (cipher_iovecs != NULL)
+ kmem_free(cipher_iovecs, nr_cipher * sizeof (iovec_t));
+
+ *enc_len = 0;
+ puio->uio_iov = NULL;
+ puio->uio_iovcnt = 0;
+ cuio->uio_iov = NULL;
+ cuio->uio_iovcnt = 0;
+ return (ret);
+}
+
+/*
+ * This function builds up the plaintext (puio) and ciphertext (cuio) uios so
+ * that they can be used for encryption and decryption by zio_do_crypt_uio().
+ * Most blocks will use zio_crypt_init_uios_normal(), with ZIL and dnode blocks
+ * requiring special handling to parse out pieces that are to be encrypted. The
+ * authbuf is used by these special cases to store additional authenticated
+ * data (AAD) for the encryption modes.
+ */
+static int
+zio_crypt_init_uios(boolean_t encrypt, uint64_t version, dmu_object_type_t ot,
+ uint8_t *plainbuf, uint8_t *cipherbuf, uint_t datalen, boolean_t byteswap,
+ uint8_t *mac, uio_t *puio, uio_t *cuio, uint_t *enc_len, uint8_t **authbuf,
+ uint_t *auth_len, boolean_t *no_crypt)
+{
+ int ret;
+ iovec_t *mac_iov;
+
+ ASSERT(DMU_OT_IS_ENCRYPTED(ot) || ot == DMU_OT_NONE);
+
+ /* route to handler */
+ switch (ot) {
+ case DMU_OT_INTENT_LOG:
+ ret = zio_crypt_init_uios_zil(encrypt, plainbuf, cipherbuf,
+ datalen, byteswap, puio, cuio, enc_len, authbuf, auth_len,
+ no_crypt);
+ break;
+ case DMU_OT_DNODE:
+ ret = zio_crypt_init_uios_dnode(encrypt, version, plainbuf,
+ cipherbuf, datalen, byteswap, puio, cuio, enc_len, authbuf,
+ auth_len, no_crypt);
+ break;
+ default:
+ ret = zio_crypt_init_uios_normal(encrypt, plainbuf, cipherbuf,
+ datalen, puio, cuio, enc_len);
+ *authbuf = NULL;
+ *auth_len = 0;
+ *no_crypt = B_FALSE;
+ break;
+ }
+
+ if (ret != 0)
+ goto error;
+
+ /* populate the uios */
+ puio->uio_segflg = UIO_SYSSPACE;
+ cuio->uio_segflg = UIO_SYSSPACE;
+
+ mac_iov = ((iovec_t *)&cuio->uio_iov[cuio->uio_iovcnt - 1]);
+ mac_iov->iov_base = mac;
+ mac_iov->iov_len = ZIO_DATA_MAC_LEN;
+
+ return (0);
+
+error:
+ return (ret);
+}
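The ciphertext uio arriving here always carries one spare trailing iovec (nr_cipher = 2 in the normal case), which the code above points at the caller's MAC buffer so the MAC is read or written in the same pass as the data. A fragment mirroring just that final step; ZIO_DATA_MAC_LEN is assumed to match the 16-byte value defined in the zfs headers:

#include <sys/uio.h>

#define	ZIO_DATA_MAC_LEN	16	/* assumed; defined in the zfs headers */

/*
 * Mirror of the mac_iov assignment in zio_crypt_init_uios(): the last
 * iovec of the ciphertext uio carries the MAC, everything before it
 * carries ciphertext. A fragment, not a standalone program.
 */
static void
attach_mac_iov(struct iovec *iov, int iovcnt, void *mac)
{
	iov[iovcnt - 1].iov_base = mac;
	iov[iovcnt - 1].iov_len = ZIO_DATA_MAC_LEN;
}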
+
+/*
+ * Primary encryption / decryption entrypoint for zio data.
+ */
+int
+zio_do_crypt_data(boolean_t encrypt, zio_crypt_key_t *key,
+ dmu_object_type_t ot, boolean_t byteswap, uint8_t *salt, uint8_t *iv,
+ uint8_t *mac, uint_t datalen, uint8_t *plainbuf, uint8_t *cipherbuf,
+ boolean_t *no_crypt)
+{
+ int ret;
+ boolean_t locked = B_FALSE;
+ uint64_t crypt = key->zk_crypt;
+ uint_t keydata_len = zio_crypt_table[crypt].ci_keylen;
+ uint_t enc_len, auth_len;
+ uio_t puio, cuio;
+ uint8_t enc_keydata[MASTER_KEY_MAX_LEN];
+ crypto_key_t tmp_ckey, *ckey = NULL;
+ crypto_ctx_template_t tmpl;
+ uint8_t *authbuf = NULL;
+
+ /*
+ * If the needed key is the current one, just use it. Otherwise we
+ * need to generate a temporary one from the given salt + master key.
+ * If we are encrypting, we must return a copy of the current salt
+ * so that it can be stored in the blkptr_t.
+ */
+ rw_enter(&key->zk_salt_lock, RW_READER);
+ locked = B_TRUE;
+
+ if (bcmp(salt, key->zk_salt, ZIO_DATA_SALT_LEN) == 0) {
+ ckey = &key->zk_current_key;
+ tmpl = key->zk_current_tmpl;
+ } else {
+ rw_exit(&key->zk_salt_lock);
+ locked = B_FALSE;
+
+ ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0,
+ salt, ZIO_DATA_SALT_LEN, enc_keydata, keydata_len);
+ if (ret != 0)
+ goto error;
+
+ tmp_ckey.ck_format = CRYPTO_KEY_RAW;
+ tmp_ckey.ck_data = enc_keydata;
+ tmp_ckey.ck_length = CRYPTO_BYTES2BITS(keydata_len);
+
+ ckey = &tmp_ckey;
+ tmpl = NULL;
+ }
+
+ /*
+ * Attempt to use QAT acceleration if we can. We currently don't
+ * do this for metadnode and ZIL blocks, since they have a much
+ * more involved buffer layout and the qat_crypt() function only
+ * works in-place.
+ */
+ if (qat_crypt_use_accel(datalen) &&
+ ot != DMU_OT_INTENT_LOG && ot != DMU_OT_DNODE) {
+ uint8_t *srcbuf, *dstbuf;
+
+ if (encrypt) {
+ srcbuf = plainbuf;
+ dstbuf = cipherbuf;
+ } else {
+ srcbuf = cipherbuf;
+ dstbuf = plainbuf;
+ }
+
+ ret = qat_crypt((encrypt) ? QAT_ENCRYPT : QAT_DECRYPT, srcbuf,
+ dstbuf, NULL, 0, iv, mac, ckey, key->zk_crypt, datalen);
+ if (ret == CPA_STATUS_SUCCESS) {
+ if (locked) {
+ rw_exit(&key->zk_salt_lock);
+ locked = B_FALSE;
+ }
+
+ return (0);
+ }
+		/* If the hardware implementation fails, fall back to software */
+ }
+
+ bzero(&puio, sizeof (uio_t));
+ bzero(&cuio, sizeof (uio_t));
+
+ /* create uios for encryption */
+ ret = zio_crypt_init_uios(encrypt, key->zk_version, ot, plainbuf,
+ cipherbuf, datalen, byteswap, mac, &puio, &cuio, &enc_len,
+ &authbuf, &auth_len, no_crypt);
+ if (ret != 0)
+ goto error;
+
+ /* perform the encryption / decryption in software */
+ ret = zio_do_crypt_uio(encrypt, key->zk_crypt, ckey, tmpl, iv, enc_len,
+ &puio, &cuio, authbuf, auth_len);
+ if (ret != 0)
+ goto error;
+
+ if (locked) {
+ rw_exit(&key->zk_salt_lock);
+ locked = B_FALSE;
+ }
+
+ if (authbuf != NULL)
+ zio_buf_free(authbuf, datalen);
+ if (ckey == &tmp_ckey)
+ bzero(enc_keydata, keydata_len);
+ zio_crypt_destroy_uio(&puio);
+ zio_crypt_destroy_uio(&cuio);
+
+ return (0);
+
+error:
+ if (locked)
+ rw_exit(&key->zk_salt_lock);
+ if (authbuf != NULL)
+ zio_buf_free(authbuf, datalen);
+ if (ckey == &tmp_ckey)
+ bzero(enc_keydata, keydata_len);
+ zio_crypt_destroy_uio(&puio);
+ zio_crypt_destroy_uio(&cuio);
+
+ return (ret);
+}
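The key-selection logic above boils down to: reuse the pre-expanded current key when the block's salt matches, otherwise derive a one-shot key from the master key and the salt via hkdf_sha512(). A condensed user-space sketch of that fast path; the types, lengths, and the derive_key() stand-in are illustrative, not the kernel API:

#include <stddef.h>
#include <string.h>

#define	SALT_LEN	8	/* assumed; ZIO_DATA_SALT_LEN upstream */
#define	KEY_LEN		32

typedef struct {
	unsigned char salt[SALT_LEN];	/* salt the current key came from */
	unsigned char master[KEY_LEN];
	unsigned char current[KEY_LEN];	/* pre-expanded per-salt key */
} sketch_key_t;

/* Stand-in for hkdf_sha512(); NOT real key derivation. */
static int
derive_key(const unsigned char *master, const unsigned char *salt,
    unsigned char *out)
{
	for (size_t i = 0; i < KEY_LEN; i++)
		out[i] = master[i] ^ salt[i % SALT_LEN];
	return (0);
}

static int
select_key(sketch_key_t *k, const unsigned char *salt, unsigned char *out)
{
	/* fast path: salt matches, the cached expanded key is valid */
	if (memcmp(salt, k->salt, SALT_LEN) == 0) {
		memcpy(out, k->current, KEY_LEN);
		return (0);
	}
	/* slow path: one-shot key, caller must zero it after use */
	return (derive_key(k->master, salt, out));
}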
+
+/*
+ * Simple wrapper around zio_do_crypt_data() to work with abd's instead of
+ * linear buffers.
+ */
+int
+zio_do_crypt_abd(boolean_t encrypt, zio_crypt_key_t *key, dmu_object_type_t ot,
+ boolean_t byteswap, uint8_t *salt, uint8_t *iv, uint8_t *mac,
+ uint_t datalen, abd_t *pabd, abd_t *cabd, boolean_t *no_crypt)
+{
+ int ret;
+ void *ptmp, *ctmp;
+
+ if (encrypt) {
+ ptmp = abd_borrow_buf_copy(pabd, datalen);
+ ctmp = abd_borrow_buf(cabd, datalen);
+ } else {
+ ptmp = abd_borrow_buf(pabd, datalen);
+ ctmp = abd_borrow_buf_copy(cabd, datalen);
+ }
+
+ ret = zio_do_crypt_data(encrypt, key, ot, byteswap, salt, iv, mac,
+ datalen, ptmp, ctmp, no_crypt);
+ if (ret != 0)
+ goto error;
+
+ if (encrypt) {
+ abd_return_buf(pabd, ptmp, datalen);
+ abd_return_buf_copy(cabd, ctmp, datalen);
+ } else {
+ abd_return_buf_copy(pabd, ptmp, datalen);
+ abd_return_buf(cabd, ctmp, datalen);
+ }
+
+ return (0);
+
+error:
+ if (encrypt) {
+ abd_return_buf(pabd, ptmp, datalen);
+ abd_return_buf_copy(cabd, ctmp, datalen);
+ } else {
+ abd_return_buf_copy(pabd, ptmp, datalen);
+ abd_return_buf(cabd, ctmp, datalen);
+ }
+
+ return (ret);
+}
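The borrow/return calls pair up by data direction: the abd supplying input is borrowed with the _copy variant (so the linear buffer starts out holding its contents) and returned plain, while the output abd is borrowed plain and returned with _copy so the results are published back. A generic sketch of that discipline with a made-up handle type, not the real abd API:

#include <stdlib.h>
#include <string.h>

/* Made-up handle type; stands in for abd_t. Fragments, not a program. */
typedef struct { unsigned char bytes[64]; } handle_t;

/* Input side: seed the borrowed linear buffer with the handle's contents. */
static void *
borrow_copy(handle_t *h, size_t n)
{
	void *p = malloc(n);

	if (p != NULL)
		memcpy(p, h->bytes, n);
	return (p);
}

/* Output side: contents don't matter yet, just hand out a buffer. */
static void *
borrow_plain(handle_t *h, size_t n)
{
	(void) h;
	return (malloc(n));
}

/* Input side on return: nothing to publish, just release. */
static void
return_plain(handle_t *h, void *p)
{
	(void) h;
	free(p);
}

/* Output side on return: publish the results, then release. */
static void
return_copy(handle_t *h, void *p, size_t n)
{
	memcpy(h->bytes, p, n);
	free(p);
}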
+
+#if defined(_KERNEL)
+/* BEGIN CSTYLED */
+module_param(zfs_key_max_salt_uses, ulong, 0644);
+MODULE_PARM_DESC(zfs_key_max_salt_uses, "Max number of times a salt value "
+ "can be used for generating encryption keys before it is rotated");
+/* END CSTYLED */
+#endif
diff --git a/module/os/linux/zfs/zpl_ctldir.c b/module/os/linux/zfs/zpl_ctldir.c
new file mode 100644
index 000000000..6df367b81
--- /dev/null
+++ b/module/os/linux/zfs/zpl_ctldir.c
@@ -0,0 +1,572 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (C) 2011 Lawrence Livermore National Security, LLC.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * LLNL-CODE-403049.
+ * Rewritten for Linux by:
+ * Rohan Puri <[email protected]>
+ * Brian Behlendorf <[email protected]>
+ */
+
+#include <sys/zfs_vfsops.h>
+#include <sys/zfs_vnops.h>
+#include <sys/zfs_znode.h>
+#include <sys/zfs_ctldir.h>
+#include <sys/zpl.h>
+
+/*
+ * Common open routine. Disallow any write access.
+ */
+/* ARGSUSED */
+static int
+zpl_common_open(struct inode *ip, struct file *filp)
+{
+ if (filp->f_mode & FMODE_WRITE)
+ return (-EACCES);
+
+ return (generic_file_open(ip, filp));
+}
+
+/*
+ * Get root directory contents.
+ */
+static int
+zpl_root_iterate(struct file *filp, zpl_dir_context_t *ctx)
+{
+ zfsvfs_t *zfsvfs = ITOZSB(file_inode(filp));
+ int error = 0;
+
+ ZFS_ENTER(zfsvfs);
+
+ if (!zpl_dir_emit_dots(filp, ctx))
+ goto out;
+
+ if (ctx->pos == 2) {
+ if (!zpl_dir_emit(ctx, ZFS_SNAPDIR_NAME,
+ strlen(ZFS_SNAPDIR_NAME), ZFSCTL_INO_SNAPDIR, DT_DIR))
+ goto out;
+
+ ctx->pos++;
+ }
+
+ if (ctx->pos == 3) {
+ if (!zpl_dir_emit(ctx, ZFS_SHAREDIR_NAME,
+ strlen(ZFS_SHAREDIR_NAME), ZFSCTL_INO_SHARES, DT_DIR))
+ goto out;
+
+ ctx->pos++;
+ }
+out:
+ ZFS_EXIT(zfsvfs);
+
+ return (error);
+}
+
+#if !defined(HAVE_VFS_ITERATE) && !defined(HAVE_VFS_ITERATE_SHARED)
+static int
+zpl_root_readdir(struct file *filp, void *dirent, filldir_t filldir)
+{
+ zpl_dir_context_t ctx =
+ ZPL_DIR_CONTEXT_INIT(dirent, filldir, filp->f_pos);
+ int error;
+
+ error = zpl_root_iterate(filp, &ctx);
+ filp->f_pos = ctx.pos;
+
+ return (error);
+}
+#endif /* !HAVE_VFS_ITERATE && !HAVE_VFS_ITERATE_SHARED */
+
+/*
+ * Get root directory attributes.
+ */
+/* ARGSUSED */
+static int
+zpl_root_getattr_impl(const struct path *path, struct kstat *stat,
+ u32 request_mask, unsigned int query_flags)
+{
+ struct inode *ip = path->dentry->d_inode;
+
+ generic_fillattr(ip, stat);
+ stat->atime = current_time(ip);
+
+ return (0);
+}
+ZPL_GETATTR_WRAPPER(zpl_root_getattr);
+
+static struct dentry *
+#ifdef HAVE_LOOKUP_NAMEIDATA
+zpl_root_lookup(struct inode *dip, struct dentry *dentry, struct nameidata *nd)
+#else
+zpl_root_lookup(struct inode *dip, struct dentry *dentry, unsigned int flags)
+#endif
+{
+ cred_t *cr = CRED();
+ struct inode *ip;
+ int error;
+
+ crhold(cr);
+ error = -zfsctl_root_lookup(dip, dname(dentry), &ip, 0, cr, NULL, NULL);
+ ASSERT3S(error, <=, 0);
+ crfree(cr);
+
+ if (error) {
+ if (error == -ENOENT)
+ return (d_splice_alias(NULL, dentry));
+ else
+ return (ERR_PTR(error));
+ }
+
+ return (d_splice_alias(ip, dentry));
+}
+
+/*
+ * The '.zfs' control directory file and inode operations.
+ */
+const struct file_operations zpl_fops_root = {
+ .open = zpl_common_open,
+ .llseek = generic_file_llseek,
+ .read = generic_read_dir,
+#ifdef HAVE_VFS_ITERATE_SHARED
+ .iterate_shared = zpl_root_iterate,
+#elif defined(HAVE_VFS_ITERATE)
+ .iterate = zpl_root_iterate,
+#else
+ .readdir = zpl_root_readdir,
+#endif
+};
+
+const struct inode_operations zpl_ops_root = {
+ .lookup = zpl_root_lookup,
+ .getattr = zpl_root_getattr,
+};
+
+#ifdef HAVE_AUTOMOUNT
+static struct vfsmount *
+zpl_snapdir_automount(struct path *path)
+{
+ int error;
+
+ error = -zfsctl_snapshot_mount(path, 0);
+ if (error)
+ return (ERR_PTR(error));
+
+ /*
+ * Rather than returning the new vfsmount for the snapshot we must
+ * return NULL to indicate a mount collision. This is done because
+ * the user space mount calls do_add_mount() which adds the vfsmount
+ * to the name space. If we returned the new mount here it would be
+ * added again to the vfsmount list resulting in list corruption.
+ */
+ return (NULL);
+}
+#endif /* HAVE_AUTOMOUNT */
+
+/*
+ * Negative dentries must always be revalidated so newly created snapshots
+ * can be detected and automounted. Normal dentries should be kept because
+ * as of the 3.18 kernel revalidating the mountpoint dentry will result in
+ * the snapshot being immediately unmounted.
+ */
+static int
+#ifdef HAVE_D_REVALIDATE_NAMEIDATA
+zpl_snapdir_revalidate(struct dentry *dentry, struct nameidata *i)
+#else
+zpl_snapdir_revalidate(struct dentry *dentry, unsigned int flags)
+#endif
+{
+ return (!!dentry->d_inode);
+}
+
+dentry_operations_t zpl_dops_snapdirs = {
+/*
+ * Auto mounting of snapshots is only supported for 2.6.37 and
+ * newer kernels. Prior to this kernel the ops->follow_link()
+ * callback was used as a hack to trigger the mount. The
+ * resulting vfsmount was then explicitly grafted in to the
+ * name space. While it might be possible to add compatibility
+ * code to accomplish this it would require considerable care.
+ */
+#ifdef HAVE_AUTOMOUNT
+ .d_automount = zpl_snapdir_automount,
+#endif /* HAVE_AUTOMOUNT */
+ .d_revalidate = zpl_snapdir_revalidate,
+};
+
+static struct dentry *
+#ifdef HAVE_LOOKUP_NAMEIDATA
+zpl_snapdir_lookup(struct inode *dip, struct dentry *dentry,
+ struct nameidata *nd)
+#else
+zpl_snapdir_lookup(struct inode *dip, struct dentry *dentry,
+ unsigned int flags)
+#endif
+
+{
+ fstrans_cookie_t cookie;
+ cred_t *cr = CRED();
+ struct inode *ip = NULL;
+ int error;
+
+ crhold(cr);
+ cookie = spl_fstrans_mark();
+ error = -zfsctl_snapdir_lookup(dip, dname(dentry), &ip,
+ 0, cr, NULL, NULL);
+ ASSERT3S(error, <=, 0);
+ spl_fstrans_unmark(cookie);
+ crfree(cr);
+
+ if (error && error != -ENOENT)
+ return (ERR_PTR(error));
+
+ ASSERT(error == 0 || ip == NULL);
+ d_clear_d_op(dentry);
+ d_set_d_op(dentry, &zpl_dops_snapdirs);
+#ifdef HAVE_AUTOMOUNT
+ dentry->d_flags |= DCACHE_NEED_AUTOMOUNT;
+#endif
+
+ return (d_splice_alias(ip, dentry));
+}
+
+static int
+zpl_snapdir_iterate(struct file *filp, zpl_dir_context_t *ctx)
+{
+ zfsvfs_t *zfsvfs = ITOZSB(file_inode(filp));
+ fstrans_cookie_t cookie;
+ char snapname[MAXNAMELEN];
+ boolean_t case_conflict;
+ uint64_t id, pos;
+ int error = 0;
+
+ ZFS_ENTER(zfsvfs);
+ cookie = spl_fstrans_mark();
+
+ if (!zpl_dir_emit_dots(filp, ctx))
+ goto out;
+
+ pos = ctx->pos;
+ while (error == 0) {
+ dsl_pool_config_enter(dmu_objset_pool(zfsvfs->z_os), FTAG);
+ error = -dmu_snapshot_list_next(zfsvfs->z_os, MAXNAMELEN,
+ snapname, &id, &pos, &case_conflict);
+ dsl_pool_config_exit(dmu_objset_pool(zfsvfs->z_os), FTAG);
+ if (error)
+ goto out;
+
+ if (!zpl_dir_emit(ctx, snapname, strlen(snapname),
+ ZFSCTL_INO_SHARES - id, DT_DIR))
+ goto out;
+
+ ctx->pos = pos;
+ }
+out:
+ spl_fstrans_unmark(cookie);
+ ZFS_EXIT(zfsvfs);
+
+ if (error == -ENOENT)
+ return (0);
+
+ return (error);
+}
+
+#if !defined(HAVE_VFS_ITERATE) && !defined(HAVE_VFS_ITERATE_SHARED)
+static int
+zpl_snapdir_readdir(struct file *filp, void *dirent, filldir_t filldir)
+{
+ zpl_dir_context_t ctx =
+ ZPL_DIR_CONTEXT_INIT(dirent, filldir, filp->f_pos);
+ int error;
+
+ error = zpl_snapdir_iterate(filp, &ctx);
+ filp->f_pos = ctx.pos;
+
+ return (error);
+}
+#endif /* !HAVE_VFS_ITERATE && !HAVE_VFS_ITERATE_SHARED */
+
+static int
+zpl_snapdir_rename2(struct inode *sdip, struct dentry *sdentry,
+ struct inode *tdip, struct dentry *tdentry, unsigned int flags)
+{
+ cred_t *cr = CRED();
+ int error;
+
+ /* We probably don't want to support renameat2(2) in ctldir */
+ if (flags)
+ return (-EINVAL);
+
+ crhold(cr);
+ error = -zfsctl_snapdir_rename(sdip, dname(sdentry),
+ tdip, dname(tdentry), cr, 0);
+ ASSERT3S(error, <=, 0);
+ crfree(cr);
+
+ return (error);
+}
+
+#ifndef HAVE_RENAME_WANTS_FLAGS
+static int
+zpl_snapdir_rename(struct inode *sdip, struct dentry *sdentry,
+ struct inode *tdip, struct dentry *tdentry)
+{
+ return (zpl_snapdir_rename2(sdip, sdentry, tdip, tdentry, 0));
+}
+#endif
+
+static int
+zpl_snapdir_rmdir(struct inode *dip, struct dentry *dentry)
+{
+ cred_t *cr = CRED();
+ int error;
+
+ crhold(cr);
+ error = -zfsctl_snapdir_remove(dip, dname(dentry), cr, 0);
+ ASSERT3S(error, <=, 0);
+ crfree(cr);
+
+ return (error);
+}
+
+static int
+zpl_snapdir_mkdir(struct inode *dip, struct dentry *dentry, zpl_umode_t mode)
+{
+ cred_t *cr = CRED();
+ vattr_t *vap;
+ struct inode *ip;
+ int error;
+
+ crhold(cr);
+ vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP);
+ zpl_vap_init(vap, dip, mode | S_IFDIR, cr);
+
+ error = -zfsctl_snapdir_mkdir(dip, dname(dentry), vap, &ip, cr, 0);
+ if (error == 0) {
+ d_clear_d_op(dentry);
+ d_set_d_op(dentry, &zpl_dops_snapdirs);
+ d_instantiate(dentry, ip);
+ }
+
+ kmem_free(vap, sizeof (vattr_t));
+ ASSERT3S(error, <=, 0);
+ crfree(cr);
+
+ return (error);
+}
+
+/*
+ * Get snapshot directory attributes.
+ */
+/* ARGSUSED */
+static int
+zpl_snapdir_getattr_impl(const struct path *path, struct kstat *stat,
+ u32 request_mask, unsigned int query_flags)
+{
+ struct inode *ip = path->dentry->d_inode;
+ zfsvfs_t *zfsvfs = ITOZSB(ip);
+
+ ZFS_ENTER(zfsvfs);
+ generic_fillattr(ip, stat);
+
+ stat->nlink = stat->size = 2;
+ stat->ctime = stat->mtime = dmu_objset_snap_cmtime(zfsvfs->z_os);
+ stat->atime = current_time(ip);
+ ZFS_EXIT(zfsvfs);
+
+ return (0);
+}
+ZPL_GETATTR_WRAPPER(zpl_snapdir_getattr);
+
+/*
+ * The '.zfs/snapshot' directory file operations. These mainly control
+ * generating the list of available snapshots when doing an 'ls' in the
+ * directory. See zpl_snapdir_readdir().
+ */
+const struct file_operations zpl_fops_snapdir = {
+ .open = zpl_common_open,
+ .llseek = generic_file_llseek,
+ .read = generic_read_dir,
+#ifdef HAVE_VFS_ITERATE_SHARED
+ .iterate_shared = zpl_snapdir_iterate,
+#elif defined(HAVE_VFS_ITERATE)
+ .iterate = zpl_snapdir_iterate,
+#else
+ .readdir = zpl_snapdir_readdir,
+#endif
+};
+
+/*
+ * The '.zfs/snapshot' directory inode operations. These mainly control
+ * creating an inode for a snapshot directory and initializing the needed
+ * infrastructure to automount the snapshot. See zpl_snapdir_lookup().
+ */
+const struct inode_operations zpl_ops_snapdir = {
+ .lookup = zpl_snapdir_lookup,
+ .getattr = zpl_snapdir_getattr,
+#ifdef HAVE_RENAME_WANTS_FLAGS
+ .rename = zpl_snapdir_rename2,
+#else
+ .rename = zpl_snapdir_rename,
+#endif
+ .rmdir = zpl_snapdir_rmdir,
+ .mkdir = zpl_snapdir_mkdir,
+};
+
+static struct dentry *
+#ifdef HAVE_LOOKUP_NAMEIDATA
+zpl_shares_lookup(struct inode *dip, struct dentry *dentry,
+ struct nameidata *nd)
+#else
+zpl_shares_lookup(struct inode *dip, struct dentry *dentry,
+ unsigned int flags)
+#endif
+{
+ fstrans_cookie_t cookie;
+ cred_t *cr = CRED();
+ struct inode *ip = NULL;
+ int error;
+
+ crhold(cr);
+ cookie = spl_fstrans_mark();
+ error = -zfsctl_shares_lookup(dip, dname(dentry), &ip,
+ 0, cr, NULL, NULL);
+ ASSERT3S(error, <=, 0);
+ spl_fstrans_unmark(cookie);
+ crfree(cr);
+
+ if (error) {
+ if (error == -ENOENT)
+ return (d_splice_alias(NULL, dentry));
+ else
+ return (ERR_PTR(error));
+ }
+
+ return (d_splice_alias(ip, dentry));
+}
+
+static int
+zpl_shares_iterate(struct file *filp, zpl_dir_context_t *ctx)
+{
+ fstrans_cookie_t cookie;
+ cred_t *cr = CRED();
+ zfsvfs_t *zfsvfs = ITOZSB(file_inode(filp));
+ znode_t *dzp;
+ int error = 0;
+
+ ZFS_ENTER(zfsvfs);
+ cookie = spl_fstrans_mark();
+
+ if (zfsvfs->z_shares_dir == 0) {
+ zpl_dir_emit_dots(filp, ctx);
+ goto out;
+ }
+
+ error = -zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp);
+ if (error)
+ goto out;
+
+ crhold(cr);
+ error = -zfs_readdir(ZTOI(dzp), ctx, cr);
+ crfree(cr);
+
+ iput(ZTOI(dzp));
+out:
+ spl_fstrans_unmark(cookie);
+ ZFS_EXIT(zfsvfs);
+ ASSERT3S(error, <=, 0);
+
+ return (error);
+}
+
+#if !defined(HAVE_VFS_ITERATE) && !defined(HAVE_VFS_ITERATE_SHARED)
+static int
+zpl_shares_readdir(struct file *filp, void *dirent, filldir_t filldir)
+{
+ zpl_dir_context_t ctx =
+ ZPL_DIR_CONTEXT_INIT(dirent, filldir, filp->f_pos);
+ int error;
+
+ error = zpl_shares_iterate(filp, &ctx);
+ filp->f_pos = ctx.pos;
+
+ return (error);
+}
+#endif /* !HAVE_VFS_ITERATE && !HAVE_VFS_ITERATE_SHARED */
+
+/* ARGSUSED */
+static int
+zpl_shares_getattr_impl(const struct path *path, struct kstat *stat,
+ u32 request_mask, unsigned int query_flags)
+{
+ struct inode *ip = path->dentry->d_inode;
+ zfsvfs_t *zfsvfs = ITOZSB(ip);
+ znode_t *dzp;
+ int error;
+
+ ZFS_ENTER(zfsvfs);
+
+ if (zfsvfs->z_shares_dir == 0) {
+ generic_fillattr(path->dentry->d_inode, stat);
+ stat->nlink = stat->size = 2;
+ stat->atime = current_time(ip);
+ ZFS_EXIT(zfsvfs);
+ return (0);
+ }
+
+ error = -zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp);
+ if (error == 0) {
+ error = -zfs_getattr_fast(ZTOI(dzp), stat);
+ iput(ZTOI(dzp));
+ }
+
+ ZFS_EXIT(zfsvfs);
+ ASSERT3S(error, <=, 0);
+
+ return (error);
+}
+ZPL_GETATTR_WRAPPER(zpl_shares_getattr);
+
+/*
+ * The '.zfs/shares' directory file operations.
+ */
+const struct file_operations zpl_fops_shares = {
+ .open = zpl_common_open,
+ .llseek = generic_file_llseek,
+ .read = generic_read_dir,
+#ifdef HAVE_VFS_ITERATE_SHARED
+ .iterate_shared = zpl_shares_iterate,
+#elif defined(HAVE_VFS_ITERATE)
+ .iterate = zpl_shares_iterate,
+#else
+ .readdir = zpl_shares_readdir,
+#endif
+};
+
+/*
+ * The '.zfs/shares' directory inode operations.
+ */
+const struct inode_operations zpl_ops_shares = {
+ .lookup = zpl_shares_lookup,
+ .getattr = zpl_shares_getattr,
+};
diff --git a/module/os/linux/zfs/zpl_export.c b/module/os/linux/zfs/zpl_export.c
new file mode 100644
index 000000000..a264d664c
--- /dev/null
+++ b/module/os/linux/zfs/zpl_export.c
@@ -0,0 +1,177 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2011 Gunnar Beutner
+ * Copyright (c) 2012 Cyril Plisko. All rights reserved.
+ */
+
+
+#include <sys/zfs_vnops.h>
+#include <sys/zfs_znode.h>
+#include <sys/zfs_ctldir.h>
+#include <sys/zpl.h>
+
+
+static int
+#ifdef HAVE_ENCODE_FH_WITH_INODE
+zpl_encode_fh(struct inode *ip, __u32 *fh, int *max_len, struct inode *parent)
+{
+#else
+zpl_encode_fh(struct dentry *dentry, __u32 *fh, int *max_len, int connectable)
+{
+ /* CSTYLED */
+ struct inode *ip = dentry->d_inode;
+#endif /* HAVE_ENCODE_FH_WITH_INODE */
+ fstrans_cookie_t cookie;
+ fid_t *fid = (fid_t *)fh;
+ int len_bytes, rc;
+
+ len_bytes = *max_len * sizeof (__u32);
+
+ if (len_bytes < offsetof(fid_t, fid_data))
+ return (255);
+
+ fid->fid_len = len_bytes - offsetof(fid_t, fid_data);
+ cookie = spl_fstrans_mark();
+
+ if (zfsctl_is_node(ip))
+ rc = zfsctl_fid(ip, fid);
+ else
+ rc = zfs_fid(ip, fid);
+
+ spl_fstrans_unmark(cookie);
+ len_bytes = offsetof(fid_t, fid_data) + fid->fid_len;
+ *max_len = roundup(len_bytes, sizeof (__u32)) / sizeof (__u32);
+
+ return (rc == 0 ? FILEID_INO32_GEN : 255);
+}
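The length juggling here converts between the VFS's 32-bit-word units and the fid's byte counts; a tiny sketch of the same rounding, with a hypothetical 2-byte header standing in for offsetof(fid_t, fid_data):

#include <stdio.h>
#include <stdint.h>

#define	HDR	2u	/* assumed header size before fid_data */

static unsigned int
fh_words(unsigned int fid_len_bytes)
{
	unsigned int len_bytes = HDR + fid_len_bytes;

	/* round up to a whole number of __u32 words, as zpl_encode_fh() does */
	return ((unsigned int)((len_bytes + sizeof (uint32_t) - 1) /
	    sizeof (uint32_t)));
}

int
main(void)
{
	printf("fid_len=10 -> %u words\n", fh_words(10));	/* 12 bytes -> 3 */
	return (0);
}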
+
+static struct dentry *
+zpl_dentry_obtain_alias(struct inode *ip)
+{
+ struct dentry *result;
+
+#ifdef HAVE_D_OBTAIN_ALIAS
+ result = d_obtain_alias(ip);
+#else
+ result = d_alloc_anon(ip);
+
+ if (result == NULL) {
+ iput(ip);
+ result = ERR_PTR(-ENOMEM);
+ }
+#endif /* HAVE_D_OBTAIN_ALIAS */
+
+ return (result);
+}
+
+static struct dentry *
+zpl_fh_to_dentry(struct super_block *sb, struct fid *fh,
+ int fh_len, int fh_type)
+{
+ fid_t *fid = (fid_t *)fh;
+ fstrans_cookie_t cookie;
+ struct inode *ip;
+ int len_bytes, rc;
+
+ len_bytes = fh_len * sizeof (__u32);
+
+ if (fh_type != FILEID_INO32_GEN ||
+ len_bytes < offsetof(fid_t, fid_data) ||
+ len_bytes < offsetof(fid_t, fid_data) + fid->fid_len)
+ return (ERR_PTR(-EINVAL));
+
+ cookie = spl_fstrans_mark();
+ rc = zfs_vget(sb, &ip, fid);
+ spl_fstrans_unmark(cookie);
+
+ if (rc) {
+ /*
+		 * If we see ENOENT it might mean that an NFSv4 client
+ * is using a cached inode value in a file handle and
+ * that the sought after file has had its inode changed
+ * by a third party. So change the error to ESTALE
+ * which will trigger a full lookup by the client and
+ * will find the new filename/inode pair if it still
+ * exists.
+ */
+ if (rc == ENOENT)
+ rc = ESTALE;
+
+ return (ERR_PTR(-rc));
+ }
+
+ ASSERT((ip != NULL) && !IS_ERR(ip));
+
+ return (zpl_dentry_obtain_alias(ip));
+}
+
+static struct dentry *
+zpl_get_parent(struct dentry *child)
+{
+ cred_t *cr = CRED();
+ fstrans_cookie_t cookie;
+ struct inode *ip;
+ int error;
+
+ crhold(cr);
+ cookie = spl_fstrans_mark();
+ error = -zfs_lookup(child->d_inode, "..", &ip, 0, cr, NULL, NULL);
+ spl_fstrans_unmark(cookie);
+ crfree(cr);
+ ASSERT3S(error, <=, 0);
+
+ if (error)
+ return (ERR_PTR(error));
+
+ return (zpl_dentry_obtain_alias(ip));
+}
+
+#ifdef HAVE_COMMIT_METADATA
+static int
+zpl_commit_metadata(struct inode *inode)
+{
+ cred_t *cr = CRED();
+ fstrans_cookie_t cookie;
+ int error;
+
+ if (zfsctl_is_node(inode))
+ return (0);
+
+ crhold(cr);
+ cookie = spl_fstrans_mark();
+ error = -zfs_fsync(inode, 0, cr);
+ spl_fstrans_unmark(cookie);
+ crfree(cr);
+ ASSERT3S(error, <=, 0);
+
+ return (error);
+}
+#endif /* HAVE_COMMIT_METADATA */
+
+const struct export_operations zpl_export_operations = {
+ .encode_fh = zpl_encode_fh,
+ .fh_to_dentry = zpl_fh_to_dentry,
+ .get_parent = zpl_get_parent,
+#ifdef HAVE_COMMIT_METADATA
+ .commit_metadata = zpl_commit_metadata,
+#endif /* HAVE_COMMIT_METADATA */
+};
diff --git a/module/os/linux/zfs/zpl_file.c b/module/os/linux/zfs/zpl_file.c
new file mode 100644
index 000000000..acad4670d
--- /dev/null
+++ b/module/os/linux/zfs/zpl_file.c
@@ -0,0 +1,1075 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2011, Lawrence Livermore National Security, LLC.
+ * Copyright (c) 2015 by Chunwei Chen. All rights reserved.
+ */
+
+
+#ifdef CONFIG_COMPAT
+#include <linux/compat.h>
+#endif
+#include <sys/file.h>
+#include <sys/dmu_objset.h>
+#include <sys/zfs_vfsops.h>
+#include <sys/zfs_vnops.h>
+#include <sys/zfs_znode.h>
+#include <sys/zfs_project.h>
+
+
+static int
+zpl_open(struct inode *ip, struct file *filp)
+{
+ cred_t *cr = CRED();
+ int error;
+ fstrans_cookie_t cookie;
+
+ error = generic_file_open(ip, filp);
+ if (error)
+ return (error);
+
+ crhold(cr);
+ cookie = spl_fstrans_mark();
+ error = -zfs_open(ip, filp->f_mode, filp->f_flags, cr);
+ spl_fstrans_unmark(cookie);
+ crfree(cr);
+ ASSERT3S(error, <=, 0);
+
+ return (error);
+}
+
+static int
+zpl_release(struct inode *ip, struct file *filp)
+{
+ cred_t *cr = CRED();
+ int error;
+ fstrans_cookie_t cookie;
+
+ cookie = spl_fstrans_mark();
+ if (ITOZ(ip)->z_atime_dirty)
+ zfs_mark_inode_dirty(ip);
+
+ crhold(cr);
+ error = -zfs_close(ip, filp->f_flags, cr);
+ spl_fstrans_unmark(cookie);
+ crfree(cr);
+ ASSERT3S(error, <=, 0);
+
+ return (error);
+}
+
+static int
+zpl_iterate(struct file *filp, zpl_dir_context_t *ctx)
+{
+ cred_t *cr = CRED();
+ int error;
+ fstrans_cookie_t cookie;
+
+ crhold(cr);
+ cookie = spl_fstrans_mark();
+ error = -zfs_readdir(file_inode(filp), ctx, cr);
+ spl_fstrans_unmark(cookie);
+ crfree(cr);
+ ASSERT3S(error, <=, 0);
+
+ return (error);
+}
+
+#if !defined(HAVE_VFS_ITERATE) && !defined(HAVE_VFS_ITERATE_SHARED)
+static int
+zpl_readdir(struct file *filp, void *dirent, filldir_t filldir)
+{
+ zpl_dir_context_t ctx =
+ ZPL_DIR_CONTEXT_INIT(dirent, filldir, filp->f_pos);
+ int error;
+
+ error = zpl_iterate(filp, &ctx);
+ filp->f_pos = ctx.pos;
+
+ return (error);
+}
+#endif /* !HAVE_VFS_ITERATE && !HAVE_VFS_ITERATE_SHARED */
+
+#if defined(HAVE_FSYNC_WITH_DENTRY)
+/*
+ * Linux 2.6.x - 2.6.34 API,
+ * Through 2.6.34 the nfsd kernel server would pass a NULL 'file struct *'
+ * to the fops->fsync() hook. For this reason, we must be careful not to
+ * use filp unconditionally.
+ */
+static int
+zpl_fsync(struct file *filp, struct dentry *dentry, int datasync)
+{
+ cred_t *cr = CRED();
+ int error;
+ fstrans_cookie_t cookie;
+
+ crhold(cr);
+ cookie = spl_fstrans_mark();
+ error = -zfs_fsync(dentry->d_inode, datasync, cr);
+ spl_fstrans_unmark(cookie);
+ crfree(cr);
+ ASSERT3S(error, <=, 0);
+
+ return (error);
+}
+
+#ifdef HAVE_FILE_AIO_FSYNC
+static int
+zpl_aio_fsync(struct kiocb *kiocb, int datasync)
+{
+ struct file *filp = kiocb->ki_filp;
+ return (zpl_fsync(filp, file_dentry(filp), datasync));
+}
+#endif
+
+#elif defined(HAVE_FSYNC_WITHOUT_DENTRY)
+/*
+ * Linux 2.6.35 - 3.0 API,
+ * As of 2.6.35 the dentry argument to the fops->fsync() hook was deemed
+ * redundant. The dentry is still accessible via filp->f_path.dentry,
+ * and we are guaranteed that filp will never be NULL.
+ */
+static int
+zpl_fsync(struct file *filp, int datasync)
+{
+ struct inode *inode = filp->f_mapping->host;
+ cred_t *cr = CRED();
+ int error;
+ fstrans_cookie_t cookie;
+
+ crhold(cr);
+ cookie = spl_fstrans_mark();
+ error = -zfs_fsync(inode, datasync, cr);
+ spl_fstrans_unmark(cookie);
+ crfree(cr);
+ ASSERT3S(error, <=, 0);
+
+ return (error);
+}
+
+#ifdef HAVE_FILE_AIO_FSYNC
+static int
+zpl_aio_fsync(struct kiocb *kiocb, int datasync)
+{
+ return (zpl_fsync(kiocb->ki_filp, datasync));
+}
+#endif
+
+#elif defined(HAVE_FSYNC_RANGE)
+/*
+ * Linux 3.1 - 3.x API,
+ * As of 3.1 the responsibility to call filemap_write_and_wait_range() has
+ * been pushed down into the .fsync() vfs hook. Additionally, the i_mutex
+ * lock is no longer held by the caller. For zfs we don't require the
+ * lock to be held, so we don't acquire it.
+ */
+static int
+zpl_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
+{
+ struct inode *inode = filp->f_mapping->host;
+ cred_t *cr = CRED();
+ int error;
+ fstrans_cookie_t cookie;
+
+ error = filemap_write_and_wait_range(inode->i_mapping, start, end);
+ if (error)
+ return (error);
+
+ crhold(cr);
+ cookie = spl_fstrans_mark();
+ error = -zfs_fsync(inode, datasync, cr);
+ spl_fstrans_unmark(cookie);
+ crfree(cr);
+ ASSERT3S(error, <=, 0);
+
+ return (error);
+}
+
+#ifdef HAVE_FILE_AIO_FSYNC
+static int
+zpl_aio_fsync(struct kiocb *kiocb, int datasync)
+{
+ return (zpl_fsync(kiocb->ki_filp, kiocb->ki_pos, -1, datasync));
+}
+#endif
+
+#else
+#error "Unsupported fops->fsync() implementation"
+#endif
+
+static inline int
+zfs_io_flags(struct kiocb *kiocb)
+{
+ int flags = 0;
+
+#if defined(IOCB_DSYNC)
+ if (kiocb->ki_flags & IOCB_DSYNC)
+ flags |= FDSYNC;
+#endif
+#if defined(IOCB_SYNC)
+ if (kiocb->ki_flags & IOCB_SYNC)
+ flags |= FSYNC;
+#endif
+#if defined(IOCB_APPEND)
+ if (kiocb->ki_flags & IOCB_APPEND)
+ flags |= FAPPEND;
+#endif
+#if defined(IOCB_DIRECT)
+ if (kiocb->ki_flags & IOCB_DIRECT)
+ flags |= FDIRECT;
+#endif
+ return (flags);
+}
+
+static ssize_t
+zpl_read_common_iovec(struct inode *ip, const struct iovec *iovp, size_t count,
+ unsigned long nr_segs, loff_t *ppos, uio_seg_t segment, int flags,
+ cred_t *cr, size_t skip)
+{
+ ssize_t read;
+ uio_t uio = { { 0 }, 0 };
+ int error;
+ fstrans_cookie_t cookie;
+
+ uio.uio_iov = iovp;
+ uio.uio_iovcnt = nr_segs;
+ uio.uio_loffset = *ppos;
+ uio.uio_segflg = segment;
+ uio.uio_limit = MAXOFFSET_T;
+ uio.uio_resid = count;
+ uio.uio_skip = skip;
+
+ cookie = spl_fstrans_mark();
+ error = -zfs_read(ip, &uio, flags, cr);
+ spl_fstrans_unmark(cookie);
+ if (error < 0)
+ return (error);
+
+ read = count - uio.uio_resid;
+ *ppos += read;
+
+ return (read);
+}
+
+inline ssize_t
+zpl_read_common(struct inode *ip, const char *buf, size_t len, loff_t *ppos,
+ uio_seg_t segment, int flags, cred_t *cr)
+{
+ struct iovec iov;
+
+ iov.iov_base = (void *)buf;
+ iov.iov_len = len;
+
+ return (zpl_read_common_iovec(ip, &iov, len, 1, ppos, segment,
+ flags, cr, 0));
+}
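The `count - uio.uio_resid` accounting above is the in-kernel analogue of a short read from readv(2); a user-space parallel, with the input path as an assumption:

#include <fcntl.h>
#include <stdio.h>
#include <sys/uio.h>
#include <unistd.h>

/*
 * Ask for 'count' bytes across several iovecs; the return value plays the
 * role of count - uio_resid, and the file offset advances only by what
 * was actually transferred.
 */
int
main(void)
{
	char a[8], b[8];
	struct iovec iov[2] = {
		{ .iov_base = a, .iov_len = sizeof (a) },
		{ .iov_base = b, .iov_len = sizeof (b) },
	};
	int fd = open("/etc/hosts", O_RDONLY);	/* path is an assumption */
	ssize_t got;

	if (fd < 0)
		return (1);
	got = readv(fd, iov, 2);	/* may be short, like uio_resid > 0 */
	printf("requested %zu, got %zd\n", sizeof (a) + sizeof (b), got);
	close(fd);
	return (0);
}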
+
+static ssize_t
+zpl_iter_read_common(struct kiocb *kiocb, const struct iovec *iovp,
+ unsigned long nr_segs, size_t count, uio_seg_t seg, size_t skip)
+{
+ cred_t *cr = CRED();
+ struct file *filp = kiocb->ki_filp;
+ struct inode *ip = filp->f_mapping->host;
+ zfsvfs_t *zfsvfs = ZTOZSB(ITOZ(ip));
+ ssize_t read;
+ unsigned int f_flags = filp->f_flags;
+
+ f_flags |= zfs_io_flags(kiocb);
+ crhold(cr);
+ read = zpl_read_common_iovec(filp->f_mapping->host, iovp, count,
+ nr_segs, &kiocb->ki_pos, seg, f_flags, cr, skip);
+ crfree(cr);
+
+ /*
+ * If relatime is enabled, call file_accessed() only if
+ * zfs_relatime_need_update() is true. This is needed since datasets
+ * with inherited "relatime" property aren't necessarily mounted with
+	 * the MNT_RELATIME flag (e.g. after `zfs set relatime=...`), which is
+	 * what the relatime test done in the VFS by relatime_need_update()
+	 * is based on.
+ */
+ if (!IS_NOATIME(ip) && zfsvfs->z_relatime) {
+ if (zfs_relatime_need_update(ip))
+ file_accessed(filp);
+ } else {
+ file_accessed(filp);
+ }
+
+ return (read);
+}
+
+#if defined(HAVE_VFS_RW_ITERATE)
+static ssize_t
+zpl_iter_read(struct kiocb *kiocb, struct iov_iter *to)
+{
+ ssize_t ret;
+ uio_seg_t seg = UIO_USERSPACE;
+ if (to->type & ITER_KVEC)
+ seg = UIO_SYSSPACE;
+ if (to->type & ITER_BVEC)
+ seg = UIO_BVEC;
+ ret = zpl_iter_read_common(kiocb, to->iov, to->nr_segs,
+ iov_iter_count(to), seg, to->iov_offset);
+ if (ret > 0)
+ iov_iter_advance(to, ret);
+ return (ret);
+}
+#else
+static ssize_t
+zpl_aio_read(struct kiocb *kiocb, const struct iovec *iovp,
+ unsigned long nr_segs, loff_t pos)
+{
+ ssize_t ret;
+ size_t count;
+
+ ret = generic_segment_checks(iovp, &nr_segs, &count, VERIFY_WRITE);
+ if (ret)
+ return (ret);
+
+ return (zpl_iter_read_common(kiocb, iovp, nr_segs, count,
+ UIO_USERSPACE, 0));
+}
+#endif /* HAVE_VFS_RW_ITERATE */
+
+static ssize_t
+zpl_write_common_iovec(struct inode *ip, const struct iovec *iovp, size_t count,
+ unsigned long nr_segs, loff_t *ppos, uio_seg_t segment, int flags,
+ cred_t *cr, size_t skip)
+{
+ ssize_t wrote;
+ uio_t uio = { { 0 }, 0 };
+ int error;
+ fstrans_cookie_t cookie;
+
+ if (flags & O_APPEND)
+ *ppos = i_size_read(ip);
+
+ uio.uio_iov = iovp;
+ uio.uio_iovcnt = nr_segs;
+ uio.uio_loffset = *ppos;
+ uio.uio_segflg = segment;
+ uio.uio_limit = MAXOFFSET_T;
+ uio.uio_resid = count;
+ uio.uio_skip = skip;
+
+ cookie = spl_fstrans_mark();
+ error = -zfs_write(ip, &uio, flags, cr);
+ spl_fstrans_unmark(cookie);
+ if (error < 0)
+ return (error);
+
+ wrote = count - uio.uio_resid;
+ *ppos += wrote;
+
+ return (wrote);
+}
+
+inline ssize_t
+zpl_write_common(struct inode *ip, const char *buf, size_t len, loff_t *ppos,
+ uio_seg_t segment, int flags, cred_t *cr)
+{
+ struct iovec iov;
+
+ iov.iov_base = (void *)buf;
+ iov.iov_len = len;
+
+ return (zpl_write_common_iovec(ip, &iov, len, 1, ppos, segment,
+ flags, cr, 0));
+}
+
+static ssize_t
+zpl_iter_write_common(struct kiocb *kiocb, const struct iovec *iovp,
+ unsigned long nr_segs, size_t count, uio_seg_t seg, size_t skip)
+{
+ cred_t *cr = CRED();
+ struct file *filp = kiocb->ki_filp;
+ ssize_t wrote;
+ unsigned int f_flags = filp->f_flags;
+
+ f_flags |= zfs_io_flags(kiocb);
+ crhold(cr);
+ wrote = zpl_write_common_iovec(filp->f_mapping->host, iovp, count,
+ nr_segs, &kiocb->ki_pos, seg, f_flags, cr, skip);
+ crfree(cr);
+
+ return (wrote);
+}
+
+#if defined(HAVE_VFS_RW_ITERATE)
+static ssize_t
+zpl_iter_write(struct kiocb *kiocb, struct iov_iter *from)
+{
+ size_t count;
+ ssize_t ret;
+ uio_seg_t seg = UIO_USERSPACE;
+
+#ifndef HAVE_GENERIC_WRITE_CHECKS_KIOCB
+ struct file *file = kiocb->ki_filp;
+ struct address_space *mapping = file->f_mapping;
+ struct inode *ip = mapping->host;
+ int isblk = S_ISBLK(ip->i_mode);
+
+ count = iov_iter_count(from);
+ ret = generic_write_checks(file, &kiocb->ki_pos, &count, isblk);
+ if (ret)
+ return (ret);
+#else
+ /*
+	 * XXX - ideally this check should be in the same lock region as the
+	 * write operations, so that there's no TOCTOU race when doing an
+	 * append while someone else grows the file.
+ */
+ ret = generic_write_checks(kiocb, from);
+ if (ret <= 0)
+ return (ret);
+ count = ret;
+#endif
+
+ if (from->type & ITER_KVEC)
+ seg = UIO_SYSSPACE;
+ if (from->type & ITER_BVEC)
+ seg = UIO_BVEC;
+
+ ret = zpl_iter_write_common(kiocb, from->iov, from->nr_segs,
+ count, seg, from->iov_offset);
+ if (ret > 0)
+ iov_iter_advance(from, ret);
+
+ return (ret);
+}
+#else
+static ssize_t
+zpl_aio_write(struct kiocb *kiocb, const struct iovec *iovp,
+ unsigned long nr_segs, loff_t pos)
+{
+ struct file *file = kiocb->ki_filp;
+ struct address_space *mapping = file->f_mapping;
+ struct inode *ip = mapping->host;
+ int isblk = S_ISBLK(ip->i_mode);
+ size_t count;
+ ssize_t ret;
+
+ ret = generic_segment_checks(iovp, &nr_segs, &count, VERIFY_READ);
+ if (ret)
+ return (ret);
+
+ ret = generic_write_checks(file, &pos, &count, isblk);
+ if (ret)
+ return (ret);
+
+ return (zpl_iter_write_common(kiocb, iovp, nr_segs, count,
+ UIO_USERSPACE, 0));
+}
+#endif /* HAVE_VFS_RW_ITERATE */
+
+#if defined(HAVE_VFS_RW_ITERATE)
+static ssize_t
+zpl_direct_IO_impl(int rw, struct kiocb *kiocb, struct iov_iter *iter)
+{
+ if (rw == WRITE)
+ return (zpl_iter_write(kiocb, iter));
+ else
+ return (zpl_iter_read(kiocb, iter));
+}
+#if defined(HAVE_VFS_DIRECT_IO_ITER)
+static ssize_t
+zpl_direct_IO(struct kiocb *kiocb, struct iov_iter *iter)
+{
+ return (zpl_direct_IO_impl(iov_iter_rw(iter), kiocb, iter));
+}
+#elif defined(HAVE_VFS_DIRECT_IO_ITER_OFFSET)
+static ssize_t
+zpl_direct_IO(struct kiocb *kiocb, struct iov_iter *iter, loff_t pos)
+{
+ ASSERT3S(pos, ==, kiocb->ki_pos);
+ return (zpl_direct_IO_impl(iov_iter_rw(iter), kiocb, iter));
+}
+#elif defined(HAVE_VFS_DIRECT_IO_ITER_RW_OFFSET)
+static ssize_t
+zpl_direct_IO(int rw, struct kiocb *kiocb, struct iov_iter *iter, loff_t pos)
+{
+ ASSERT3S(pos, ==, kiocb->ki_pos);
+ return (zpl_direct_IO_impl(rw, kiocb, iter));
+}
+#else
+#error "Unknown direct IO interface"
+#endif
+
+#else
+
+#if defined(HAVE_VFS_DIRECT_IO_IOVEC)
+static ssize_t
+zpl_direct_IO(int rw, struct kiocb *kiocb, const struct iovec *iovp,
+ loff_t pos, unsigned long nr_segs)
+{
+ if (rw == WRITE)
+ return (zpl_aio_write(kiocb, iovp, nr_segs, pos));
+ else
+ return (zpl_aio_read(kiocb, iovp, nr_segs, pos));
+}
+#else
+#error "Unknown direct IO interface"
+#endif
+
+#endif /* HAVE_VFS_RW_ITERATE */
+
+static loff_t
+zpl_llseek(struct file *filp, loff_t offset, int whence)
+{
+#if defined(SEEK_HOLE) && defined(SEEK_DATA)
+ fstrans_cookie_t cookie;
+
+ if (whence == SEEK_DATA || whence == SEEK_HOLE) {
+ struct inode *ip = filp->f_mapping->host;
+ loff_t maxbytes = ip->i_sb->s_maxbytes;
+ loff_t error;
+
+ spl_inode_lock_shared(ip);
+ cookie = spl_fstrans_mark();
+ error = -zfs_holey(ip, whence, &offset);
+ spl_fstrans_unmark(cookie);
+ if (error == 0)
+ error = lseek_execute(filp, ip, offset, maxbytes);
+ spl_inode_unlock_shared(ip);
+
+ return (error);
+ }
+#endif /* SEEK_HOLE && SEEK_DATA */
+
+ return (generic_file_llseek(filp, offset, whence));
+}
+
+/*
+ * It's worth taking a moment to describe how mmap is implemented
+ * for zfs because it differs considerably from other Linux filesystems.
+ * However, this issue is handled the same way under OpenSolaris.
+ *
+ * The issue is that by design zfs bypasses the Linux page cache and
+ * leaves all caching up to the ARC. This has been shown to work
+ * well for the common read(2)/write(2) case. However, mmap(2)
+ * is problem because it relies on being tightly integrated with the
+ * page cache. To handle this we cache mmap'ed files twice, once in
+ * the ARC and a second time in the page cache. The code is careful
+ * to keep both copies synchronized.
+ *
+ * When a file with an mmap'ed region is written to using write(2)
+ * both the data in the ARC and existing pages in the page cache
+ * are updated. For a read(2) data will be read first from the page
+ * cache, then the ARC if needed. Neither a write(2) nor a read(2)
+ * will ever result in new pages being added to the page cache.
+ *
+ * New pages are added to the page cache only via .readpage() which
+ * is called when the vfs needs to read a page off disk to back the
+ * virtual memory region. These pages may be modified without
+ * notifying the ARC and will be written out periodically via
+ * .writepage(). This will occur due to either a sync or the usual
+ * page aging behavior. Note that because a read(2) of an mmap'ed
+ * file always checks the page cache first, correct data will still
+ * be returned even when the ARC is out of date.
+ *
+ * While this implementation ensures correct behavior, it does have
+ * some drawbacks. The most obvious of these is that it increases
+ * the required memory footprint when accessing mmap'ed files. It
+ * also adds additional complexity to the code keeping
+ * both caches synchronized.
+ *
+ * Longer term it may be possible to cleanly resolve this wart by
+ * mapping page cache pages directly on to the ARC buffers. The
+ * Linux address space operations are flexible enough to allow
+ * selection of which pages back a particular index. The trick
+ * would be working out the details of which subsystem is in
+ * charge, the ARC, the page cache, or both. It may also prove
+ * helpful to move the ARC buffers to scatter-gather lists
+ * rather than a vmalloc'ed region.
+ */
+static int
+zpl_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+ struct inode *ip = filp->f_mapping->host;
+ znode_t *zp = ITOZ(ip);
+ int error;
+ fstrans_cookie_t cookie;
+
+ cookie = spl_fstrans_mark();
+ error = -zfs_map(ip, vma->vm_pgoff, (caddr_t *)vma->vm_start,
+ (size_t)(vma->vm_end - vma->vm_start), vma->vm_flags);
+ spl_fstrans_unmark(cookie);
+ if (error)
+ return (error);
+
+ error = generic_file_mmap(filp, vma);
+ if (error)
+ return (error);
+
+ mutex_enter(&zp->z_lock);
+ zp->z_is_mapped = B_TRUE;
+ mutex_exit(&zp->z_lock);
+
+ return (error);
+}
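The coherence contract described above is visible from user space: a write(2) to a mapped file must show up through an existing mapping, because both the ARC copy and the cached pages are updated. A small sketch, assuming only that the path points at a file on a ZFS mount:

#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

/*
 * Observable consequence of the double-caching scheme: a write(2) to a
 * mapped region shows up through the existing mapping. The path is an
 * assumption; point it at any file on a ZFS mount.
 */
int
main(void)
{
	int fd = open("/tank/testfile", O_RDWR | O_CREAT, 0644);
	char *map;

	if (fd < 0)
		return (1);
	if (ftruncate(fd, 4096) != 0)
		return (1);

	map = mmap(NULL, 4096, PROT_READ, MAP_SHARED, fd, 0);
	if (map == MAP_FAILED)
		return (1);

	(void) pwrite(fd, "hello", 5, 0);	/* updates ARC + page cache */
	printf("mapping sees: %.5s\n", map);	/* expect "hello" */

	munmap(map, 4096);
	close(fd);
	return (0);
}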
+
+/*
+ * Populate a page with data for the Linux page cache. This function is
+ * only used to support mmap(2). There will be an identical copy of the
+ * data in the ARC which is kept up to date via .write() and .writepage().
+ *
+ * Currently this function relies on zfs_getpage() to read in the
+ * page. This works, but a more correct approach would be to update
+ * zfs_fillpage() to be Linux friendly and use that interface.
+ */
+static int
+zpl_readpage(struct file *filp, struct page *pp)
+{
+ struct inode *ip;
+ struct page *pl[1];
+ int error = 0;
+ fstrans_cookie_t cookie;
+
+ ASSERT(PageLocked(pp));
+ ip = pp->mapping->host;
+ pl[0] = pp;
+
+ cookie = spl_fstrans_mark();
+ error = -zfs_getpage(ip, pl, 1);
+ spl_fstrans_unmark(cookie);
+
+ if (error) {
+ SetPageError(pp);
+ ClearPageUptodate(pp);
+ } else {
+ ClearPageError(pp);
+ SetPageUptodate(pp);
+ flush_dcache_page(pp);
+ }
+
+ unlock_page(pp);
+ return (error);
+}
+
+/*
+ * Populate a set of pages with data for the Linux page cache. This
+ * function will only be called for read ahead and never for demand
+ * paging. For simplicity, the code relies on read_cache_pages() to
+ * correctly lock each page for IO and call zpl_readpage().
+ */
+static int
+zpl_readpages(struct file *filp, struct address_space *mapping,
+ struct list_head *pages, unsigned nr_pages)
+{
+ return (read_cache_pages(mapping, pages,
+ (filler_t *)zpl_readpage, filp));
+}
+
+int
+zpl_putpage(struct page *pp, struct writeback_control *wbc, void *data)
+{
+ struct address_space *mapping = data;
+ fstrans_cookie_t cookie;
+
+ ASSERT(PageLocked(pp));
+ ASSERT(!PageWriteback(pp));
+
+ cookie = spl_fstrans_mark();
+ (void) zfs_putpage(mapping->host, pp, wbc);
+ spl_fstrans_unmark(cookie);
+
+ return (0);
+}
+
+static int
+zpl_writepages(struct address_space *mapping, struct writeback_control *wbc)
+{
+ znode_t *zp = ITOZ(mapping->host);
+ zfsvfs_t *zfsvfs = ITOZSB(mapping->host);
+ enum writeback_sync_modes sync_mode;
+ int result;
+
+ ZFS_ENTER(zfsvfs);
+ if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ wbc->sync_mode = WB_SYNC_ALL;
+ ZFS_EXIT(zfsvfs);
+ sync_mode = wbc->sync_mode;
+
+ /*
+ * We don't want to run write_cache_pages() in SYNC mode here, because
+ * that would make putpage() wait for a single page to be committed to
+ * disk every single time, resulting in atrocious performance. Instead
+ * we run it once in non-SYNC mode so that the ZIL gets all the data,
+ * and then we commit it all in one go.
+ */
+ wbc->sync_mode = WB_SYNC_NONE;
+ result = write_cache_pages(mapping, wbc, zpl_putpage, mapping);
+ if (sync_mode != wbc->sync_mode) {
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+ if (zfsvfs->z_log != NULL)
+ zil_commit(zfsvfs->z_log, zp->z_id);
+ ZFS_EXIT(zfsvfs);
+
+ /*
+ * We need to call write_cache_pages() again (we can't just
+ * return after the commit) because the previous call in
+ * non-SYNC mode does not guarantee that we got all the dirty
+ * pages (see the implementation of write_cache_pages() for
+ * details). That being said, this is a no-op in most cases.
+ */
+ wbc->sync_mode = sync_mode;
+ result = write_cache_pages(mapping, wbc, zpl_putpage, mapping);
+ }
+ return (result);
+}
+
+/*
+ * Write out dirty pages to the ARC; this function is only required to
+ * support mmap(2). Mapped pages may be dirtied by memory operations
+ * which never call .write(). These dirty pages are kept in sync with
+ * the ARC buffers via this hook.
+ */
+static int
+zpl_writepage(struct page *pp, struct writeback_control *wbc)
+{
+ if (ITOZSB(pp->mapping->host)->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ wbc->sync_mode = WB_SYNC_ALL;
+
+ return (zpl_putpage(pp, wbc, pp->mapping));
+}
+
+/*
+ * The only flag combination which matches the behavior of zfs_space()
+ * is FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE. The FALLOC_FL_PUNCH_HOLE
+ * flag was introduced in the 2.6.38 kernel.
+ */
+#if defined(HAVE_FILE_FALLOCATE) || defined(HAVE_INODE_FALLOCATE)
+long
+zpl_fallocate_common(struct inode *ip, int mode, loff_t offset, loff_t len)
+{
+ int error = -EOPNOTSUPP;
+
+#if defined(FALLOC_FL_PUNCH_HOLE) && defined(FALLOC_FL_KEEP_SIZE)
+ cred_t *cr = CRED();
+ flock64_t bf;
+ loff_t olen;
+ fstrans_cookie_t cookie;
+
+ if (mode != (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
+ return (error);
+
+ if (offset < 0 || len <= 0)
+ return (-EINVAL);
+
+ spl_inode_lock(ip);
+ olen = i_size_read(ip);
+
+ if (offset > olen) {
+ spl_inode_unlock(ip);
+ return (0);
+ }
+ if (offset + len > olen)
+ len = olen - offset;
+ bf.l_type = F_WRLCK;
+ bf.l_whence = SEEK_SET;
+ bf.l_start = offset;
+ bf.l_len = len;
+ bf.l_pid = 0;
+
+ crhold(cr);
+ cookie = spl_fstrans_mark();
+ error = -zfs_space(ip, F_FREESP, &bf, FWRITE, offset, cr);
+ spl_fstrans_unmark(cookie);
+ spl_inode_unlock(ip);
+
+ crfree(cr);
+#endif /* defined(FALLOC_FL_PUNCH_HOLE) && defined(FALLOC_FL_KEEP_SIZE) */
+
+ ASSERT3S(error, <=, 0);
+ return (error);
+}
+#endif /* defined(HAVE_FILE_FALLOCATE) || defined(HAVE_INODE_FALLOCATE) */
+
+#ifdef HAVE_FILE_FALLOCATE
+static long
+zpl_fallocate(struct file *filp, int mode, loff_t offset, loff_t len)
+{
+	return (zpl_fallocate_common(file_inode(filp),
+	    mode, offset, len));
+}
+#endif /* HAVE_FILE_FALLOCATE */
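From user space, the only request the hook above accepts therefore looks like the following (real fallocate(2) flags; the file path is an assumption):

#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/falloc.h>
#include <stdio.h>
#include <unistd.h>

/*
 * Punch a 1 MiB hole without changing the file size; any other mode
 * combination is rejected with EOPNOTSUPP by zpl_fallocate_common().
 */
int
main(void)
{
	int fd = open("/tank/bigfile", O_WRONLY);	/* path is an assumption */

	if (fd < 0)
		return (1);
	if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
	    0, 1024 * 1024) != 0)
		perror("fallocate");
	close(fd);
	return (0);
}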
+
+#define ZFS_FL_USER_VISIBLE (FS_FL_USER_VISIBLE | ZFS_PROJINHERIT_FL)
+#define ZFS_FL_USER_MODIFIABLE (FS_FL_USER_MODIFIABLE | ZFS_PROJINHERIT_FL)
+
+static uint32_t
+__zpl_ioctl_getflags(struct inode *ip)
+{
+ uint64_t zfs_flags = ITOZ(ip)->z_pflags;
+ uint32_t ioctl_flags = 0;
+
+ if (zfs_flags & ZFS_IMMUTABLE)
+ ioctl_flags |= FS_IMMUTABLE_FL;
+
+ if (zfs_flags & ZFS_APPENDONLY)
+ ioctl_flags |= FS_APPEND_FL;
+
+ if (zfs_flags & ZFS_NODUMP)
+ ioctl_flags |= FS_NODUMP_FL;
+
+ if (zfs_flags & ZFS_PROJINHERIT)
+ ioctl_flags |= ZFS_PROJINHERIT_FL;
+
+ return (ioctl_flags & ZFS_FL_USER_VISIBLE);
+}
+
+/*
+ * Map zfs file z_pflags (xvattr_t) to linux file attributes. Only file
+ * attributes common to both Linux and Solaris are mapped.
+ */
+static int
+zpl_ioctl_getflags(struct file *filp, void __user *arg)
+{
+ uint32_t flags;
+ int err;
+
+ flags = __zpl_ioctl_getflags(file_inode(filp));
+ err = copy_to_user(arg, &flags, sizeof (flags));
+
+ return (err);
+}
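From user space this maps straight onto the standard attribute ioctl, the same one lsattr(1) and chattr(1) use. A minimal sketch; the path is an assumption:

#include <fcntl.h>
#include <linux/fs.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>

/* Read the mapped attribute flags back through FS_IOC_GETFLAGS. */
int
main(void)
{
	unsigned int flags;
	int fd = open("/tank/testfile", O_RDONLY);	/* path is an assumption */

	if (fd < 0)
		return (1);
	if (ioctl(fd, FS_IOC_GETFLAGS, &flags) == 0)
		printf("flags: %#x (FS_APPEND_FL set: %d)\n",
		    flags, !!(flags & FS_APPEND_FL));
	close(fd);
	return (0);
}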
+
+/*
+ * fchange() is a helper macro to detect if we have been asked to change a
+ * flag. This is ugly, but the requirement that we do this is a consequence of
+ * how the Linux file attribute interface was designed. Another consequence is
+ * that concurrent modification of files suffers from a TOCTOU race. Neither
+ * are things we can fix without modifying the kernel-userland interface, which
+ * is outside of our jurisdiction.
+ */
+
+#define fchange(f0, f1, b0, b1) (!((f0) & (b0)) != !((f1) & (b1)))
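fchange() is true exactly when a flag's state differs between the requested and current sets, even though the two sets encode the flag at different bit positions. A self-contained check of both cases; the bit values below are hypothetical, only the differing-encodings point matters, not the real FS_ and ZFS_ values:

#include <assert.h>

#define	fchange(f0, f1, b0, b1) (!((f0) & (b0)) != !((f1) & (b1)))

#define	REQ_IMMUTABLE	0x0010	/* hypothetical bit in the ioctl flag word */
#define	CUR_IMMUTABLE	0x0008	/* hypothetical bit in the z_pflags word */

int
main(void)
{
	/* set -> set: states agree, no change requested */
	assert(!fchange(REQ_IMMUTABLE, CUR_IMMUTABLE,
	    REQ_IMMUTABLE, CUR_IMMUTABLE));

	/* requested clear, currently set: a change is being asked for */
	assert(fchange(0, CUR_IMMUTABLE, REQ_IMMUTABLE, CUR_IMMUTABLE));
	return (0);
}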
+
+static int
+__zpl_ioctl_setflags(struct inode *ip, uint32_t ioctl_flags, xvattr_t *xva)
+{
+ uint64_t zfs_flags = ITOZ(ip)->z_pflags;
+ xoptattr_t *xoap;
+
+ if (ioctl_flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | FS_NODUMP_FL |
+ ZFS_PROJINHERIT_FL))
+ return (-EOPNOTSUPP);
+
+ if (ioctl_flags & ~ZFS_FL_USER_MODIFIABLE)
+ return (-EACCES);
+
+ if ((fchange(ioctl_flags, zfs_flags, FS_IMMUTABLE_FL, ZFS_IMMUTABLE) ||
+ fchange(ioctl_flags, zfs_flags, FS_APPEND_FL, ZFS_APPENDONLY)) &&
+ !capable(CAP_LINUX_IMMUTABLE))
+ return (-EACCES);
+
+ if (!zpl_inode_owner_or_capable(ip))
+ return (-EACCES);
+
+ xva_init(xva);
+ xoap = xva_getxoptattr(xva);
+
+ XVA_SET_REQ(xva, XAT_IMMUTABLE);
+ if (ioctl_flags & FS_IMMUTABLE_FL)
+ xoap->xoa_immutable = B_TRUE;
+
+ XVA_SET_REQ(xva, XAT_APPENDONLY);
+ if (ioctl_flags & FS_APPEND_FL)
+ xoap->xoa_appendonly = B_TRUE;
+
+ XVA_SET_REQ(xva, XAT_NODUMP);
+ if (ioctl_flags & FS_NODUMP_FL)
+ xoap->xoa_nodump = B_TRUE;
+
+ XVA_SET_REQ(xva, XAT_PROJINHERIT);
+ if (ioctl_flags & ZFS_PROJINHERIT_FL)
+ xoap->xoa_projinherit = B_TRUE;
+
+ return (0);
+}
+
+static int
+zpl_ioctl_setflags(struct file *filp, void __user *arg)
+{
+ struct inode *ip = file_inode(filp);
+ uint32_t flags;
+ cred_t *cr = CRED();
+ xvattr_t xva;
+ int err;
+ fstrans_cookie_t cookie;
+
+ if (copy_from_user(&flags, arg, sizeof (flags)))
+ return (-EFAULT);
+
+ err = __zpl_ioctl_setflags(ip, flags, &xva);
+ if (err)
+ return (err);
+
+ crhold(cr);
+ cookie = spl_fstrans_mark();
+ err = -zfs_setattr(ip, (vattr_t *)&xva, 0, cr);
+ spl_fstrans_unmark(cookie);
+ crfree(cr);
+
+ return (err);
+}
+
+static int
+zpl_ioctl_getxattr(struct file *filp, void __user *arg)
+{
+ zfsxattr_t fsx = { 0 };
+ struct inode *ip = file_inode(filp);
+ int err;
+
+ fsx.fsx_xflags = __zpl_ioctl_getflags(ip);
+ fsx.fsx_projid = ITOZ(ip)->z_projid;
+ err = copy_to_user(arg, &fsx, sizeof (fsx));
+
+ return (err);
+}
+
+static int
+zpl_ioctl_setxattr(struct file *filp, void __user *arg)
+{
+ struct inode *ip = file_inode(filp);
+ zfsxattr_t fsx;
+ cred_t *cr = CRED();
+ xvattr_t xva;
+ xoptattr_t *xoap;
+ int err;
+ fstrans_cookie_t cookie;
+
+ if (copy_from_user(&fsx, arg, sizeof (fsx)))
+ return (-EFAULT);
+
+ if (!zpl_is_valid_projid(fsx.fsx_projid))
+ return (-EINVAL);
+
+ err = __zpl_ioctl_setflags(ip, fsx.fsx_xflags, &xva);
+ if (err)
+ return (err);
+
+ xoap = xva_getxoptattr(&xva);
+ XVA_SET_REQ(&xva, XAT_PROJID);
+ xoap->xoa_projid = fsx.fsx_projid;
+
+ crhold(cr);
+ cookie = spl_fstrans_mark();
+ err = -zfs_setattr(ip, (vattr_t *)&xva, 0, cr);
+ spl_fstrans_unmark(cookie);
+ crfree(cr);
+
+ return (err);
+}
+
+static long
+zpl_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+{
+ switch (cmd) {
+ case FS_IOC_GETFLAGS:
+ return (zpl_ioctl_getflags(filp, (void *)arg));
+ case FS_IOC_SETFLAGS:
+ return (zpl_ioctl_setflags(filp, (void *)arg));
+ case ZFS_IOC_FSGETXATTR:
+ return (zpl_ioctl_getxattr(filp, (void *)arg));
+ case ZFS_IOC_FSSETXATTR:
+ return (zpl_ioctl_setxattr(filp, (void *)arg));
+ default:
+ return (-ENOTTY);
+ }
+}
+
+#ifdef CONFIG_COMPAT
+static long
+zpl_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+{
+ switch (cmd) {
+ case FS_IOC32_GETFLAGS:
+ cmd = FS_IOC_GETFLAGS;
+ break;
+ case FS_IOC32_SETFLAGS:
+ cmd = FS_IOC_SETFLAGS;
+ break;
+ default:
+ return (-ENOTTY);
+ }
+ return (zpl_ioctl(filp, cmd, (unsigned long)compat_ptr(arg)));
+}
+#endif /* CONFIG_COMPAT */
+
+
+const struct address_space_operations zpl_address_space_operations = {
+ .readpages = zpl_readpages,
+ .readpage = zpl_readpage,
+ .writepage = zpl_writepage,
+ .writepages = zpl_writepages,
+ .direct_IO = zpl_direct_IO,
+};
+
+const struct file_operations zpl_file_operations = {
+ .open = zpl_open,
+ .release = zpl_release,
+ .llseek = zpl_llseek,
+#ifdef HAVE_VFS_RW_ITERATE
+#ifdef HAVE_NEW_SYNC_READ
+ .read = new_sync_read,
+ .write = new_sync_write,
+#endif
+ .read_iter = zpl_iter_read,
+ .write_iter = zpl_iter_write,
+#else
+ .read = do_sync_read,
+ .write = do_sync_write,
+ .aio_read = zpl_aio_read,
+ .aio_write = zpl_aio_write,
+#endif
+ .mmap = zpl_mmap,
+ .fsync = zpl_fsync,
+#ifdef HAVE_FILE_AIO_FSYNC
+ .aio_fsync = zpl_aio_fsync,
+#endif
+#ifdef HAVE_FILE_FALLOCATE
+ .fallocate = zpl_fallocate,
+#endif /* HAVE_FILE_FALLOCATE */
+ .unlocked_ioctl = zpl_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = zpl_compat_ioctl,
+#endif
+};
+
+const struct file_operations zpl_dir_file_operations = {
+ .llseek = generic_file_llseek,
+ .read = generic_read_dir,
+#if defined(HAVE_VFS_ITERATE_SHARED)
+ .iterate_shared = zpl_iterate,
+#elif defined(HAVE_VFS_ITERATE)
+ .iterate = zpl_iterate,
+#else
+ .readdir = zpl_readdir,
+#endif
+ .fsync = zpl_fsync,
+ .unlocked_ioctl = zpl_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = zpl_compat_ioctl,
+#endif
+};
diff --git a/module/os/linux/zfs/zpl_inode.c b/module/os/linux/zfs/zpl_inode.c
new file mode 100644
index 000000000..3f3b2e2dc
--- /dev/null
+++ b/module/os/linux/zfs/zpl_inode.c
@@ -0,0 +1,826 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2011, Lawrence Livermore National Security, LLC.
+ * Copyright (c) 2015 by Chunwei Chen. All rights reserved.
+ */
+
+
+#include <sys/zfs_ctldir.h>
+#include <sys/zfs_vfsops.h>
+#include <sys/zfs_vnops.h>
+#include <sys/zfs_znode.h>
+#include <sys/dmu_objset.h>
+#include <sys/vfs.h>
+#include <sys/zpl.h>
+#include <sys/file.h>
+
+
+static struct dentry *
+#ifdef HAVE_LOOKUP_NAMEIDATA
+zpl_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
+#else
+zpl_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
+#endif
+{
+ cred_t *cr = CRED();
+ struct inode *ip;
+ int error;
+ fstrans_cookie_t cookie;
+ pathname_t *ppn = NULL;
+ pathname_t pn;
+ int zfs_flags = 0;
+ zfsvfs_t *zfsvfs = dentry->d_sb->s_fs_info;
+
+ if (dlen(dentry) >= ZAP_MAXNAMELEN)
+ return (ERR_PTR(-ENAMETOOLONG));
+
+ crhold(cr);
+ cookie = spl_fstrans_mark();
+
+ /* If we are a case insensitive fs, we need the real name */
+ if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) {
+ zfs_flags = FIGNORECASE;
+ pn_alloc(&pn);
+ ppn = &pn;
+ }
+
+ error = -zfs_lookup(dir, dname(dentry), &ip, zfs_flags, cr, NULL, ppn);
+ spl_fstrans_unmark(cookie);
+ ASSERT3S(error, <=, 0);
+ crfree(cr);
+
+ spin_lock(&dentry->d_lock);
+ dentry->d_time = jiffies;
+#ifndef HAVE_S_D_OP
+ d_set_d_op(dentry, &zpl_dentry_operations);
+#endif /* HAVE_S_D_OP */
+ spin_unlock(&dentry->d_lock);
+
+ if (error) {
+		/*
+		 * On a case insensitive fs we do not want to insert
+		 * negative entries, so return NULL for ENOENT.  Fall
+		 * through if the error is not ENOENT, freeing the
+		 * real-name buffer either way.
+		 */
+ if (ppn) {
+ pn_free(ppn);
+ if (error == -ENOENT)
+ return (NULL);
+ }
+
+ if (error == -ENOENT)
+ return (d_splice_alias(NULL, dentry));
+ else
+ return (ERR_PTR(error));
+ }
+
+ /*
+ * If we are case insensitive, call the correct function
+ * to install the name.
+ */
+ if (ppn) {
+ struct dentry *new_dentry;
+ struct qstr ci_name;
+
+ if (strcmp(dname(dentry), pn.pn_buf) == 0) {
+ new_dentry = d_splice_alias(ip, dentry);
+ } else {
+ ci_name.name = pn.pn_buf;
+ ci_name.len = strlen(pn.pn_buf);
+ new_dentry = d_add_ci(dentry, ip, &ci_name);
+ }
+ pn_free(ppn);
+ return (new_dentry);
+ } else {
+ return (d_splice_alias(ip, dentry));
+ }
+}
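+
+/*
+ * Illustrative sketch (not part of this change): on a dataset with
+ * casesensitivity=insensitive, looking up "README.TXT" when the on-disk
+ * entry is "readme.txt" takes the d_add_ci() path above, roughly:
+ *
+ *	struct qstr ci_name = {
+ *		.name = pn.pn_buf,	("readme.txt", the real name)
+ *		.len = strlen(pn.pn_buf),
+ *	};
+ *	new_dentry = d_add_ci(dentry, ip, &ci_name);
+ *
+ * so the dentry cache is populated under the real on-disk spelling
+ * rather than the caller-supplied one.
+ */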
+
+void
+zpl_vap_init(vattr_t *vap, struct inode *dir, zpl_umode_t mode, cred_t *cr)
+{
+ vap->va_mask = ATTR_MODE;
+ vap->va_mode = mode;
+ vap->va_uid = crgetfsuid(cr);
+
+ if (dir && dir->i_mode & S_ISGID) {
+ vap->va_gid = KGID_TO_SGID(dir->i_gid);
+ if (S_ISDIR(mode))
+ vap->va_mode |= S_ISGID;
+ } else {
+ vap->va_gid = crgetfsgid(cr);
+ }
+}
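+
+/*
+ * Worked example for the setgid inheritance above (illustrative): with
+ * a parent directory of mode 02775 owned by group "staff", a new plain
+ * file gets va_gid = staff instead of the caller's fsgid, and a new
+ * subdirectory requested with mode 0755 additionally inherits S_ISGID,
+ * ending up as 02755.  Without the setgid bit on the parent, both fall
+ * through to va_gid = crgetfsgid(cr).
+ */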
+
+static int
+#ifdef HAVE_CREATE_NAMEIDATA
+zpl_create(struct inode *dir, struct dentry *dentry, zpl_umode_t mode,
+ struct nameidata *nd)
+#else
+zpl_create(struct inode *dir, struct dentry *dentry, zpl_umode_t mode,
+ bool flag)
+#endif
+{
+ cred_t *cr = CRED();
+ struct inode *ip;
+ vattr_t *vap;
+ int error;
+ fstrans_cookie_t cookie;
+
+ crhold(cr);
+ vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP);
+ zpl_vap_init(vap, dir, mode, cr);
+
+ cookie = spl_fstrans_mark();
+ error = -zfs_create(dir, dname(dentry), vap, 0, mode, &ip, cr, 0, NULL);
+ if (error == 0) {
+ d_instantiate(dentry, ip);
+
+ error = zpl_xattr_security_init(ip, dir, &dentry->d_name);
+ if (error == 0)
+ error = zpl_init_acl(ip, dir);
+
+ if (error)
+ (void) zfs_remove(dir, dname(dentry), cr, 0);
+ }
+
+ spl_fstrans_unmark(cookie);
+ kmem_free(vap, sizeof (vattr_t));
+ crfree(cr);
+ ASSERT3S(error, <=, 0);
+
+ return (error);
+}
+
+static int
+zpl_mknod(struct inode *dir, struct dentry *dentry, zpl_umode_t mode,
+ dev_t rdev)
+{
+ cred_t *cr = CRED();
+ struct inode *ip;
+ vattr_t *vap;
+ int error;
+ fstrans_cookie_t cookie;
+
+ /*
+ * We currently expect Linux to supply rdev=0 for all sockets
+ * and fifos, but we want to know if this behavior ever changes.
+ */
+ if (S_ISSOCK(mode) || S_ISFIFO(mode))
+ ASSERT(rdev == 0);
+
+ crhold(cr);
+ vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP);
+ zpl_vap_init(vap, dir, mode, cr);
+ vap->va_rdev = rdev;
+
+ cookie = spl_fstrans_mark();
+ error = -zfs_create(dir, dname(dentry), vap, 0, mode, &ip, cr, 0, NULL);
+ if (error == 0) {
+ d_instantiate(dentry, ip);
+
+ error = zpl_xattr_security_init(ip, dir, &dentry->d_name);
+ if (error == 0)
+ error = zpl_init_acl(ip, dir);
+
+ if (error)
+ (void) zfs_remove(dir, dname(dentry), cr, 0);
+ }
+
+ spl_fstrans_unmark(cookie);
+ kmem_free(vap, sizeof (vattr_t));
+ crfree(cr);
+ ASSERT3S(error, <=, 0);
+
+ return (error);
+}
+
+#ifdef HAVE_TMPFILE
+static int
+zpl_tmpfile(struct inode *dir, struct dentry *dentry, zpl_umode_t mode)
+{
+ cred_t *cr = CRED();
+ struct inode *ip;
+ vattr_t *vap;
+ int error;
+ fstrans_cookie_t cookie;
+
+ crhold(cr);
+ vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP);
+ zpl_vap_init(vap, dir, mode, cr);
+
+ cookie = spl_fstrans_mark();
+ error = -zfs_tmpfile(dir, vap, 0, mode, &ip, cr, 0, NULL);
+ if (error == 0) {
+		/* d_tmpfile() will call drop_nlink(), so set the link count first */
+ set_nlink(ip, 1);
+ d_tmpfile(dentry, ip);
+
+ error = zpl_xattr_security_init(ip, dir, &dentry->d_name);
+ if (error == 0)
+ error = zpl_init_acl(ip, dir);
+ /*
+		 * No need to handle an error here, the file is already
+		 * in the unlinked set.
+ */
+ }
+
+ spl_fstrans_unmark(cookie);
+ kmem_free(vap, sizeof (vattr_t));
+ crfree(cr);
+ ASSERT3S(error, <=, 0);
+
+ return (error);
+}
+#endif
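+
+/*
+ * Illustrative userspace sketch (not part of this change) of the
+ * O_TMPFILE path above; the mount point is hypothetical.  zfs_tmpfile()
+ * creates the node directly in the unlinked set and d_tmpfile() drops
+ * its link count to zero, so the file vanishes on close unless it is
+ * linked into the namespace first:
+ *
+ *	#include <fcntl.h>
+ *	#include <unistd.h>
+ *
+ *	int fd = open("/tank/fs", O_TMPFILE | O_RDWR, 0600);
+ *	write(fd, "scratch", 7);
+ *	linkat(fd, "", AT_FDCWD, "/tank/fs/file", AT_EMPTY_PATH);
+ *	close(fd);
+ *
+ * (linkat() with AT_EMPTY_PATH needs CAP_DAC_READ_SEARCH; linking
+ * /proc/self/fd/<fd> is the unprivileged alternative.)
+ */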
+
+static int
+zpl_unlink(struct inode *dir, struct dentry *dentry)
+{
+ cred_t *cr = CRED();
+ int error;
+ fstrans_cookie_t cookie;
+ zfsvfs_t *zfsvfs = dentry->d_sb->s_fs_info;
+
+ crhold(cr);
+ cookie = spl_fstrans_mark();
+ error = -zfs_remove(dir, dname(dentry), cr, 0);
+
+ /*
+ * For a CI FS we must invalidate the dentry to prevent the
+ * creation of negative entries.
+ */
+ if (error == 0 && zfsvfs->z_case == ZFS_CASE_INSENSITIVE)
+ d_invalidate(dentry);
+
+ spl_fstrans_unmark(cookie);
+ crfree(cr);
+ ASSERT3S(error, <=, 0);
+
+ return (error);
+}
+
+static int
+zpl_mkdir(struct inode *dir, struct dentry *dentry, zpl_umode_t mode)
+{
+ cred_t *cr = CRED();
+ vattr_t *vap;
+ struct inode *ip;
+ int error;
+ fstrans_cookie_t cookie;
+
+ crhold(cr);
+ vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP);
+ zpl_vap_init(vap, dir, mode | S_IFDIR, cr);
+
+ cookie = spl_fstrans_mark();
+ error = -zfs_mkdir(dir, dname(dentry), vap, &ip, cr, 0, NULL);
+ if (error == 0) {
+ d_instantiate(dentry, ip);
+
+ error = zpl_xattr_security_init(ip, dir, &dentry->d_name);
+ if (error == 0)
+ error = zpl_init_acl(ip, dir);
+
+ if (error)
+ (void) zfs_rmdir(dir, dname(dentry), NULL, cr, 0);
+ }
+
+ spl_fstrans_unmark(cookie);
+ kmem_free(vap, sizeof (vattr_t));
+ crfree(cr);
+ ASSERT3S(error, <=, 0);
+
+ return (error);
+}
+
+static int
+zpl_rmdir(struct inode *dir, struct dentry *dentry)
+{
+ cred_t *cr = CRED();
+ int error;
+ fstrans_cookie_t cookie;
+ zfsvfs_t *zfsvfs = dentry->d_sb->s_fs_info;
+
+ crhold(cr);
+ cookie = spl_fstrans_mark();
+ error = -zfs_rmdir(dir, dname(dentry), NULL, cr, 0);
+
+ /*
+ * For a CI FS we must invalidate the dentry to prevent the
+ * creation of negative entries.
+ */
+ if (error == 0 && zfsvfs->z_case == ZFS_CASE_INSENSITIVE)
+ d_invalidate(dentry);
+
+ spl_fstrans_unmark(cookie);
+ crfree(cr);
+ ASSERT3S(error, <=, 0);
+
+ return (error);
+}
+
+static int
+zpl_getattr_impl(const struct path *path, struct kstat *stat, u32 request_mask,
+ unsigned int query_flags)
+{
+ int error;
+ fstrans_cookie_t cookie;
+
+ cookie = spl_fstrans_mark();
+
+ /*
+ * XXX request_mask and query_flags currently ignored.
+ */
+
+ error = -zfs_getattr_fast(path->dentry->d_inode, stat);
+ spl_fstrans_unmark(cookie);
+ ASSERT3S(error, <=, 0);
+
+ return (error);
+}
+ZPL_GETATTR_WRAPPER(zpl_getattr);
+
+static int
+zpl_setattr(struct dentry *dentry, struct iattr *ia)
+{
+ struct inode *ip = dentry->d_inode;
+ cred_t *cr = CRED();
+ vattr_t *vap;
+ int error;
+ fstrans_cookie_t cookie;
+
+ error = setattr_prepare(dentry, ia);
+ if (error)
+ return (error);
+
+ crhold(cr);
+ vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP);
+ vap->va_mask = ia->ia_valid & ATTR_IATTR_MASK;
+ vap->va_mode = ia->ia_mode;
+ vap->va_uid = KUID_TO_SUID(ia->ia_uid);
+ vap->va_gid = KGID_TO_SGID(ia->ia_gid);
+ vap->va_size = ia->ia_size;
+ vap->va_atime = ia->ia_atime;
+ vap->va_mtime = ia->ia_mtime;
+ vap->va_ctime = ia->ia_ctime;
+
+ if (vap->va_mask & ATTR_ATIME) {
+ ip->i_atime = zpl_inode_timespec_trunc(ia->ia_atime,
+ ip->i_sb->s_time_gran);
+ }
+
+ cookie = spl_fstrans_mark();
+ error = -zfs_setattr(ip, vap, 0, cr);
+ if (!error && (ia->ia_valid & ATTR_MODE))
+ error = zpl_chmod_acl(ip);
+
+ spl_fstrans_unmark(cookie);
+ kmem_free(vap, sizeof (vattr_t));
+ crfree(cr);
+ ASSERT3S(error, <=, 0);
+
+ return (error);
+}
+
+static int
+zpl_rename2(struct inode *sdip, struct dentry *sdentry,
+ struct inode *tdip, struct dentry *tdentry, unsigned int flags)
+{
+ cred_t *cr = CRED();
+ int error;
+ fstrans_cookie_t cookie;
+
+ /* We don't have renameat2(2) support */
+ if (flags)
+ return (-EINVAL);
+
+ crhold(cr);
+ cookie = spl_fstrans_mark();
+ error = -zfs_rename(sdip, dname(sdentry), tdip, dname(tdentry), cr, 0);
+ spl_fstrans_unmark(cookie);
+ crfree(cr);
+ ASSERT3S(error, <=, 0);
+
+ return (error);
+}
+
+#ifndef HAVE_RENAME_WANTS_FLAGS
+static int
+zpl_rename(struct inode *sdip, struct dentry *sdentry,
+ struct inode *tdip, struct dentry *tdentry)
+{
+ return (zpl_rename2(sdip, sdentry, tdip, tdentry, 0));
+}
+#endif
+
+static int
+zpl_symlink(struct inode *dir, struct dentry *dentry, const char *name)
+{
+ cred_t *cr = CRED();
+ vattr_t *vap;
+ struct inode *ip;
+ int error;
+ fstrans_cookie_t cookie;
+
+ crhold(cr);
+ vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP);
+ zpl_vap_init(vap, dir, S_IFLNK | S_IRWXUGO, cr);
+
+ cookie = spl_fstrans_mark();
+ error = -zfs_symlink(dir, dname(dentry), vap, (char *)name, &ip, cr, 0);
+ if (error == 0) {
+ d_instantiate(dentry, ip);
+
+ error = zpl_xattr_security_init(ip, dir, &dentry->d_name);
+ if (error)
+ (void) zfs_remove(dir, dname(dentry), cr, 0);
+ }
+
+ spl_fstrans_unmark(cookie);
+ kmem_free(vap, sizeof (vattr_t));
+ crfree(cr);
+ ASSERT3S(error, <=, 0);
+
+ return (error);
+}
+
+#if defined(HAVE_PUT_LINK_COOKIE)
+static void
+zpl_put_link(struct inode *unused, void *cookie)
+{
+ kmem_free(cookie, MAXPATHLEN);
+}
+#elif defined(HAVE_PUT_LINK_NAMEIDATA)
+static void
+zpl_put_link(struct dentry *dentry, struct nameidata *nd, void *ptr)
+{
+ const char *link = nd_get_link(nd);
+
+ if (!IS_ERR(link))
+ kmem_free(link, MAXPATHLEN);
+}
+#elif defined(HAVE_PUT_LINK_DELAYED)
+static void
+zpl_put_link(void *ptr)
+{
+ kmem_free(ptr, MAXPATHLEN);
+}
+#endif
+
+static int
+zpl_get_link_common(struct dentry *dentry, struct inode *ip, char **link)
+{
+ fstrans_cookie_t cookie;
+ cred_t *cr = CRED();
+ struct iovec iov;
+ uio_t uio = { { 0 }, 0 };
+ int error;
+
+ crhold(cr);
+ *link = NULL;
+ iov.iov_len = MAXPATHLEN;
+ iov.iov_base = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
+
+ uio.uio_iov = &iov;
+ uio.uio_iovcnt = 1;
+ uio.uio_segflg = UIO_SYSSPACE;
+ uio.uio_resid = (MAXPATHLEN - 1);
+
+ cookie = spl_fstrans_mark();
+ error = -zfs_readlink(ip, &uio, cr);
+ spl_fstrans_unmark(cookie);
+ crfree(cr);
+
+ if (error)
+ kmem_free(iov.iov_base, MAXPATHLEN);
+ else
+ *link = iov.iov_base;
+
+ return (error);
+}
+
+#if defined(HAVE_GET_LINK_DELAYED)
+const char *
+zpl_get_link(struct dentry *dentry, struct inode *inode,
+ struct delayed_call *done)
+{
+ char *link = NULL;
+ int error;
+
+ if (!dentry)
+ return (ERR_PTR(-ECHILD));
+
+ error = zpl_get_link_common(dentry, inode, &link);
+ if (error)
+ return (ERR_PTR(error));
+
+ set_delayed_call(done, zpl_put_link, link);
+
+ return (link);
+}
+#elif defined(HAVE_GET_LINK_COOKIE)
+const char *
+zpl_get_link(struct dentry *dentry, struct inode *inode, void **cookie)
+{
+ char *link = NULL;
+ int error;
+
+ if (!dentry)
+ return (ERR_PTR(-ECHILD));
+
+ error = zpl_get_link_common(dentry, inode, &link);
+ if (error)
+ return (ERR_PTR(error));
+
+ return (*cookie = link);
+}
+#elif defined(HAVE_FOLLOW_LINK_COOKIE)
+const char *
+zpl_follow_link(struct dentry *dentry, void **cookie)
+{
+ char *link = NULL;
+ int error;
+
+ error = zpl_get_link_common(dentry, dentry->d_inode, &link);
+ if (error)
+ return (ERR_PTR(error));
+
+ return (*cookie = link);
+}
+#elif defined(HAVE_FOLLOW_LINK_NAMEIDATA)
+static void *
+zpl_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+ char *link = NULL;
+ int error;
+
+ error = zpl_get_link_common(dentry, dentry->d_inode, &link);
+ if (error)
+ nd_set_link(nd, ERR_PTR(error));
+ else
+ nd_set_link(nd, link);
+
+ return (NULL);
+}
+#endif
+
+static int
+zpl_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
+{
+ cred_t *cr = CRED();
+ struct inode *ip = old_dentry->d_inode;
+ int error;
+ fstrans_cookie_t cookie;
+
+ if (ip->i_nlink >= ZFS_LINK_MAX)
+ return (-EMLINK);
+
+ crhold(cr);
+ ip->i_ctime = current_time(ip);
+ igrab(ip); /* Use ihold() if available */
+
+ cookie = spl_fstrans_mark();
+ error = -zfs_link(dir, ip, dname(dentry), cr, 0);
+ if (error) {
+ iput(ip);
+ goto out;
+ }
+
+ d_instantiate(dentry, ip);
+out:
+ spl_fstrans_unmark(cookie);
+ crfree(cr);
+ ASSERT3S(error, <=, 0);
+
+ return (error);
+}
+
+#ifdef HAVE_INODE_TRUNCATE_RANGE
+static void
+zpl_truncate_range(struct inode *ip, loff_t start, loff_t end)
+{
+ cred_t *cr = CRED();
+ flock64_t bf;
+ fstrans_cookie_t cookie;
+
+ ASSERT3S(start, <=, end);
+
+ /*
+ * zfs_freesp() will interpret (len == 0) as meaning "truncate until
+ * the end of the file". We don't want that.
+ */
+ if (start == end)
+ return;
+
+ crhold(cr);
+
+ bf.l_type = F_WRLCK;
+ bf.l_whence = SEEK_SET;
+ bf.l_start = start;
+ bf.l_len = end - start;
+ bf.l_pid = 0;
+ cookie = spl_fstrans_mark();
+ zfs_space(ip, F_FREESP, &bf, FWRITE, start, cr);
+ spl_fstrans_unmark(cookie);
+
+ crfree(cr);
+}
+#endif /* HAVE_INODE_TRUNCATE_RANGE */
+
+#ifdef HAVE_INODE_FALLOCATE
+static long
+zpl_fallocate(struct inode *ip, int mode, loff_t offset, loff_t len)
+{
+ return (zpl_fallocate_common(ip, mode, offset, len));
+}
+#endif /* HAVE_INODE_FALLOCATE */
+
+static int
+#ifdef HAVE_D_REVALIDATE_NAMEIDATA
+zpl_revalidate(struct dentry *dentry, struct nameidata *nd)
+{
+ unsigned int flags = (nd ? nd->flags : 0);
+#else
+zpl_revalidate(struct dentry *dentry, unsigned int flags)
+{
+#endif /* HAVE_D_REVALIDATE_NAMEIDATA */
+ /* CSTYLED */
+ zfsvfs_t *zfsvfs = dentry->d_sb->s_fs_info;
+ int error;
+
+ if (flags & LOOKUP_RCU)
+ return (-ECHILD);
+
+ /*
+ * Automounted snapshots rely on periodic dentry revalidation
+ * to defer snapshots from being automatically unmounted.
+ */
+ if (zfsvfs->z_issnap) {
+ if (time_after(jiffies, zfsvfs->z_snap_defer_time +
+ MAX(zfs_expire_snapshot * HZ / 2, HZ))) {
+ zfsvfs->z_snap_defer_time = jiffies;
+ zfsctl_snapshot_unmount_delay(zfsvfs->z_os->os_spa,
+ dmu_objset_id(zfsvfs->z_os), zfs_expire_snapshot);
+ }
+ }
+
+ /*
+ * After a rollback negative dentries created before the rollback
+ * time must be invalidated. Otherwise they can obscure files which
+ * are only present in the rolled back dataset.
+ */
+ if (dentry->d_inode == NULL) {
+ spin_lock(&dentry->d_lock);
+ error = time_before(dentry->d_time, zfsvfs->z_rollback_time);
+ spin_unlock(&dentry->d_lock);
+
+ if (error)
+ return (0);
+ }
+
+ /*
+ * The dentry may reference a stale inode if a mounted file system
+ * was rolled back to a point in time where the object didn't exist.
+ */
+ if (dentry->d_inode && ITOZ(dentry->d_inode)->z_is_stale)
+ return (0);
+
+ return (1);
+}
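+
+/*
+ * Worked example for the rollback check above (illustrative): a failed
+ * lookup caches a negative dentry with d_time = 1000 jiffies, then the
+ * dataset is rolled back and z_rollback_time = 2000 is recorded.  On
+ * the next access time_before(1000, 2000) is true, zpl_revalidate()
+ * returns 0, and the VFS drops the stale negative entry and repeats
+ * the lookup, which may now find a file that exists only in the rolled
+ * back state.
+ */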
+
+const struct inode_operations zpl_inode_operations = {
+ .setattr = zpl_setattr,
+ .getattr = zpl_getattr,
+#ifdef HAVE_GENERIC_SETXATTR
+ .setxattr = generic_setxattr,
+ .getxattr = generic_getxattr,
+ .removexattr = generic_removexattr,
+#endif
+ .listxattr = zpl_xattr_list,
+#ifdef HAVE_INODE_TRUNCATE_RANGE
+ .truncate_range = zpl_truncate_range,
+#endif /* HAVE_INODE_TRUNCATE_RANGE */
+#ifdef HAVE_INODE_FALLOCATE
+ .fallocate = zpl_fallocate,
+#endif /* HAVE_INODE_FALLOCATE */
+#if defined(CONFIG_FS_POSIX_ACL)
+#if defined(HAVE_SET_ACL)
+ .set_acl = zpl_set_acl,
+#endif
+#if defined(HAVE_GET_ACL)
+ .get_acl = zpl_get_acl,
+#elif defined(HAVE_CHECK_ACL)
+ .check_acl = zpl_check_acl,
+#elif defined(HAVE_PERMISSION)
+ .permission = zpl_permission,
+#endif /* HAVE_GET_ACL | HAVE_CHECK_ACL | HAVE_PERMISSION */
+#endif /* CONFIG_FS_POSIX_ACL */
+};
+
+const struct inode_operations zpl_dir_inode_operations = {
+ .create = zpl_create,
+ .lookup = zpl_lookup,
+ .link = zpl_link,
+ .unlink = zpl_unlink,
+ .symlink = zpl_symlink,
+ .mkdir = zpl_mkdir,
+ .rmdir = zpl_rmdir,
+ .mknod = zpl_mknod,
+#ifdef HAVE_RENAME_WANTS_FLAGS
+ .rename = zpl_rename2,
+#else
+ .rename = zpl_rename,
+#endif
+#ifdef HAVE_TMPFILE
+ .tmpfile = zpl_tmpfile,
+#endif
+ .setattr = zpl_setattr,
+ .getattr = zpl_getattr,
+#ifdef HAVE_GENERIC_SETXATTR
+ .setxattr = generic_setxattr,
+ .getxattr = generic_getxattr,
+ .removexattr = generic_removexattr,
+#endif
+ .listxattr = zpl_xattr_list,
+#if defined(CONFIG_FS_POSIX_ACL)
+#if defined(HAVE_SET_ACL)
+ .set_acl = zpl_set_acl,
+#endif
+#if defined(HAVE_GET_ACL)
+ .get_acl = zpl_get_acl,
+#elif defined(HAVE_CHECK_ACL)
+ .check_acl = zpl_check_acl,
+#elif defined(HAVE_PERMISSION)
+ .permission = zpl_permission,
+#endif /* HAVE_GET_ACL | HAVE_CHECK_ACL | HAVE_PERMISSION */
+#endif /* CONFIG_FS_POSIX_ACL */
+};
+
+const struct inode_operations zpl_symlink_inode_operations = {
+#ifdef HAVE_GENERIC_READLINK
+ .readlink = generic_readlink,
+#endif
+#if defined(HAVE_GET_LINK_DELAYED) || defined(HAVE_GET_LINK_COOKIE)
+ .get_link = zpl_get_link,
+#elif defined(HAVE_FOLLOW_LINK_COOKIE) || defined(HAVE_FOLLOW_LINK_NAMEIDATA)
+ .follow_link = zpl_follow_link,
+#endif
+#if defined(HAVE_PUT_LINK_COOKIE) || defined(HAVE_PUT_LINK_NAMEIDATA)
+ .put_link = zpl_put_link,
+#endif
+ .setattr = zpl_setattr,
+ .getattr = zpl_getattr,
+#ifdef HAVE_GENERIC_SETXATTR
+ .setxattr = generic_setxattr,
+ .getxattr = generic_getxattr,
+ .removexattr = generic_removexattr,
+#endif
+ .listxattr = zpl_xattr_list,
+};
+
+const struct inode_operations zpl_special_inode_operations = {
+ .setattr = zpl_setattr,
+ .getattr = zpl_getattr,
+#ifdef HAVE_GENERIC_SETXATTR
+ .setxattr = generic_setxattr,
+ .getxattr = generic_getxattr,
+ .removexattr = generic_removexattr,
+#endif
+ .listxattr = zpl_xattr_list,
+#if defined(CONFIG_FS_POSIX_ACL)
+#if defined(HAVE_SET_ACL)
+ .set_acl = zpl_set_acl,
+#endif
+#if defined(HAVE_GET_ACL)
+ .get_acl = zpl_get_acl,
+#elif defined(HAVE_CHECK_ACL)
+ .check_acl = zpl_check_acl,
+#elif defined(HAVE_PERMISSION)
+ .permission = zpl_permission,
+#endif /* HAVE_GET_ACL | HAVE_CHECK_ACL | HAVE_PERMISSION */
+#endif /* CONFIG_FS_POSIX_ACL */
+};
+
+dentry_operations_t zpl_dentry_operations = {
+ .d_revalidate = zpl_revalidate,
+};
diff --git a/module/os/linux/zfs/zpl_super.c b/module/os/linux/zfs/zpl_super.c
new file mode 100644
index 000000000..810ab2898
--- /dev/null
+++ b/module/os/linux/zfs/zpl_super.c
@@ -0,0 +1,426 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2011, Lawrence Livermore National Security, LLC.
+ */
+
+
+#include <sys/zfs_vfsops.h>
+#include <sys/zfs_vnops.h>
+#include <sys/zfs_znode.h>
+#include <sys/zfs_ctldir.h>
+#include <sys/zpl.h>
+
+
+static struct inode *
+zpl_inode_alloc(struct super_block *sb)
+{
+ struct inode *ip;
+
+ VERIFY3S(zfs_inode_alloc(sb, &ip), ==, 0);
+ inode_set_iversion(ip, 1);
+
+ return (ip);
+}
+
+static void
+zpl_inode_destroy(struct inode *ip)
+{
+ ASSERT(atomic_read(&ip->i_count) == 0);
+ zfs_inode_destroy(ip);
+}
+
+/*
+ * Called from __mark_inode_dirty() to reflect that something in the
+ * inode has changed. We use it to ensure the znode system attributes
+ * are always kept strictly up to date with respect to the inode.
+ */
+#ifdef HAVE_DIRTY_INODE_WITH_FLAGS
+static void
+zpl_dirty_inode(struct inode *ip, int flags)
+{
+ fstrans_cookie_t cookie;
+
+ cookie = spl_fstrans_mark();
+ zfs_dirty_inode(ip, flags);
+ spl_fstrans_unmark(cookie);
+}
+#else
+static void
+zpl_dirty_inode(struct inode *ip)
+{
+ fstrans_cookie_t cookie;
+
+ cookie = spl_fstrans_mark();
+ zfs_dirty_inode(ip, 0);
+ spl_fstrans_unmark(cookie);
+}
+#endif /* HAVE_DIRTY_INODE_WITH_FLAGS */
+
+/*
+ * When ->drop_inode() is called its return value indicates if the
+ * inode should be evicted from the inode cache. If the inode is
+ * unhashed and has no links the default policy is to evict it
+ * immediately.
+ *
+ * Prior to 2.6.36 this eviction was accomplished by the vfs calling
+ * ->delete_inode(). It was ->delete_inode()'s responsibility to
+ * truncate the inode pages and call clear_inode(). The call to
+ * clear_inode() synchronously invalidates all the buffers and
+ * calls ->clear_inode(). It was ->clear_inode()'s responsibility
+ * to cleanup and filesystem specific data before freeing the inode.
+ *
+ * This elaborate mechanism was replaced by ->evict_inode() which
+ * does the job of both ->delete_inode() and ->clear_inode(). It
+ * will be called exactly once, and when it returns the inode must
+ * be in a state where it can simply be freed.
+ *
+ * The ->evict_inode() callback must minimally truncate the inode pages,
+ * and call clear_inode(). For 2.6.35 and later kernels this will
+ * simply update the inode state, with the sync occurring before the
+ * truncate in evict(). For earlier kernels clear_inode() maps to
+ * end_writeback() which is responsible for completing all outstanding
+ * write back.  In either case, once this is done it is safe to clean up
+ * any remaining filesystem specific data via zfs_inactive().
+ */
+#ifdef HAVE_EVICT_INODE
+static void
+zpl_evict_inode(struct inode *ip)
+{
+ fstrans_cookie_t cookie;
+
+ cookie = spl_fstrans_mark();
+ truncate_setsize(ip, 0);
+ clear_inode(ip);
+ zfs_inactive(ip);
+ spl_fstrans_unmark(cookie);
+}
+
+#else
+
+static void
+zpl_drop_inode(struct inode *ip)
+{
+ generic_delete_inode(ip);
+}
+
+static void
+zpl_clear_inode(struct inode *ip)
+{
+ fstrans_cookie_t cookie;
+
+ cookie = spl_fstrans_mark();
+ zfs_inactive(ip);
+ spl_fstrans_unmark(cookie);
+}
+
+static void
+zpl_inode_delete(struct inode *ip)
+{
+ truncate_setsize(ip, 0);
+ clear_inode(ip);
+}
+#endif /* HAVE_EVICT_INODE */
+
+static void
+zpl_put_super(struct super_block *sb)
+{
+ fstrans_cookie_t cookie;
+ int error;
+
+ cookie = spl_fstrans_mark();
+ error = -zfs_umount(sb);
+ spl_fstrans_unmark(cookie);
+ ASSERT3S(error, <=, 0);
+}
+
+static int
+zpl_sync_fs(struct super_block *sb, int wait)
+{
+ fstrans_cookie_t cookie;
+ cred_t *cr = CRED();
+ int error;
+
+ crhold(cr);
+ cookie = spl_fstrans_mark();
+ error = -zfs_sync(sb, wait, cr);
+ spl_fstrans_unmark(cookie);
+ crfree(cr);
+ ASSERT3S(error, <=, 0);
+
+ return (error);
+}
+
+static int
+zpl_statfs(struct dentry *dentry, struct kstatfs *statp)
+{
+ fstrans_cookie_t cookie;
+ int error;
+
+ cookie = spl_fstrans_mark();
+ error = -zfs_statvfs(dentry, statp);
+ spl_fstrans_unmark(cookie);
+ ASSERT3S(error, <=, 0);
+
+ /*
+ * If required by a 32-bit system call, dynamically scale the
+ * block size up to 16MiB and decrease the block counts. This
+	 * allows a maximum size of 64PiB (2^32-1 blocks of 16MiB) to
+	 * be reported.  The file counts must be artificially capped
+	 * at 2^32-1.
+ */
+ if (unlikely(zpl_is_32bit_api())) {
+ while (statp->f_blocks > UINT32_MAX &&
+ statp->f_bsize < SPA_MAXBLOCKSIZE) {
+ statp->f_frsize <<= 1;
+ statp->f_bsize <<= 1;
+
+ statp->f_blocks >>= 1;
+ statp->f_bfree >>= 1;
+ statp->f_bavail >>= 1;
+ }
+
+ uint64_t usedobjs = statp->f_files - statp->f_ffree;
+ statp->f_ffree = MIN(statp->f_ffree, UINT32_MAX - usedobjs);
+ statp->f_files = statp->f_ffree + usedobjs;
+ }
+
+ return (error);
+}
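+
+/*
+ * Worked example for the 32-bit scaling above (illustrative): a 1 PiB
+ * pool reported with f_bsize = 4 KiB has f_blocks = 2^38, which does
+ * not fit in a u32.  Seven iterations double the block size to 512 KiB
+ * and shift f_blocks down to 2^31, which fits.  The ceiling of the
+ * scheme is UINT32_MAX blocks of SPA_MAXBLOCKSIZE (16 MiB), roughly
+ * 2^56 bytes.
+ */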
+
+static int
+zpl_remount_fs(struct super_block *sb, int *flags, char *data)
+{
+ zfs_mnt_t zm = { .mnt_osname = NULL, .mnt_data = data };
+ fstrans_cookie_t cookie;
+ int error;
+
+ cookie = spl_fstrans_mark();
+ error = -zfs_remount(sb, flags, &zm);
+ spl_fstrans_unmark(cookie);
+ ASSERT3S(error, <=, 0);
+
+ return (error);
+}
+
+static int
+__zpl_show_options(struct seq_file *seq, zfsvfs_t *zfsvfs)
+{
+ seq_printf(seq, ",%s",
+ zfsvfs->z_flags & ZSB_XATTR ? "xattr" : "noxattr");
+
+#ifdef CONFIG_FS_POSIX_ACL
+ switch (zfsvfs->z_acl_type) {
+ case ZFS_ACLTYPE_POSIXACL:
+ seq_puts(seq, ",posixacl");
+ break;
+ default:
+ seq_puts(seq, ",noacl");
+ break;
+ }
+#endif /* CONFIG_FS_POSIX_ACL */
+
+ return (0);
+}
+
+#ifdef HAVE_SHOW_OPTIONS_WITH_DENTRY
+static int
+zpl_show_options(struct seq_file *seq, struct dentry *root)
+{
+ return (__zpl_show_options(seq, root->d_sb->s_fs_info));
+}
+#else
+static int
+zpl_show_options(struct seq_file *seq, struct vfsmount *vfsp)
+{
+ return (__zpl_show_options(seq, vfsp->mnt_sb->s_fs_info));
+}
+#endif /* HAVE_SHOW_OPTIONS_WITH_DENTRY */
+
+static int
+zpl_fill_super(struct super_block *sb, void *data, int silent)
+{
+ zfs_mnt_t *zm = (zfs_mnt_t *)data;
+ fstrans_cookie_t cookie;
+ int error;
+
+ cookie = spl_fstrans_mark();
+ error = -zfs_domount(sb, zm, silent);
+ spl_fstrans_unmark(cookie);
+ ASSERT3S(error, <=, 0);
+
+ return (error);
+}
+
+static int
+zpl_test_super(struct super_block *s, void *data)
+{
+ zfsvfs_t *zfsvfs = s->s_fs_info;
+ objset_t *os = data;
+
+ if (zfsvfs == NULL)
+ return (0);
+
+ return (os == zfsvfs->z_os);
+}
+
+static struct super_block *
+zpl_mount_impl(struct file_system_type *fs_type, int flags, zfs_mnt_t *zm)
+{
+ struct super_block *s;
+ objset_t *os;
+ int err;
+
+ err = dmu_objset_hold(zm->mnt_osname, FTAG, &os);
+ if (err)
+ return (ERR_PTR(-err));
+
+ /*
+ * The dsl pool lock must be released prior to calling sget().
+ * It is possible sget() may block on the lock in grab_super()
+ * while deactivate_super() holds that same lock and waits for
+ * a txg sync. If the dsl_pool lock is held over sget()
+ * this can prevent the pool sync and cause a deadlock.
+ */
+ dsl_pool_rele(dmu_objset_pool(os), FTAG);
+ s = zpl_sget(fs_type, zpl_test_super, set_anon_super, flags, os);
+ dsl_dataset_rele(dmu_objset_ds(os), FTAG);
+
+ if (IS_ERR(s))
+ return (ERR_CAST(s));
+
+ if (s->s_root == NULL) {
+ err = zpl_fill_super(s, zm, flags & SB_SILENT ? 1 : 0);
+ if (err) {
+ deactivate_locked_super(s);
+ return (ERR_PTR(err));
+ }
+ s->s_flags |= SB_ACTIVE;
+ } else if ((flags ^ s->s_flags) & SB_RDONLY) {
+ deactivate_locked_super(s);
+ return (ERR_PTR(-EBUSY));
+ }
+
+ return (s);
+}
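+
+/*
+ * Illustrative consequence of the sget() handling above (not part of
+ * this change): a second mount of an already-mounted objset reuses the
+ * live superblock (s->s_root != NULL), so with tank/fs mounted
+ * read-write,
+ *
+ *	mount("tank/fs", "/mnt/b", "zfs", MS_RDONLY, NULL);
+ *
+ * fails with EBUSY because the requested SB_RDONLY state disagrees
+ * with that of the existing superblock.
+ */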
+
+#ifdef HAVE_FST_MOUNT
+static struct dentry *
+zpl_mount(struct file_system_type *fs_type, int flags,
+ const char *osname, void *data)
+{
+ zfs_mnt_t zm = { .mnt_osname = osname, .mnt_data = data };
+
+ struct super_block *sb = zpl_mount_impl(fs_type, flags, &zm);
+ if (IS_ERR(sb))
+ return (ERR_CAST(sb));
+
+ return (dget(sb->s_root));
+}
+#else
+static int
+zpl_get_sb(struct file_system_type *fs_type, int flags,
+ const char *osname, void *data, struct vfsmount *mnt)
+{
+ zfs_mnt_t zm = { .mnt_osname = osname, .mnt_data = data };
+
+ struct super_block *sb = zpl_mount_impl(fs_type, flags, &zm);
+ if (IS_ERR(sb))
+ return (PTR_ERR(sb));
+
+ (void) simple_set_mnt(mnt, sb);
+
+ return (0);
+}
+#endif /* HAVE_FST_MOUNT */
+
+static void
+zpl_kill_sb(struct super_block *sb)
+{
+ zfs_preumount(sb);
+ kill_anon_super(sb);
+
+#ifdef HAVE_S_INSTANCES_LIST_HEAD
+ sb->s_instances.next = &(zpl_fs_type.fs_supers);
+#endif /* HAVE_S_INSTANCES_LIST_HEAD */
+}
+
+void
+zpl_prune_sb(int64_t nr_to_scan, void *arg)
+{
+ struct super_block *sb = (struct super_block *)arg;
+ int objects = 0;
+
+ (void) -zfs_prune(sb, nr_to_scan, &objects);
+}
+
+#ifdef HAVE_NR_CACHED_OBJECTS
+static int
+zpl_nr_cached_objects(struct super_block *sb)
+{
+ return (0);
+}
+#endif /* HAVE_NR_CACHED_OBJECTS */
+
+#ifdef HAVE_FREE_CACHED_OBJECTS
+static void
+zpl_free_cached_objects(struct super_block *sb, int nr_to_scan)
+{
+ /* noop */
+}
+#endif /* HAVE_FREE_CACHED_OBJECTS */
+
+const struct super_operations zpl_super_operations = {
+ .alloc_inode = zpl_inode_alloc,
+ .destroy_inode = zpl_inode_destroy,
+ .dirty_inode = zpl_dirty_inode,
+ .write_inode = NULL,
+#ifdef HAVE_EVICT_INODE
+ .evict_inode = zpl_evict_inode,
+#else
+ .drop_inode = zpl_drop_inode,
+ .clear_inode = zpl_clear_inode,
+ .delete_inode = zpl_inode_delete,
+#endif /* HAVE_EVICT_INODE */
+ .put_super = zpl_put_super,
+ .sync_fs = zpl_sync_fs,
+ .statfs = zpl_statfs,
+ .remount_fs = zpl_remount_fs,
+ .show_options = zpl_show_options,
+ .show_stats = NULL,
+#ifdef HAVE_NR_CACHED_OBJECTS
+ .nr_cached_objects = zpl_nr_cached_objects,
+#endif /* HAVE_NR_CACHED_OBJECTS */
+#ifdef HAVE_FREE_CACHED_OBJECTS
+ .free_cached_objects = zpl_free_cached_objects,
+#endif /* HAVE_FREE_CACHED_OBJECTS */
+};
+
+struct file_system_type zpl_fs_type = {
+ .owner = THIS_MODULE,
+ .name = ZFS_DRIVER,
+#ifdef HAVE_FST_MOUNT
+ .mount = zpl_mount,
+#else
+ .get_sb = zpl_get_sb,
+#endif /* HAVE_FST_MOUNT */
+ .kill_sb = zpl_kill_sb,
+};
diff --git a/module/os/linux/zfs/zpl_xattr.c b/module/os/linux/zfs/zpl_xattr.c
new file mode 100644
index 000000000..95523f28e
--- /dev/null
+++ b/module/os/linux/zfs/zpl_xattr.c
@@ -0,0 +1,1548 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2011, Lawrence Livermore National Security, LLC.
+ *
+ * Extended attributes (xattr) on Solaris are implemented as files
+ * which exist in a hidden xattr directory. These extended attributes
+ * can be accessed using the attropen() system call which opens
+ * the extended attribute. It can then be manipulated just like
+ * a standard file descriptor. This has a couple advantages such
+ * as practically no size limit on the file, and the extended
+ * attributes permissions may differ from those of the parent file.
+ * This interface is really quite clever, but it's also completely
+ * different than what is supported on Linux. It also comes with a
+ * steep performance penalty when accessing small xattrs because they
+ * are not stored with the parent file.
+ *
+ * Under Linux extended attributes are manipulated by the system
+ * calls getxattr(2), setxattr(2), and listxattr(2). They consider
+ * extended attributes to be name/value pairs where the name is a
+ * NULL terminated string. The name must also include one of the
+ * following namespace prefixes:
+ *
+ * user - No restrictions and is available to user applications.
+ * trusted - Restricted to kernel and root (CAP_SYS_ADMIN) use.
+ * system - Used for access control lists (system.nfs4_acl, etc).
+ * security - Used by SELinux to store a files security context.
+ *
+ * The value under Linux is limited to 65536 bytes of binary data.
+ * In practice, individual xattrs tend to be much smaller than this
+ * and are typically less than 100 bytes. A good example of this
+ * are the security.selinux xattrs which are less than 100 bytes and
+ * exist for every file when xattr labeling is enabled.
+ *
+ * The Linux xattr implementation has been written to take advantage of
+ * this typical usage. When the dataset property 'xattr=sa' is set,
+ * then xattrs will be preferentially stored as System Attributes (SA).
+ * This allows tiny xattrs (~100 bytes) to be stored with the dnode and
+ * up to 64k of xattrs to be stored in the spill block. If additional
+ * xattr space is required, which is unlikely under Linux, they will
+ * be stored using the traditional directory approach.
+ *
+ * This optimization results in roughly a 3x performance improvement
+ * when accessing xattrs because it avoids the need to perform a seek
+ * for every xattr value. When multiple xattrs are stored per-file
+ * the performance improvements are even greater because all of the
+ * xattrs stored in the spill block will be cached.
+ *
+ * However, by default SA based xattrs are disabled in the Linux port
+ * to maximize compatibility with other implementations. If you do
+ * enable SA based xattrs then they will not be visible on platforms
+ * which do not support this feature.
+ *
+ * NOTE: One additional consequence of the xattr directory implementation
+ * is that when an extended attribute is manipulated an inode is created.
+ * This inode will exist in the Linux inode cache but there will be no
+ * associated entry in the dentry cache which references it. This is
+ * safe but it may result in some confusion. Enabling SA based xattrs
+ * largely avoids the issue except in the overflow case.
+ */
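+
+/*
+ * Illustrative userspace sketch (not part of this change) of the Linux
+ * interface described above; the path and attribute are hypothetical:
+ *
+ *	#include <sys/xattr.h>
+ *
+ *	char buf[64];
+ *	setxattr("/tank/fs/file", "user.mime_type", "text/plain", 10, 0);
+ *	ssize_t n = getxattr("/tank/fs/file", "user.mime_type",
+ *	    buf, sizeof (buf));
+ *
+ * A name without a recognized namespace prefix fails with EOPNOTSUPP,
+ * and "user." attributes additionally require the dataset property
+ * xattr to be "on" (directory based) or "sa".
+ */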
+
+#include <sys/zfs_vfsops.h>
+#include <sys/zfs_vnops.h>
+#include <sys/zfs_znode.h>
+#include <sys/zap.h>
+#include <sys/vfs.h>
+#include <sys/zpl.h>
+
+typedef struct xattr_filldir {
+ size_t size;
+ size_t offset;
+ char *buf;
+ struct dentry *dentry;
+} xattr_filldir_t;
+
+static const struct xattr_handler *zpl_xattr_handler(const char *);
+
+static int
+zpl_xattr_permission(xattr_filldir_t *xf, const char *name, int name_len)
+{
+	const struct xattr_handler *handler;
+ struct dentry *d = xf->dentry;
+
+ handler = zpl_xattr_handler(name);
+ if (!handler)
+ return (0);
+
+ if (handler->list) {
+#if defined(HAVE_XATTR_LIST_SIMPLE)
+ if (!handler->list(d))
+ return (0);
+#elif defined(HAVE_XATTR_LIST_DENTRY)
+ if (!handler->list(d, NULL, 0, name, name_len, 0))
+ return (0);
+#elif defined(HAVE_XATTR_LIST_HANDLER)
+ if (!handler->list(handler, d, NULL, 0, name, name_len))
+ return (0);
+#elif defined(HAVE_XATTR_LIST_INODE)
+ if (!handler->list(d->d_inode, NULL, 0, name, name_len))
+ return (0);
+#endif
+ }
+
+ return (1);
+}
+
+/*
+ * Determine if a given xattr name should be visible and, if so, copy it
+ * into the provided buffer (xf->buf).
+ */
+static int
+zpl_xattr_filldir(xattr_filldir_t *xf, const char *name, int name_len)
+{
+ /* Check permissions using the per-namespace list xattr handler. */
+ if (!zpl_xattr_permission(xf, name, name_len))
+ return (0);
+
+ /* When xf->buf is NULL only calculate the required size. */
+ if (xf->buf) {
+ if (xf->offset + name_len + 1 > xf->size)
+ return (-ERANGE);
+
+ memcpy(xf->buf + xf->offset, name, name_len);
+ xf->buf[xf->offset + name_len] = '\0';
+ }
+
+ xf->offset += (name_len + 1);
+
+ return (0);
+}
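+
+/*
+ * The NULL-buffer convention above mirrors the two-call protocol of
+ * listxattr(2).  Illustrative sketch (not part of this change):
+ *
+ *	ssize_t len = listxattr("/tank/fs/file", NULL, 0);
+ *	char *names = malloc(len);
+ *	len = listxattr("/tank/fs/file", names, len);
+ *
+ * The first call sizes the list of null-terminated names, the second
+ * fills it; if the list grew in between the call fails with ERANGE
+ * and the caller simply retries.
+ */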
+
+/*
+ * Read as many directory entry names as will fit into the provided buffer,
+ * or when no buffer is provided calculate the required buffer size.
+ */
+int
+zpl_xattr_readdir(struct inode *dxip, xattr_filldir_t *xf)
+{
+ zap_cursor_t zc;
+ zap_attribute_t zap;
+ int error;
+
+ zap_cursor_init(&zc, ITOZSB(dxip)->z_os, ITOZ(dxip)->z_id);
+
+ while ((error = -zap_cursor_retrieve(&zc, &zap)) == 0) {
+
+ if (zap.za_integer_length != 8 || zap.za_num_integers != 1) {
+ error = -ENXIO;
+ break;
+ }
+
+ error = zpl_xattr_filldir(xf, zap.za_name, strlen(zap.za_name));
+ if (error)
+ break;
+
+ zap_cursor_advance(&zc);
+ }
+
+ zap_cursor_fini(&zc);
+
+ if (error == -ENOENT)
+ error = 0;
+
+ return (error);
+}
+
+static ssize_t
+zpl_xattr_list_dir(xattr_filldir_t *xf, cred_t *cr)
+{
+ struct inode *ip = xf->dentry->d_inode;
+ struct inode *dxip = NULL;
+ int error;
+
+ /* Lookup the xattr directory */
+ error = -zfs_lookup(ip, NULL, &dxip, LOOKUP_XATTR, cr, NULL, NULL);
+ if (error) {
+ if (error == -ENOENT)
+ error = 0;
+
+ return (error);
+ }
+
+ error = zpl_xattr_readdir(dxip, xf);
+ iput(dxip);
+
+ return (error);
+}
+
+static ssize_t
+zpl_xattr_list_sa(xattr_filldir_t *xf)
+{
+ znode_t *zp = ITOZ(xf->dentry->d_inode);
+ nvpair_t *nvp = NULL;
+ int error = 0;
+
+ mutex_enter(&zp->z_lock);
+ if (zp->z_xattr_cached == NULL)
+ error = -zfs_sa_get_xattr(zp);
+ mutex_exit(&zp->z_lock);
+
+ if (error)
+ return (error);
+
+ ASSERT(zp->z_xattr_cached);
+
+ while ((nvp = nvlist_next_nvpair(zp->z_xattr_cached, nvp)) != NULL) {
+ ASSERT3U(nvpair_type(nvp), ==, DATA_TYPE_BYTE_ARRAY);
+
+ error = zpl_xattr_filldir(xf, nvpair_name(nvp),
+ strlen(nvpair_name(nvp)));
+ if (error)
+ return (error);
+ }
+
+ return (0);
+}
+
+ssize_t
+zpl_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size)
+{
+ znode_t *zp = ITOZ(dentry->d_inode);
+ zfsvfs_t *zfsvfs = ZTOZSB(zp);
+ xattr_filldir_t xf = { buffer_size, 0, buffer, dentry };
+ cred_t *cr = CRED();
+ fstrans_cookie_t cookie;
+ int error = 0;
+
+ crhold(cr);
+ cookie = spl_fstrans_mark();
+ ZPL_ENTER(zfsvfs);
+ ZPL_VERIFY_ZP(zp);
+ rw_enter(&zp->z_xattr_lock, RW_READER);
+
+ if (zfsvfs->z_use_sa && zp->z_is_sa) {
+ error = zpl_xattr_list_sa(&xf);
+ if (error)
+ goto out;
+ }
+
+ error = zpl_xattr_list_dir(&xf, cr);
+ if (error)
+ goto out;
+
+ error = xf.offset;
+out:
+
+ rw_exit(&zp->z_xattr_lock);
+ ZPL_EXIT(zfsvfs);
+ spl_fstrans_unmark(cookie);
+ crfree(cr);
+
+ return (error);
+}
+
+static int
+zpl_xattr_get_dir(struct inode *ip, const char *name, void *value,
+ size_t size, cred_t *cr)
+{
+ struct inode *dxip = NULL;
+ struct inode *xip = NULL;
+ loff_t pos = 0;
+ int error;
+
+ /* Lookup the xattr directory */
+ error = -zfs_lookup(ip, NULL, &dxip, LOOKUP_XATTR, cr, NULL, NULL);
+ if (error)
+ goto out;
+
+ /* Lookup a specific xattr name in the directory */
+ error = -zfs_lookup(dxip, (char *)name, &xip, 0, cr, NULL, NULL);
+ if (error)
+ goto out;
+
+ if (!size) {
+ error = i_size_read(xip);
+ goto out;
+ }
+
+ if (size < i_size_read(xip)) {
+ error = -ERANGE;
+ goto out;
+ }
+
+ error = zpl_read_common(xip, value, size, &pos, UIO_SYSSPACE, 0, cr);
+out:
+ if (xip)
+ iput(xip);
+
+ if (dxip)
+ iput(dxip);
+
+ return (error);
+}
+
+static int
+zpl_xattr_get_sa(struct inode *ip, const char *name, void *value, size_t size)
+{
+ znode_t *zp = ITOZ(ip);
+ uchar_t *nv_value;
+ uint_t nv_size;
+ int error = 0;
+
+ ASSERT(RW_LOCK_HELD(&zp->z_xattr_lock));
+
+ mutex_enter(&zp->z_lock);
+ if (zp->z_xattr_cached == NULL)
+ error = -zfs_sa_get_xattr(zp);
+ mutex_exit(&zp->z_lock);
+
+ if (error)
+ return (error);
+
+ ASSERT(zp->z_xattr_cached);
+ error = -nvlist_lookup_byte_array(zp->z_xattr_cached, name,
+ &nv_value, &nv_size);
+ if (error)
+ return (error);
+
+ if (size == 0 || value == NULL)
+ return (nv_size);
+
+ if (size < nv_size)
+ return (-ERANGE);
+
+ memcpy(value, nv_value, nv_size);
+
+ return (nv_size);
+}
+
+static int
+__zpl_xattr_get(struct inode *ip, const char *name, void *value, size_t size,
+ cred_t *cr)
+{
+ znode_t *zp = ITOZ(ip);
+ zfsvfs_t *zfsvfs = ZTOZSB(zp);
+ int error;
+
+ ASSERT(RW_LOCK_HELD(&zp->z_xattr_lock));
+
+ if (zfsvfs->z_use_sa && zp->z_is_sa) {
+ error = zpl_xattr_get_sa(ip, name, value, size);
+ if (error != -ENOENT)
+ goto out;
+ }
+
+ error = zpl_xattr_get_dir(ip, name, value, size, cr);
+out:
+ if (error == -ENOENT)
+ error = -ENODATA;
+
+ return (error);
+}
+
+#define XATTR_NOENT 0x0
+#define XATTR_IN_SA 0x1
+#define XATTR_IN_DIR 0x2
+/* check where the xattr resides */
+static int
+__zpl_xattr_where(struct inode *ip, const char *name, int *where, cred_t *cr)
+{
+ znode_t *zp = ITOZ(ip);
+ zfsvfs_t *zfsvfs = ZTOZSB(zp);
+ int error;
+
+ ASSERT(where);
+ ASSERT(RW_LOCK_HELD(&zp->z_xattr_lock));
+
+ *where = XATTR_NOENT;
+ if (zfsvfs->z_use_sa && zp->z_is_sa) {
+ error = zpl_xattr_get_sa(ip, name, NULL, 0);
+ if (error >= 0)
+ *where |= XATTR_IN_SA;
+ else if (error != -ENOENT)
+ return (error);
+ }
+
+ error = zpl_xattr_get_dir(ip, name, NULL, 0, cr);
+ if (error >= 0)
+ *where |= XATTR_IN_DIR;
+ else if (error != -ENOENT)
+ return (error);
+
+ if (*where == (XATTR_IN_SA|XATTR_IN_DIR))
+ cmn_err(CE_WARN, "ZFS: inode %p has xattr \"%s\""
+ " in both SA and dir", ip, name);
+ if (*where == XATTR_NOENT)
+ error = -ENODATA;
+ else
+ error = 0;
+ return (error);
+}
+
+static int
+zpl_xattr_get(struct inode *ip, const char *name, void *value, size_t size)
+{
+ znode_t *zp = ITOZ(ip);
+ zfsvfs_t *zfsvfs = ZTOZSB(zp);
+ cred_t *cr = CRED();
+ fstrans_cookie_t cookie;
+ int error;
+
+ crhold(cr);
+ cookie = spl_fstrans_mark();
+ ZPL_ENTER(zfsvfs);
+ ZPL_VERIFY_ZP(zp);
+ rw_enter(&zp->z_xattr_lock, RW_READER);
+ error = __zpl_xattr_get(ip, name, value, size, cr);
+ rw_exit(&zp->z_xattr_lock);
+ ZPL_EXIT(zfsvfs);
+ spl_fstrans_unmark(cookie);
+ crfree(cr);
+
+ return (error);
+}
+
+static int
+zpl_xattr_set_dir(struct inode *ip, const char *name, const void *value,
+ size_t size, int flags, cred_t *cr)
+{
+ struct inode *dxip = NULL;
+ struct inode *xip = NULL;
+ vattr_t *vap = NULL;
+ ssize_t wrote;
+ int lookup_flags, error;
+ const int xattr_mode = S_IFREG | 0644;
+ loff_t pos = 0;
+
+ /*
+ * Lookup the xattr directory. When we're adding an entry pass
+ * CREATE_XATTR_DIR to ensure the xattr directory is created.
+ * When removing an entry this flag is not passed to avoid
+ * unnecessarily creating a new xattr directory.
+ */
+ lookup_flags = LOOKUP_XATTR;
+ if (value != NULL)
+ lookup_flags |= CREATE_XATTR_DIR;
+
+ error = -zfs_lookup(ip, NULL, &dxip, lookup_flags, cr, NULL, NULL);
+ if (error)
+ goto out;
+
+ /* Lookup a specific xattr name in the directory */
+ error = -zfs_lookup(dxip, (char *)name, &xip, 0, cr, NULL, NULL);
+ if (error && (error != -ENOENT))
+ goto out;
+
+ error = 0;
+
+ /* Remove a specific name xattr when value is set to NULL. */
+ if (value == NULL) {
+ if (xip)
+ error = -zfs_remove(dxip, (char *)name, cr, 0);
+
+ goto out;
+ }
+
+	/* The lookup failed, create a new xattr. */
+ if (xip == NULL) {
+ vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP);
+ vap->va_mode = xattr_mode;
+ vap->va_mask = ATTR_MODE;
+ vap->va_uid = crgetfsuid(cr);
+ vap->va_gid = crgetfsgid(cr);
+
+ error = -zfs_create(dxip, (char *)name, vap, 0, 0644, &xip,
+ cr, 0, NULL);
+ if (error)
+ goto out;
+ }
+
+ ASSERT(xip != NULL);
+
+ error = -zfs_freesp(ITOZ(xip), 0, 0, xattr_mode, TRUE);
+ if (error)
+ goto out;
+
+ wrote = zpl_write_common(xip, value, size, &pos, UIO_SYSSPACE, 0, cr);
+ if (wrote < 0)
+ error = wrote;
+
+out:
+
+ if (error == 0) {
+ ip->i_ctime = current_time(ip);
+ zfs_mark_inode_dirty(ip);
+ }
+
+ if (vap)
+ kmem_free(vap, sizeof (vattr_t));
+
+ if (xip)
+ iput(xip);
+
+ if (dxip)
+ iput(dxip);
+
+ if (error == -ENOENT)
+ error = -ENODATA;
+
+ ASSERT3S(error, <=, 0);
+
+ return (error);
+}
+
+static int
+zpl_xattr_set_sa(struct inode *ip, const char *name, const void *value,
+ size_t size, int flags, cred_t *cr)
+{
+ znode_t *zp = ITOZ(ip);
+ nvlist_t *nvl;
+ size_t sa_size;
+ int error = 0;
+
+ mutex_enter(&zp->z_lock);
+ if (zp->z_xattr_cached == NULL)
+ error = -zfs_sa_get_xattr(zp);
+ mutex_exit(&zp->z_lock);
+
+ if (error)
+ return (error);
+
+ ASSERT(zp->z_xattr_cached);
+ nvl = zp->z_xattr_cached;
+
+ if (value == NULL) {
+ error = -nvlist_remove(nvl, name, DATA_TYPE_BYTE_ARRAY);
+ if (error == -ENOENT)
+ error = zpl_xattr_set_dir(ip, name, NULL, 0, flags, cr);
+ } else {
+ /* Limited to 32k to keep nvpair memory allocations small */
+ if (size > DXATTR_MAX_ENTRY_SIZE)
+ return (-EFBIG);
+
+ /* Prevent the DXATTR SA from consuming the entire SA region */
+ error = -nvlist_size(nvl, &sa_size, NV_ENCODE_XDR);
+ if (error)
+ return (error);
+
+ if (sa_size > DXATTR_MAX_SA_SIZE)
+ return (-EFBIG);
+
+ error = -nvlist_add_byte_array(nvl, name,
+ (uchar_t *)value, size);
+ }
+
+ /*
+ * Update the SA for additions, modifications, and removals. On
+ * error drop the inconsistent cached version of the nvlist, it
+ * will be reconstructed from the ARC when next accessed.
+ */
+ if (error == 0)
+ error = -zfs_sa_set_xattr(zp);
+
+ if (error) {
+ nvlist_free(nvl);
+ zp->z_xattr_cached = NULL;
+ }
+
+ ASSERT3S(error, <=, 0);
+
+ return (error);
+}
+
+static int
+zpl_xattr_set(struct inode *ip, const char *name, const void *value,
+ size_t size, int flags)
+{
+ znode_t *zp = ITOZ(ip);
+ zfsvfs_t *zfsvfs = ZTOZSB(zp);
+ cred_t *cr = CRED();
+ fstrans_cookie_t cookie;
+ int where;
+ int error;
+
+ crhold(cr);
+ cookie = spl_fstrans_mark();
+ ZPL_ENTER(zfsvfs);
+ ZPL_VERIFY_ZP(zp);
+ rw_enter(&ITOZ(ip)->z_xattr_lock, RW_WRITER);
+
+ /*
+ * Before setting the xattr check to see if it already exists.
+ * This is done to ensure the following optional flags are honored.
+ *
+ * XATTR_CREATE: fail if xattr already exists
+ * XATTR_REPLACE: fail if xattr does not exist
+ *
+	 * We also want to know whether it resides in the SA or the dir,
+	 * so we can make sure we don't end up with a duplicate in both
+	 * places.
+ */
+ error = __zpl_xattr_where(ip, name, &where, cr);
+ if (error < 0) {
+ if (error != -ENODATA)
+ goto out;
+ if (flags & XATTR_REPLACE)
+ goto out;
+
+		/* The xattr to be removed does not exist */
+ error = 0;
+ if (value == NULL)
+ goto out;
+ } else {
+ error = -EEXIST;
+ if (flags & XATTR_CREATE)
+ goto out;
+ }
+
+ /* Preferentially store the xattr as a SA for better performance */
+ if (zfsvfs->z_use_sa && zp->z_is_sa &&
+ (zfsvfs->z_xattr_sa || (value == NULL && where & XATTR_IN_SA))) {
+ error = zpl_xattr_set_sa(ip, name, value, size, flags, cr);
+ if (error == 0) {
+ /*
+ * Successfully put into SA, we need to clear the one
+ * in dir.
+ */
+ if (where & XATTR_IN_DIR)
+ zpl_xattr_set_dir(ip, name, NULL, 0, 0, cr);
+ goto out;
+ }
+ }
+
+ error = zpl_xattr_set_dir(ip, name, value, size, flags, cr);
+	/*
+	 * If successfully stored in the dir, clear any stale copy in the SA.
+	 */
+ if (error == 0 && (where & XATTR_IN_SA))
+ zpl_xattr_set_sa(ip, name, NULL, 0, 0, cr);
+out:
+ rw_exit(&ITOZ(ip)->z_xattr_lock);
+ ZPL_EXIT(zfsvfs);
+ spl_fstrans_unmark(cookie);
+ crfree(cr);
+ ASSERT3S(error, <=, 0);
+
+ return (error);
+}
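+
+/*
+ * Illustrative sketch (not part of this change) of the XATTR_CREATE /
+ * XATTR_REPLACE semantics enforced above; path and name hypothetical:
+ *
+ *	setxattr(path, "user.tag", "a", 1, XATTR_CREATE);	(ok)
+ *	setxattr(path, "user.tag", "b", 1, XATTR_CREATE);	(EEXIST)
+ *	setxattr(path, "user.tag", "b", 1, XATTR_REPLACE);	(ok)
+ *	removexattr(path, "user.tag");
+ *	setxattr(path, "user.tag", "c", 1, XATTR_REPLACE);	(ENODATA)
+ */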
+
+/*
+ * Extended user attributes
+ *
+ * "Extended user attributes may be assigned to files and directories for
+ * storing arbitrary additional information such as the mime type,
+ * character set or encoding of a file. The access permissions for user
+ * attributes are defined by the file permission bits: read permission
+ * is required to retrieve the attribute value, and write permission is
+ * required to change it.
+ *
+ * The file permission bits of regular files and directories are
+ * interpreted differently from the file permission bits of special
+ * files and symbolic links. For regular files and directories the file
+ * permission bits define access to the file's contents, while for
+ * device special files they define access to the device described by
+ * the special file. The file permissions of symbolic links are not
+ * used in access checks. These differences would allow users to
+ * consume filesystem resources in a way not controllable by disk quotas
+ * for group or world writable special files and directories.
+ *
+ * For this reason, extended user attributes are allowed only for
+ * regular files and directories, and access to extended user attributes
+ * is restricted to the owner and to users with appropriate capabilities
+ * for directories with the sticky bit set (see the chmod(1) manual page
+ * for an explanation of the sticky bit)." - xattr(7)
+ *
+ * ZFS allows extended user attributes to be disabled administratively
+ * by setting the 'xattr=off' property on the dataset.
+ */
+static int
+__zpl_xattr_user_list(struct inode *ip, char *list, size_t list_size,
+ const char *name, size_t name_len)
+{
+ return (ITOZSB(ip)->z_flags & ZSB_XATTR);
+}
+ZPL_XATTR_LIST_WRAPPER(zpl_xattr_user_list);
+
+static int
+__zpl_xattr_user_get(struct inode *ip, const char *name,
+ void *value, size_t size)
+{
+ char *xattr_name;
+ int error;
+ /* xattr_resolve_name will do this for us if this is defined */
+#ifndef HAVE_XATTR_HANDLER_NAME
+ if (strcmp(name, "") == 0)
+ return (-EINVAL);
+#endif
+ if (!(ITOZSB(ip)->z_flags & ZSB_XATTR))
+ return (-EOPNOTSUPP);
+
+ xattr_name = kmem_asprintf("%s%s", XATTR_USER_PREFIX, name);
+ error = zpl_xattr_get(ip, xattr_name, value, size);
+ strfree(xattr_name);
+
+ return (error);
+}
+ZPL_XATTR_GET_WRAPPER(zpl_xattr_user_get);
+
+static int
+__zpl_xattr_user_set(struct inode *ip, const char *name,
+ const void *value, size_t size, int flags)
+{
+ char *xattr_name;
+ int error;
+ /* xattr_resolve_name will do this for us if this is defined */
+#ifndef HAVE_XATTR_HANDLER_NAME
+ if (strcmp(name, "") == 0)
+ return (-EINVAL);
+#endif
+ if (!(ITOZSB(ip)->z_flags & ZSB_XATTR))
+ return (-EOPNOTSUPP);
+
+ xattr_name = kmem_asprintf("%s%s", XATTR_USER_PREFIX, name);
+ error = zpl_xattr_set(ip, xattr_name, value, size, flags);
+ strfree(xattr_name);
+
+ return (error);
+}
+ZPL_XATTR_SET_WRAPPER(zpl_xattr_user_set);
+
+xattr_handler_t zpl_xattr_user_handler = {
+ .prefix = XATTR_USER_PREFIX,
+ .list = zpl_xattr_user_list,
+ .get = zpl_xattr_user_get,
+ .set = zpl_xattr_user_set,
+};
+
+/*
+ * Trusted extended attributes
+ *
+ * "Trusted extended attributes are visible and accessible only to
+ * processes that have the CAP_SYS_ADMIN capability. Attributes in this
+ * class are used to implement mechanisms in user space (i.e., outside
+ * the kernel) which keep information in extended attributes to which
+ * ordinary processes should not have access." - xattr(7)
+ */
+static int
+__zpl_xattr_trusted_list(struct inode *ip, char *list, size_t list_size,
+ const char *name, size_t name_len)
+{
+ return (capable(CAP_SYS_ADMIN));
+}
+ZPL_XATTR_LIST_WRAPPER(zpl_xattr_trusted_list);
+
+static int
+__zpl_xattr_trusted_get(struct inode *ip, const char *name,
+ void *value, size_t size)
+{
+ char *xattr_name;
+ int error;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return (-EACCES);
+ /* xattr_resolve_name will do this for us if this is defined */
+#ifndef HAVE_XATTR_HANDLER_NAME
+ if (strcmp(name, "") == 0)
+ return (-EINVAL);
+#endif
+ xattr_name = kmem_asprintf("%s%s", XATTR_TRUSTED_PREFIX, name);
+ error = zpl_xattr_get(ip, xattr_name, value, size);
+ strfree(xattr_name);
+
+ return (error);
+}
+ZPL_XATTR_GET_WRAPPER(zpl_xattr_trusted_get);
+
+static int
+__zpl_xattr_trusted_set(struct inode *ip, const char *name,
+ const void *value, size_t size, int flags)
+{
+ char *xattr_name;
+ int error;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return (-EACCES);
+ /* xattr_resolve_name will do this for us if this is defined */
+#ifndef HAVE_XATTR_HANDLER_NAME
+ if (strcmp(name, "") == 0)
+ return (-EINVAL);
+#endif
+ xattr_name = kmem_asprintf("%s%s", XATTR_TRUSTED_PREFIX, name);
+ error = zpl_xattr_set(ip, xattr_name, value, size, flags);
+ strfree(xattr_name);
+
+ return (error);
+}
+ZPL_XATTR_SET_WRAPPER(zpl_xattr_trusted_set);
+
+xattr_handler_t zpl_xattr_trusted_handler = {
+ .prefix = XATTR_TRUSTED_PREFIX,
+ .list = zpl_xattr_trusted_list,
+ .get = zpl_xattr_trusted_get,
+ .set = zpl_xattr_trusted_set,
+};
+
+/*
+ * Extended security attributes
+ *
+ * "The security attribute namespace is used by kernel security modules,
+ * such as Security Enhanced Linux, and also to implement file
+ * capabilities (see capabilities(7)). Read and write access
+ * permissions to security attributes depend on the policy implemented
+ * for each security attribute by the security module. When no security
+ * module is loaded, all processes have read access to extended security
+ * attributes, and write access is limited to processes that have the
+ * CAP_SYS_ADMIN capability." - xattr(7)
+ */
+static int
+__zpl_xattr_security_list(struct inode *ip, char *list, size_t list_size,
+ const char *name, size_t name_len)
+{
+ return (1);
+}
+ZPL_XATTR_LIST_WRAPPER(zpl_xattr_security_list);
+
+static int
+__zpl_xattr_security_get(struct inode *ip, const char *name,
+ void *value, size_t size)
+{
+ char *xattr_name;
+ int error;
+ /* xattr_resolve_name will do this for us if this is defined */
+#ifndef HAVE_XATTR_HANDLER_NAME
+ if (strcmp(name, "") == 0)
+ return (-EINVAL);
+#endif
+ xattr_name = kmem_asprintf("%s%s", XATTR_SECURITY_PREFIX, name);
+ error = zpl_xattr_get(ip, xattr_name, value, size);
+ strfree(xattr_name);
+
+ return (error);
+}
+ZPL_XATTR_GET_WRAPPER(zpl_xattr_security_get);
+
+static int
+__zpl_xattr_security_set(struct inode *ip, const char *name,
+ const void *value, size_t size, int flags)
+{
+ char *xattr_name;
+ int error;
+ /* xattr_resolve_name will do this for us if this is defined */
+#ifndef HAVE_XATTR_HANDLER_NAME
+ if (strcmp(name, "") == 0)
+ return (-EINVAL);
+#endif
+ xattr_name = kmem_asprintf("%s%s", XATTR_SECURITY_PREFIX, name);
+ error = zpl_xattr_set(ip, xattr_name, value, size, flags);
+ strfree(xattr_name);
+
+ return (error);
+}
+ZPL_XATTR_SET_WRAPPER(zpl_xattr_security_set);
+
+#ifdef HAVE_CALLBACK_SECURITY_INODE_INIT_SECURITY
+static int
+__zpl_xattr_security_init(struct inode *ip, const struct xattr *xattrs,
+ void *fs_info)
+{
+ const struct xattr *xattr;
+ int error = 0;
+
+ for (xattr = xattrs; xattr->name != NULL; xattr++) {
+ error = __zpl_xattr_security_set(ip,
+ xattr->name, xattr->value, xattr->value_len, 0);
+
+ if (error < 0)
+ break;
+ }
+
+ return (error);
+}
+
+int
+zpl_xattr_security_init(struct inode *ip, struct inode *dip,
+ const struct qstr *qstr)
+{
+	return (security_inode_init_security(ip, dip, qstr,
+	    &__zpl_xattr_security_init, NULL));
+}
+
+#else
+int
+zpl_xattr_security_init(struct inode *ip, struct inode *dip,
+ const struct qstr *qstr)
+{
+ int error;
+ size_t len;
+ void *value;
+ char *name;
+
+ error = zpl_security_inode_init_security(ip, dip, qstr,
+ &name, &value, &len);
+ if (error) {
+ if (error == -EOPNOTSUPP)
+ return (0);
+
+ return (error);
+ }
+
+ error = __zpl_xattr_security_set(ip, name, value, len, 0);
+
+ kfree(name);
+ kfree(value);
+
+ return (error);
+}
+#endif /* HAVE_CALLBACK_SECURITY_INODE_INIT_SECURITY */
+
+/*
+ * Security xattr namespace handlers.
+ */
+xattr_handler_t zpl_xattr_security_handler = {
+ .prefix = XATTR_SECURITY_PREFIX,
+ .list = zpl_xattr_security_list,
+ .get = zpl_xattr_security_get,
+ .set = zpl_xattr_security_set,
+};
+
+/*
+ * Extended system attributes
+ *
+ * "Extended system attributes are used by the kernel to store system
+ * objects such as Access Control Lists. Read and write access permissions
+ * to system attributes depend on the policy implemented for each system
+ * attribute implemented by filesystems in the kernel." - xattr(7)
+ */
+#ifdef CONFIG_FS_POSIX_ACL
+int
+zpl_set_acl(struct inode *ip, struct posix_acl *acl, int type)
+{
+ char *name, *value = NULL;
+ int error = 0;
+ size_t size = 0;
+
+ if (S_ISLNK(ip->i_mode))
+ return (-EOPNOTSUPP);
+
+ switch (type) {
+ case ACL_TYPE_ACCESS:
+ name = XATTR_NAME_POSIX_ACL_ACCESS;
+ if (acl) {
+ zpl_equivmode_t mode = ip->i_mode;
+ error = posix_acl_equiv_mode(acl, &mode);
+ if (error < 0) {
+ return (error);
+ } else {
+ /*
+ * The mode bits will have been set by
+ * ->zfs_setattr()->zfs_acl_chmod_setattr()
+ * using the ZFS ACL conversion. If they
+ * differ from the Posix ACL conversion dirty
+ * the inode to write the Posix mode bits.
+ */
+ if (ip->i_mode != mode) {
+ ip->i_mode = mode;
+ ip->i_ctime = current_time(ip);
+ zfs_mark_inode_dirty(ip);
+ }
+
+ if (error == 0)
+ acl = NULL;
+ }
+ }
+ break;
+
+ case ACL_TYPE_DEFAULT:
+ name = XATTR_NAME_POSIX_ACL_DEFAULT;
+ if (!S_ISDIR(ip->i_mode))
+ return (acl ? -EACCES : 0);
+ break;
+
+ default:
+ return (-EINVAL);
+ }
+
+ if (acl) {
+ size = posix_acl_xattr_size(acl->a_count);
+ value = kmem_alloc(size, KM_SLEEP);
+
+ error = zpl_acl_to_xattr(acl, value, size);
+ if (error < 0) {
+ kmem_free(value, size);
+ return (error);
+ }
+ }
+
+ error = zpl_xattr_set(ip, name, value, size, 0);
+ if (value)
+ kmem_free(value, size);
+
+ if (!error) {
+ if (acl)
+ zpl_set_cached_acl(ip, type, acl);
+ else
+ zpl_forget_cached_acl(ip, type);
+ }
+
+ return (error);
+}
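+
+/*
+ * Worked example for the posix_acl_equiv_mode() handling above
+ * (illustrative): an access ACL containing only ACL_USER_OBJ,
+ * ACL_GROUP_OBJ and ACL_OTHER entries is fully expressible as mode
+ * bits, so posix_acl_equiv_mode() returns 0, the mode is updated, and
+ * acl is set to NULL so the xattr is removed rather than stored.  An
+ * ACL that also carries a named ACL_USER entry returns 1 and is
+ * written out as the system.posix_acl_access xattr.
+ */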
+
+struct posix_acl *
+zpl_get_acl(struct inode *ip, int type)
+{
+ struct posix_acl *acl;
+ void *value = NULL;
+ char *name;
+ int size;
+
+ /*
+ * As of Linux 3.14, the kernel get_acl will check this for us.
+ * Also as of Linux 4.7, comparing against ACL_NOT_CACHED is wrong
+ * as the kernel get_acl will set it to a temporary sentinel value.
+ */
+#ifndef HAVE_KERNEL_GET_ACL_HANDLE_CACHE
+ acl = get_cached_acl(ip, type);
+ if (acl != ACL_NOT_CACHED)
+ return (acl);
+#endif
+
+ switch (type) {
+ case ACL_TYPE_ACCESS:
+ name = XATTR_NAME_POSIX_ACL_ACCESS;
+ break;
+ case ACL_TYPE_DEFAULT:
+ name = XATTR_NAME_POSIX_ACL_DEFAULT;
+ break;
+ default:
+ return (ERR_PTR(-EINVAL));
+ }
+
+ size = zpl_xattr_get(ip, name, NULL, 0);
+ if (size > 0) {
+ value = kmem_alloc(size, KM_SLEEP);
+ size = zpl_xattr_get(ip, name, value, size);
+ }
+
+ if (size > 0) {
+ acl = zpl_acl_from_xattr(value, size);
+ } else if (size == -ENODATA || size == -ENOSYS) {
+ acl = NULL;
+ } else {
+ acl = ERR_PTR(-EIO);
+ }
+
+ if (size > 0)
+ kmem_free(value, size);
+
+ /* As of Linux 4.7, the kernel get_acl will set this for us */
+#ifndef HAVE_KERNEL_GET_ACL_HANDLE_CACHE
+ if (!IS_ERR(acl))
+ zpl_set_cached_acl(ip, type, acl);
+#endif
+
+ return (acl);
+}
+
+#if !defined(HAVE_GET_ACL)
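+/*
+ * Check the requested access mask against the access ACL. Returns
+ * -EAGAIN when no ACL is present so the caller falls back to the
+ * standard UNIX permission checks.
+ */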
+static int
+__zpl_check_acl(struct inode *ip, int mask)
+{
+ struct posix_acl *acl;
+ int error;
+
+ acl = zpl_get_acl(ip, ACL_TYPE_ACCESS);
+ if (IS_ERR(acl))
+ return (PTR_ERR(acl));
+
+ if (acl) {
+ error = posix_acl_permission(ip, acl, mask);
+ zpl_posix_acl_release(acl);
+ return (error);
+ }
+
+ return (-EAGAIN);
+}
+
+#if defined(HAVE_CHECK_ACL_WITH_FLAGS)
+int
+zpl_check_acl(struct inode *ip, int mask, unsigned int flags)
+{
+ return (__zpl_check_acl(ip, mask));
+}
+#elif defined(HAVE_CHECK_ACL)
+int
+zpl_check_acl(struct inode *ip, int mask)
+{
+ return (__zpl_check_acl(ip, mask));
+}
+#elif defined(HAVE_PERMISSION_WITH_NAMEIDATA)
+int
+zpl_permission(struct inode *ip, int mask, struct nameidata *nd)
+{
+ return (generic_permission(ip, mask, __zpl_check_acl));
+}
+#elif defined(HAVE_PERMISSION)
+int
+zpl_permission(struct inode *ip, int mask)
+{
+ return (generic_permission(ip, mask, __zpl_check_acl));
+}
+#endif /* HAVE_CHECK_ACL | HAVE_PERMISSION */
+#endif /* !HAVE_GET_ACL */
+
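+/*
+ * Initialize the ACLs of a newly created inode. Non-symlinks inherit
+ * the parent directory's default ACL, which a new directory also
+ * receives as its own default ACL; without a default ACL only the
+ * umask is applied to the mode bits.
+ */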
+int
+zpl_init_acl(struct inode *ip, struct inode *dir)
+{
+ struct posix_acl *acl = NULL;
+ int error = 0;
+
+ if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIXACL)
+ return (0);
+
+ if (!S_ISLNK(ip->i_mode)) {
+ acl = zpl_get_acl(dir, ACL_TYPE_DEFAULT);
+ if (IS_ERR(acl))
+ return (PTR_ERR(acl));
+ if (!acl) {
+ ip->i_mode &= ~current_umask();
+ ip->i_ctime = current_time(ip);
+ zfs_mark_inode_dirty(ip);
+ return (0);
+ }
+ }
+
+ if (acl) {
+ umode_t mode;
+
+ if (S_ISDIR(ip->i_mode)) {
+ error = zpl_set_acl(ip, acl, ACL_TYPE_DEFAULT);
+ if (error)
+ goto out;
+ }
+
+ mode = ip->i_mode;
+ error = __posix_acl_create(&acl, GFP_KERNEL, &mode);
+ if (error >= 0) {
+ ip->i_mode = mode;
+ zfs_mark_inode_dirty(ip);
+ if (error > 0)
+ error = zpl_set_acl(ip, acl, ACL_TYPE_ACCESS);
+ }
+ }
+out:
+ zpl_posix_acl_release(acl);
+
+ return (error);
+}
+
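+/*
+ * Rebuild the access ACL after a mode change so that it remains
+ * consistent with the new mode bits.
+ */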
+int
+zpl_chmod_acl(struct inode *ip)
+{
+ struct posix_acl *acl;
+ int error;
+
+ if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIXACL)
+ return (0);
+
+ if (S_ISLNK(ip->i_mode))
+ return (-EOPNOTSUPP);
+
+ acl = zpl_get_acl(ip, ACL_TYPE_ACCESS);
+ if (IS_ERR(acl) || !acl)
+ return (PTR_ERR(acl));
+
+ error = __posix_acl_chmod(&acl, GFP_KERNEL, ip->i_mode);
+ if (!error)
+ error = zpl_set_acl(ip, acl, ACL_TYPE_ACCESS);
+
+ zpl_posix_acl_release(acl);
+
+ return (error);
+}
+
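+/*
+ * The __zpl_xattr_acl_{list,get,set}_{access,default} callbacks below
+ * back the two POSIX ACL xattr handlers. Each refuses service unless
+ * POSIX ACLs are enabled on the dataset.
+ */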
+static int
+__zpl_xattr_acl_list_access(struct inode *ip, char *list, size_t list_size,
+ const char *name, size_t name_len)
+{
+ char *xattr_name = XATTR_NAME_POSIX_ACL_ACCESS;
+ size_t xattr_size = sizeof (XATTR_NAME_POSIX_ACL_ACCESS);
+
+ if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIXACL)
+ return (0);
+
+ if (list && xattr_size <= list_size)
+ memcpy(list, xattr_name, xattr_size);
+
+ return (xattr_size);
+}
+ZPL_XATTR_LIST_WRAPPER(zpl_xattr_acl_list_access);
+
+static int
+__zpl_xattr_acl_list_default(struct inode *ip, char *list, size_t list_size,
+ const char *name, size_t name_len)
+{
+ char *xattr_name = XATTR_NAME_POSIX_ACL_DEFAULT;
+ size_t xattr_size = sizeof (XATTR_NAME_POSIX_ACL_DEFAULT);
+
+ if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIXACL)
+ return (0);
+
+ if (list && xattr_size <= list_size)
+ memcpy(list, xattr_name, xattr_size);
+
+ return (xattr_size);
+}
+ZPL_XATTR_LIST_WRAPPER(zpl_xattr_acl_list_default);
+
+static int
+__zpl_xattr_acl_get_access(struct inode *ip, const char *name,
+ void *buffer, size_t size)
+{
+ struct posix_acl *acl;
+ int type = ACL_TYPE_ACCESS;
+ int error;
+ /* xattr_resolve_name checks this when HAVE_XATTR_HANDLER_NAME is set */
+#ifndef HAVE_XATTR_HANDLER_NAME
+ if (strcmp(name, "") != 0)
+ return (-EINVAL);
+#endif
+ if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIXACL)
+ return (-EOPNOTSUPP);
+
+ acl = zpl_get_acl(ip, type);
+ if (IS_ERR(acl))
+ return (PTR_ERR(acl));
+ if (acl == NULL)
+ return (-ENODATA);
+
+ error = zpl_acl_to_xattr(acl, buffer, size);
+ zpl_posix_acl_release(acl);
+
+ return (error);
+}
+ZPL_XATTR_GET_WRAPPER(zpl_xattr_acl_get_access);
+
+static int
+__zpl_xattr_acl_get_default(struct inode *ip, const char *name,
+ void *buffer, size_t size)
+{
+ struct posix_acl *acl;
+ int type = ACL_TYPE_DEFAULT;
+ int error;
+ /* xattr_resolve_name checks this when HAVE_XATTR_HANDLER_NAME is set */
+#ifndef HAVE_XATTR_HANDLER_NAME
+ if (strcmp(name, "") != 0)
+ return (-EINVAL);
+#endif
+ if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIXACL)
+ return (-EOPNOTSUPP);
+
+ acl = zpl_get_acl(ip, type);
+ if (IS_ERR(acl))
+ return (PTR_ERR(acl));
+ if (acl == NULL)
+ return (-ENODATA);
+
+ error = zpl_acl_to_xattr(acl, buffer, size);
+ zpl_posix_acl_release(acl);
+
+ return (error);
+}
+ZPL_XATTR_GET_WRAPPER(zpl_xattr_acl_get_default);
+
+static int
+__zpl_xattr_acl_set_access(struct inode *ip, const char *name,
+ const void *value, size_t size, int flags)
+{
+ struct posix_acl *acl;
+ int type = ACL_TYPE_ACCESS;
+ int error = 0;
+ /* xattr_resolve_name checks this when HAVE_XATTR_HANDLER_NAME is set */
+#ifndef HAVE_XATTR_HANDLER_NAME
+ if (strcmp(name, "") != 0)
+ return (-EINVAL);
+#endif
+ if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIXACL)
+ return (-EOPNOTSUPP);
+
+ if (!zpl_inode_owner_or_capable(ip))
+ return (-EPERM);
+
+ if (value) {
+ acl = zpl_acl_from_xattr(value, size);
+ if (IS_ERR(acl))
+ return (PTR_ERR(acl));
+ else if (acl) {
+ error = zpl_posix_acl_valid(ip, acl);
+ if (error) {
+ zpl_posix_acl_release(acl);
+ return (error);
+ }
+ }
+ } else {
+ acl = NULL;
+ }
+
+ error = zpl_set_acl(ip, acl, type);
+ zpl_posix_acl_release(acl);
+
+ return (error);
+}
+ZPL_XATTR_SET_WRAPPER(zpl_xattr_acl_set_access);
+
+static int
+__zpl_xattr_acl_set_default(struct inode *ip, const char *name,
+ const void *value, size_t size, int flags)
+{
+ struct posix_acl *acl;
+ int type = ACL_TYPE_DEFAULT;
+ int error = 0;
+ /* xattr_resolve_name checks this when HAVE_XATTR_HANDLER_NAME is set */
+#ifndef HAVE_XATTR_HANDLER_NAME
+ if (strcmp(name, "") != 0)
+ return (-EINVAL);
+#endif
+ if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIXACL)
+ return (-EOPNOTSUPP);
+
+ if (!zpl_inode_owner_or_capable(ip))
+ return (-EPERM);
+
+ if (value) {
+ acl = zpl_acl_from_xattr(value, size);
+ if (IS_ERR(acl))
+ return (PTR_ERR(acl));
+ else if (acl) {
+ error = zpl_posix_acl_valid(ip, acl);
+ if (error) {
+ zpl_posix_acl_release(acl);
+ return (error);
+ }
+ }
+ } else {
+ acl = NULL;
+ }
+
+ error = zpl_set_acl(ip, acl, type);
+ zpl_posix_acl_release(acl);
+
+ return (error);
+}
+ZPL_XATTR_SET_WRAPPER(zpl_xattr_acl_set_default);
+
+/*
+ * ACL access xattr namespace handlers.
+ *
+ * Use .name instead of .prefix when available. xattr_resolve_name will match
+ * the whole name and reject anything that merely has .name as a prefix.
+ */
+xattr_handler_t zpl_xattr_acl_access_handler =
+{
+#ifdef HAVE_XATTR_HANDLER_NAME
+ .name = XATTR_NAME_POSIX_ACL_ACCESS,
+#else
+ .prefix = XATTR_NAME_POSIX_ACL_ACCESS,
+#endif
+ .list = zpl_xattr_acl_list_access,
+ .get = zpl_xattr_acl_get_access,
+ .set = zpl_xattr_acl_set_access,
+#if defined(HAVE_XATTR_LIST_SIMPLE) || \
+ defined(HAVE_XATTR_LIST_DENTRY) || \
+ defined(HAVE_XATTR_LIST_HANDLER)
+ .flags = ACL_TYPE_ACCESS,
+#endif
+};
+
+/*
+ * ACL default xattr namespace handlers.
+ *
+ * Use .name instead of .prefix when available. xattr_resolve_name will match
+ * the whole name and reject anything that merely has .name as a prefix.
+ */
+xattr_handler_t zpl_xattr_acl_default_handler =
+{
+#ifdef HAVE_XATTR_HANDLER_NAME
+ .name = XATTR_NAME_POSIX_ACL_DEFAULT,
+#else
+ .prefix = XATTR_NAME_POSIX_ACL_DEFAULT,
+#endif
+ .list = zpl_xattr_acl_list_default,
+ .get = zpl_xattr_acl_get_default,
+ .set = zpl_xattr_acl_set_default,
+#if defined(HAVE_XATTR_LIST_SIMPLE) || \
+ defined(HAVE_XATTR_LIST_DENTRY) || \
+ defined(HAVE_XATTR_LIST_HANDLER)
+ .flags = ACL_TYPE_DEFAULT,
+#endif
+};
+
+#endif /* CONFIG_FS_POSIX_ACL */
+
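+/*
+ * NULL-terminated table of every xattr handler the filesystem
+ * registers with the VFS.
+ */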
+xattr_handler_t *zpl_xattr_handlers[] = {
+ &zpl_xattr_security_handler,
+ &zpl_xattr_trusted_handler,
+ &zpl_xattr_user_handler,
+#ifdef CONFIG_FS_POSIX_ACL
+ &zpl_xattr_acl_access_handler,
+ &zpl_xattr_acl_default_handler,
+#endif /* CONFIG_FS_POSIX_ACL */
+ NULL
+};
+
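+/*
+ * Resolve a full xattr name to the handler responsible for its
+ * namespace, or NULL if the namespace is not supported.
+ */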
+static const struct xattr_handler *
+zpl_xattr_handler(const char *name)
+{
+ if (strncmp(name, XATTR_USER_PREFIX,
+ XATTR_USER_PREFIX_LEN) == 0)
+ return (&zpl_xattr_user_handler);
+
+ if (strncmp(name, XATTR_TRUSTED_PREFIX,
+ XATTR_TRUSTED_PREFIX_LEN) == 0)
+ return (&zpl_xattr_trusted_handler);
+
+ if (strncmp(name, XATTR_SECURITY_PREFIX,
+ XATTR_SECURITY_PREFIX_LEN) == 0)
+ return (&zpl_xattr_security_handler);
+
+#ifdef CONFIG_FS_POSIX_ACL
+ if (strncmp(name, XATTR_NAME_POSIX_ACL_ACCESS,
+ sizeof (XATTR_NAME_POSIX_ACL_ACCESS)) == 0)
+ return (&zpl_xattr_acl_access_handler);
+
+ if (strncmp(name, XATTR_NAME_POSIX_ACL_DEFAULT,
+ sizeof (XATTR_NAME_POSIX_ACL_DEFAULT)) == 0)
+ return (&zpl_xattr_acl_default_handler);
+#endif /* CONFIG_FS_POSIX_ACL */
+
+ return (NULL);
+}
+
+#if !defined(HAVE_POSIX_ACL_RELEASE) || defined(HAVE_POSIX_ACL_RELEASE_GPL_ONLY)
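+/*
+ * When posix_acl_release() is unavailable, or exported GPL-only, ACLs
+ * cannot be handed back to the kernel to free. Instead they are queued
+ * here and freed only after a grace period, standing in for the RCU
+ * grace period the kernel would otherwise provide.
+ */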
+struct acl_rel_struct {
+ struct acl_rel_struct *next;
+ struct posix_acl *acl;
+ clock_t time;
+};
+
+#define ACL_REL_GRACE (60*HZ)
+#define ACL_REL_WINDOW (1*HZ)
+#define ACL_REL_SCHED (ACL_REL_GRACE+ACL_REL_WINDOW)
+
+/*
+ * Lockless multi-producer, single-consumer FIFO list.
+ * Nodes are added at the tail and removed from the head. The tail pointer is
+ * our synchronization point: it always points to the next pointer of the last
+ * node, or to the head if the list is empty.
+ */
+static struct acl_rel_struct *acl_rel_head = NULL;
+static struct acl_rel_struct **acl_rel_tail = &acl_rel_head;
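+
+/*
+ * Worked example (illustrative): with nodes A and B queued the list is
+ *
+ *   acl_rel_head -> A -> B -> NULL
+ *   acl_rel_tail == &B->next
+ *
+ * A producer xchg()s acl_rel_tail to &new->next and then stores new
+ * through the previous tail pointer, so concurrent producers never
+ * race on the same next field.
+ */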
+
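+/*
+ * Taskq callback: walk the list from the head, free every ACL whose
+ * grace period has expired, and reschedule ourselves if an entry that
+ * is still in its grace period remains.
+ */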
+static void
+zpl_posix_acl_free(void *arg)
+{
+ struct acl_rel_struct *freelist = NULL;
+ struct acl_rel_struct *a;
+ clock_t new_time;
+ boolean_t refire = B_FALSE;
+
+ ASSERT3P(acl_rel_head, !=, NULL);
+ while (acl_rel_head) {
+ a = acl_rel_head;
+ if (ddi_get_lbolt() - a->time >= ACL_REL_GRACE) {
+ /*
+ * If a is the last node we need to reset the tail,
+ * but we must use cmpxchg to make sure it is still
+ * the last node.
+ */
+ if (acl_rel_tail == &a->next) {
+ acl_rel_head = NULL;
+ if (cmpxchg(&acl_rel_tail, &a->next,
+ &acl_rel_head) == &a->next) {
+ ASSERT3P(a->next, ==, NULL);
+ a->next = freelist;
+ freelist = a;
+ break;
+ }
+ }
+ /*
+ * a is not the last node; wait for the adder to set
+ * its next pointer, then advance the head.
+ */
+ while (READ_ONCE(a->next) == NULL)
+ cpu_relax();
+ acl_rel_head = a->next;
+ a->next = freelist;
+ freelist = a;
+ } else {
+ /*
+ * a is still in its grace period. We are responsible
+ * for rescheduling the free task, since the adder only
+ * does so when the list is empty.
+ */
+ new_time = a->time + ACL_REL_SCHED;
+ refire = B_TRUE;
+ break;
+ }
+ }
+
+ if (refire)
+ taskq_dispatch_delay(system_delay_taskq, zpl_posix_acl_free,
+ NULL, TQ_SLEEP, new_time);
+
+ while (freelist) {
+ a = freelist;
+ freelist = a->next;
+ kfree(a->acl);
+ kmem_free(a, sizeof (struct acl_rel_struct));
+ }
+}
+
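+/*
+ * Queue an ACL for deferred freeing: atomically link the node in at
+ * the tail, then schedule the free task if the list was empty before
+ * (otherwise the free task reschedules itself as needed).
+ */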
+void
+zpl_posix_acl_release_impl(struct posix_acl *acl)
+{
+ struct acl_rel_struct *a, **prev;
+
+ a = kmem_alloc(sizeof (struct acl_rel_struct), KM_SLEEP);
+ a->next = NULL;
+ a->acl = acl;
+ a->time = ddi_get_lbolt();
+ /* atomically point the tail at us and fetch the previous tail */
+ prev = xchg(&acl_rel_tail, &a->next);
+ ASSERT3P(*prev, ==, NULL);
+ *prev = a;
+ /* if the list was empty before, schedule the free task */
+ if (prev == &acl_rel_head)
+ taskq_dispatch_delay(system_delay_taskq, zpl_posix_acl_free,
+ NULL, TQ_SLEEP, ddi_get_lbolt() + ACL_REL_SCHED);
+}
+#endif